From 0aa071f091117252cb431129fa9e137478432b1f Mon Sep 17 00:00:00 2001 From: David LeBauer Date: Wed, 29 Jan 2025 12:09:17 -0500 Subject: [PATCH 01/49] first draft of data prep steps for downscaling workflow --- 01b_woody_state/00-prepare.qmd | 240 +++++++++++++++++++++++++++++++++ 1 file changed, 240 insertions(+) create mode 100644 01b_woody_state/00-prepare.qmd diff --git a/01b_woody_state/00-prepare.qmd b/01b_woody_state/00-prepare.qmd new file mode 100644 index 0000000..1ac5268 --- /dev/null +++ b/01b_woody_state/00-prepare.qmd @@ -0,0 +1,240 @@ +--- +title: "Workflow Setup and Data Preparation" +format: html +author: David LeBauer +date: sys.Date() +--- + +# Overview + +## TODO + +- Use consistent projection(s): + - California Albers EPSG:33110 for joins + - WGS84 EPSG:4326 for plotting, subsetting rasters? +- Clean up domain code + +## Install & Load PEcAn + +See https://pecanproject.github.io/documentation/develop/ + +```{r} + +options(repos = c( + pecanproject = 'https://pecanproject.r-universe.dev', + ropensci = 'https://ropensci.r-universe.dev', + CRAN = 'https://cloud.r-project.org')) + +# install.packages("PEcAn.all") +library(PEcAn.all) +library(dplyr) +library(caladaptr) +library(sf) + + +## Required until PR 3423is merged https://github.com/PecanProject/pecan/pull/3423 +# check if PR is merged +source('https://raw.githubusercontent.com/dlebauer/pecan/refs/heads/shp2gpkg/modules/data.land/R/landiq2std.R') +source('https://raw.githubusercontent.com/dlebauer/pecan/refs/heads/shp2gpkg/modules/data.land/R/shp2gpkg.R') + +## Check available compute resources +benchmarkme::get_ram() +benchmarkme::get_cpu() + +``` + +## Organize Input Data + +### Domain Polygons + +- ca_convex_hull_reduced: a simplified convex hull for CA +- yolo_bbox: a smaller domain limited to Yolo County + +```{r eval = FALSE} +# remotes::install_github("ucanr-igis/caladaptr") +caladapt_domain <- caladaptr::ca_aoipreset_geom("counties") |> + sf::st_transform(4326) |> + sf::st_union() |> + sf::st_convex_hull() +st_write(caladapt_domain, "data/caladapt_domain_convex_hull.geojson") + +ca_counties_polygons <- ca_aoipreset_geom("counties") |> + dplyr::filter(state_name == "California") |> + dplyr::select(state_name, county_name = name, geom) |> + sf::st_transform(4326) + +ca_state_polygon <- ca_counties_polygons |> + group_by(state_name) |> + mutate(geom = sf::st_union(geom)) |> + sf::st_transform(33110) + +yolo_county_polygon <- ca_counties_polygons |> + filter(county_name=='Yolo') + +# yolo_county_polygon_simplified <- sf::st_simplify(yolo_county_polygon, dTolerance = 5000) +yolo_county_convex_hull <- sf::st_convex_hull(yolo_county_polygon_simplified) +# check if it is sufficiently simple to avoid unnecessary computational expensse +# st_coordinates(yolo_county_convex_hull) +ca_state_polygon <- sf::st_read("data/ca_convex_hull_reduced.geojson") + +``` + +### LandIQ Woody Polygons + +##### Convert LandIQ to standard + +```{r eval=FALSE} +input_file = 'data/i15_Crop_Mapping_2016_SHP/i15_Crop_Mapping_2016.shp' +#input_file = "data/landiq_polygons.gpkg" +output_gpkg = 'data/ca_fields.gpkg' +output_csv = 'data/ca_field_attributes.csv' +debugonce(landiq2std) +#PEcAn.data.land:: +landiq2std(input_file, output_gpkg, output_csv) +``` + +##### Subset Fields + +```{r} +## Subset woody fields + +# for development lets work with a subset +#con <- DBI::dbConnect(RSQLite::SQLite(), 'data/ca_fields.gpkg') +#ca_fields <- dplyr::tbl(con, "sites") +#query = "select * from landiq_polygons where pft == 'woody perennial crop'" + 
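# A lighter-weight alternative to the DBI approach sketched above is to let
# sf/GDAL apply the filter while reading. This is only a sketch: the layer name
# ("sites" vs "landiq_polygons") should be confirmed with
# sf::st_layers("data/ca_fields.gpkg"), and it assumes the pft column is stored
# in the GeoPackage rather than only in the attributes CSV.
# ca_woody <- sf::st_read(
#   "data/ca_fields.gpkg",
#   query = "SELECT * FROM sites WHERE pft = 'woody perennial crop'"
# )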
+ca_fields <- sf::st_read("data/ca_fields.gpkg") +readr::read_csv("data/ca_field_attributes.csv") + +ca <- ca_fields |> + dplyr::left_join(ca_attributes, by = c("id", "lat", "lon")) + +ca_woody <- ca |> + dplyr::filter(pft == "woody perennial crop") +sf::st_write(ca_woody, + "data/ca_woody.gpkg", delete_layer = TRUE) +``` + +#### Create a subset for dev & test + +```{r eval=FALSE} +set.seed(25) +ca_woody_subset <- ca_woody |> + dplyr::sample_n(200) + +sf::st_write(ca_woody_subset, + "data/ca_woody_subset.gpkg", delete_layer = TRUE) +``` + +### Woody Crop Polygons that will be used for subsetting during development + +TODO replace with + +```{r} + +woody_gpkg <- "data/ca_woody_subset.gpkg" # TODO replace with ca_woody.gpkg +ca_woody <- sf::st_read(woody_gpkg) +``` + +### SoilGrids + +#### Download Soilgrids for California + +```{r eval=FALSE} + +download_soilgrids_raster( + variables = c("clay", "sand"), + depths = c("0-5", "5-15"), + polygon = yolo_county_convex_hull, + output_dir = "~/soilgrids_out/" +) + +``` + +#### Load Prepared Soilgrids GeoTIFF + +```{r} +soilgrids_north_america_tif <- '/projectnb/dietzelab/dongchen/anchorSites/NA_runs/soil_nc/soilgrids_250m/clay/clay_0-5cm_mean/clay/clay_0-5cm_mean.tif' +## if we want to clip to CA +## use terra to read in that file and clip to california +# soilgrids_california <- terra::crop(soilgrids_north_america, yolo_bbox) + +# convert polygons to points +ca_woody_pts <- ca_woody |> + sf::st_centroid() + +# read in the file +soilgrids_north_america_rast <- terra::rast(soilgrids_north_america_tif) + +ca_woody_sg <- extract_raster_values( + raster_path = soilgrids_north_america_tif, + points_df = ca_woody_pts +) |> +dplyr::rename(clay = raster_value) |> + dplyr::mutate(clay = clay/10) + +``` + +### Cal-Adapt + +#### Cal-Adapt Climate Regions + +```{r} +ca_albers_crs <- 3310 # use California Albers project (EPSG:3310) for speed, + # and so units are in meters + + +ca_climregions <- caladaptr::ca_aoipreset_geom("climregions") |> + sf::st_transform(crs = ca_albers_crs) |> + dplyr::rename(climregion_id = id, + climregion_name = name) +``` + +```{r join_climregions} +ca_woody_sg_cr <- ca_woody_sg |> + sf::st_transform(crs = ca_albers_crs) |> + sf::st_join(ca_climregions, join = st_intersects, left = TRUE) + +``` + +#### Cal-Adapt Climate + +##### Download Cal-Adapt Climate Rasters + +LOCA (CMIP5-based) + +```{r} +precip <- ca_fetch_raster_polygon( + polygon = ca_state_polygon|> st_make_valid(), + var = "pr", + gcm = "ens32avg", + scenario = "historical", + period = "30yavg", + #start_year = 2006, end_year = 2010, + out_dir = "data/caladapt/") + +tmin <- precip <- ca_fetch_raster_polygon( + polygon = ca_state_polygon |> st_make_valid(), + var = "tasmin", + gcm = "ens32avg", + scenario = "historical", + period = "30yavg", + #start_year = 2006, end_year = 2010, + out_dir = "data/caladapt/") +tmax <- precip <- ca_fetch_raster_polygon( + polygon = ca_state_polygon|> st_make_valid(), + var = "tasmax", + gcm = "ens32avg", + scenario = "historical", + period = "30yavg", + #start_year = 2006, end_year = 2010, + out_dir = "data/caladapt/") + +z <- terra::rast(caladapt_polygon_data$raster) + +z <- extract_raster_values( + raster_path = caladapt_polygon_data$raster, + points_df = ca_woody_pts +) |> + dplyr::rename(map = raster_value) +``` From e46420b80e4674f3dab1047d382af66630cd542f Mon Sep 17 00:00:00 2001 From: David LeBauer Date: Mon, 3 Feb 2025 02:24:45 -0500 Subject: [PATCH 02/49] first drafts of data prep and clustering workflows --- 
01b_woody_state/00-prepare.qmd | 240 ----------- downscale/00-prepare.qmd | 403 ++++++++++++++++++ .../01_cluster_and_select_anchorsites.qmd | 82 ++++ 3 files changed, 485 insertions(+), 240 deletions(-) delete mode 100644 01b_woody_state/00-prepare.qmd create mode 100644 downscale/00-prepare.qmd create mode 100644 downscale/01_cluster_and_select_anchorsites.qmd diff --git a/01b_woody_state/00-prepare.qmd b/01b_woody_state/00-prepare.qmd deleted file mode 100644 index 1ac5268..0000000 --- a/01b_woody_state/00-prepare.qmd +++ /dev/null @@ -1,240 +0,0 @@ ---- -title: "Workflow Setup and Data Preparation" -format: html -author: David LeBauer -date: sys.Date() ---- - -# Overview - -## TODO - -- Use consistent projection(s): - - California Albers EPSG:33110 for joins - - WGS84 EPSG:4326 for plotting, subsetting rasters? -- Clean up domain code - -## Install & Load PEcAn - -See https://pecanproject.github.io/documentation/develop/ - -```{r} - -options(repos = c( - pecanproject = 'https://pecanproject.r-universe.dev', - ropensci = 'https://ropensci.r-universe.dev', - CRAN = 'https://cloud.r-project.org')) - -# install.packages("PEcAn.all") -library(PEcAn.all) -library(dplyr) -library(caladaptr) -library(sf) - - -## Required until PR 3423is merged https://github.com/PecanProject/pecan/pull/3423 -# check if PR is merged -source('https://raw.githubusercontent.com/dlebauer/pecan/refs/heads/shp2gpkg/modules/data.land/R/landiq2std.R') -source('https://raw.githubusercontent.com/dlebauer/pecan/refs/heads/shp2gpkg/modules/data.land/R/shp2gpkg.R') - -## Check available compute resources -benchmarkme::get_ram() -benchmarkme::get_cpu() - -``` - -## Organize Input Data - -### Domain Polygons - -- ca_convex_hull_reduced: a simplified convex hull for CA -- yolo_bbox: a smaller domain limited to Yolo County - -```{r eval = FALSE} -# remotes::install_github("ucanr-igis/caladaptr") -caladapt_domain <- caladaptr::ca_aoipreset_geom("counties") |> - sf::st_transform(4326) |> - sf::st_union() |> - sf::st_convex_hull() -st_write(caladapt_domain, "data/caladapt_domain_convex_hull.geojson") - -ca_counties_polygons <- ca_aoipreset_geom("counties") |> - dplyr::filter(state_name == "California") |> - dplyr::select(state_name, county_name = name, geom) |> - sf::st_transform(4326) - -ca_state_polygon <- ca_counties_polygons |> - group_by(state_name) |> - mutate(geom = sf::st_union(geom)) |> - sf::st_transform(33110) - -yolo_county_polygon <- ca_counties_polygons |> - filter(county_name=='Yolo') - -# yolo_county_polygon_simplified <- sf::st_simplify(yolo_county_polygon, dTolerance = 5000) -yolo_county_convex_hull <- sf::st_convex_hull(yolo_county_polygon_simplified) -# check if it is sufficiently simple to avoid unnecessary computational expensse -# st_coordinates(yolo_county_convex_hull) -ca_state_polygon <- sf::st_read("data/ca_convex_hull_reduced.geojson") - -``` - -### LandIQ Woody Polygons - -##### Convert LandIQ to standard - -```{r eval=FALSE} -input_file = 'data/i15_Crop_Mapping_2016_SHP/i15_Crop_Mapping_2016.shp' -#input_file = "data/landiq_polygons.gpkg" -output_gpkg = 'data/ca_fields.gpkg' -output_csv = 'data/ca_field_attributes.csv' -debugonce(landiq2std) -#PEcAn.data.land:: -landiq2std(input_file, output_gpkg, output_csv) -``` - -##### Subset Fields - -```{r} -## Subset woody fields - -# for development lets work with a subset -#con <- DBI::dbConnect(RSQLite::SQLite(), 'data/ca_fields.gpkg') -#ca_fields <- dplyr::tbl(con, "sites") -#query = "select * from landiq_polygons where pft == 'woody perennial crop'" - 
-ca_fields <- sf::st_read("data/ca_fields.gpkg") -readr::read_csv("data/ca_field_attributes.csv") - -ca <- ca_fields |> - dplyr::left_join(ca_attributes, by = c("id", "lat", "lon")) - -ca_woody <- ca |> - dplyr::filter(pft == "woody perennial crop") -sf::st_write(ca_woody, - "data/ca_woody.gpkg", delete_layer = TRUE) -``` - -#### Create a subset for dev & test - -```{r eval=FALSE} -set.seed(25) -ca_woody_subset <- ca_woody |> - dplyr::sample_n(200) - -sf::st_write(ca_woody_subset, - "data/ca_woody_subset.gpkg", delete_layer = TRUE) -``` - -### Woody Crop Polygons that will be used for subsetting during development - -TODO replace with - -```{r} - -woody_gpkg <- "data/ca_woody_subset.gpkg" # TODO replace with ca_woody.gpkg -ca_woody <- sf::st_read(woody_gpkg) -``` - -### SoilGrids - -#### Download Soilgrids for California - -```{r eval=FALSE} - -download_soilgrids_raster( - variables = c("clay", "sand"), - depths = c("0-5", "5-15"), - polygon = yolo_county_convex_hull, - output_dir = "~/soilgrids_out/" -) - -``` - -#### Load Prepared Soilgrids GeoTIFF - -```{r} -soilgrids_north_america_tif <- '/projectnb/dietzelab/dongchen/anchorSites/NA_runs/soil_nc/soilgrids_250m/clay/clay_0-5cm_mean/clay/clay_0-5cm_mean.tif' -## if we want to clip to CA -## use terra to read in that file and clip to california -# soilgrids_california <- terra::crop(soilgrids_north_america, yolo_bbox) - -# convert polygons to points -ca_woody_pts <- ca_woody |> - sf::st_centroid() - -# read in the file -soilgrids_north_america_rast <- terra::rast(soilgrids_north_america_tif) - -ca_woody_sg <- extract_raster_values( - raster_path = soilgrids_north_america_tif, - points_df = ca_woody_pts -) |> -dplyr::rename(clay = raster_value) |> - dplyr::mutate(clay = clay/10) - -``` - -### Cal-Adapt - -#### Cal-Adapt Climate Regions - -```{r} -ca_albers_crs <- 3310 # use California Albers project (EPSG:3310) for speed, - # and so units are in meters - - -ca_climregions <- caladaptr::ca_aoipreset_geom("climregions") |> - sf::st_transform(crs = ca_albers_crs) |> - dplyr::rename(climregion_id = id, - climregion_name = name) -``` - -```{r join_climregions} -ca_woody_sg_cr <- ca_woody_sg |> - sf::st_transform(crs = ca_albers_crs) |> - sf::st_join(ca_climregions, join = st_intersects, left = TRUE) - -``` - -#### Cal-Adapt Climate - -##### Download Cal-Adapt Climate Rasters - -LOCA (CMIP5-based) - -```{r} -precip <- ca_fetch_raster_polygon( - polygon = ca_state_polygon|> st_make_valid(), - var = "pr", - gcm = "ens32avg", - scenario = "historical", - period = "30yavg", - #start_year = 2006, end_year = 2010, - out_dir = "data/caladapt/") - -tmin <- precip <- ca_fetch_raster_polygon( - polygon = ca_state_polygon |> st_make_valid(), - var = "tasmin", - gcm = "ens32avg", - scenario = "historical", - period = "30yavg", - #start_year = 2006, end_year = 2010, - out_dir = "data/caladapt/") -tmax <- precip <- ca_fetch_raster_polygon( - polygon = ca_state_polygon|> st_make_valid(), - var = "tasmax", - gcm = "ens32avg", - scenario = "historical", - period = "30yavg", - #start_year = 2006, end_year = 2010, - out_dir = "data/caladapt/") - -z <- terra::rast(caladapt_polygon_data$raster) - -z <- extract_raster_values( - raster_path = caladapt_polygon_data$raster, - points_df = ca_woody_pts -) |> - dplyr::rename(map = raster_value) -``` diff --git a/downscale/00-prepare.qmd b/downscale/00-prepare.qmd new file mode 100644 index 0000000..c71a48f --- /dev/null +++ b/downscale/00-prepare.qmd @@ -0,0 +1,403 @@ +--- +title: "Workflow Setup and Data Preparation" 
+format: html +author: David LeBauer +date: sys.Date() +--- + +# Overview + +- Prepare Inputs + - Harmonized LandIQ dataset of woody California cropland from 2016-2023 + - SoilGrids soil properties (clay, ?) + - CalAdapt climatology (mean annual temperature, mean annual precipitation) +- Use LandIQ to query covariates from SoilGrids and CalAdapt and create a table that includes crop type, soil properties, and climatology for each woody crop field + +## TODO + +- Use consistent projection(s): + - California Albers EPSG:33110 for joins + - WGS84 EPSG:4326 for plotting, subsetting rasters? +- Clean up domain code + +## Install & Load PEcAn + +See https://pecanproject.github.io/documentation/develop/ + +```{r} + +options(repos = c( + pecanproject = 'https://pecanproject.r-universe.dev', + ropensci = 'https://ropensci.r-universe.dev', + CRAN = 'https://cloud.r-project.org')) + +# install.packages("PEcAn.all") +## Required until https://github.com/UCANR-IGIS/caladaptr/pull/3 is merged +remotes::install_github("dlebauer/caladaptr") +library(PEcAn.all) +library(tidyverse) + +library(caladaptr) +library(sf) +library(terra) + +## Required until PR 3423is merged https://github.com/PecanProject/pecan/pull/3423 +# check if PR is merged +devtools::load_all("../pecan/modules/data.land") + +## Check available compute resources +benchmarkme::get_ram() +benchmarkme::get_cpu() + +``` + +## Organize Input Data + +### Domain Polygons + +- ca_convex_hull_reduced: a simplified convex hull for CA +- yolo_bbox: a smaller domain limited to Yolo County + +```{r eval = FALSE} +# remotes::install_github("ucanr-igis/caladaptr") +caladapt_domain <- caladaptr::ca_aoipreset_geom("counties") |> + sf::st_transform(4326) |> + sf::st_union() |> + sf::st_convex_hull() +st_write(caladapt_domain, "data/caladapt_domain_convex_hull.geojson") + +ca_counties_polygons <- ca_aoipreset_geom("counties") |> + dplyr::filter(state_name == "California") |> + dplyr::select(state_name, county_name = name, geom) |> + sf::st_transform(4326) + +ca_state_polygon <- ca_counties_polygons |> + group_by(state_name) |> + mutate(geom = sf::st_union(geom)) + +ca_state_polygon_simplified <- sf::st_simplify(ca_state_polygon, dTolerance = 5000) +file.remove("data/ca_state_polygon_simplified.geojson") +sf::st_write(ca_state_polygon_simplified, "data/ca_state_polygon_simplified.geojson") + +yolo_county_polygon <- ca_counties_polygons |> + filter(county_name=='Yolo') + +yolo_county_polygon_simplified <- sf::st_simplify(yolo_county_polygon, dTolerance = 5000) +sf::st_write(yolo_county_polygon_simplified, "data/yolo_county_polygon_simplified.geojson") +yolo_county_convex_hull <- sf::st_convex_hull(yolo_county_polygon_simplified) +# check if it is sufficiently simple to avoid unnecessary computational expensse +# st_coordinates(yolo_county_convex_hull) +ca_state_polygon <- sf::st_read("data/ca_convex_hull_reduced.geojson") + +``` + +```{r} +ca_state_polygon <- sf::st_read("data/ca_convex_hull_reduced.geojson") +yolo_county_polygon <- sf::st_read("data/yolo_county_polygon_simplified.geojson") +``` + +### LandIQ Woody Polygons + +##### Convert LandIQ to standard + +```{r eval=FALSE} +input_file = 'data/i15_Crop_Mapping_2016_SHP/i15_Crop_Mapping_2016.shp' +#input_file = "data/landiq_polygons.gpkg" +output_gpkg = 'data/ca_fields.gpkg' +output_csv = 'data/ca_field_attributes.csv' +debugonce(landiq2std) +#PEcAn.data.land:: +landiq2std(input_file, output_gpkg, output_csv) +``` + +##### Subset Fields + +```{r eval=FALSE} +## Subset woody fields + +# for development lets 
work with a subset +#con <- DBI::dbConnect(RSQLite::SQLite(), 'data/ca_fields.gpkg') +#ca_fields <- dplyr::tbl(con, "sites") +#query = "select * from sites where pft = 'woody perennial crop'" +#ca_fields <- dplyr::tbl(con, dbplyr::sql(query)) + +ca_fields <- sf::st_read("data/ca_fields.gpkg") +ca_attributes <- readr::read_csv("data/ca_field_attributes.csv") + +ca <- ca_fields |> + select(-lat, -lon) |> + dplyr::left_join(ca_attributes, by = "id") + +ca_woody <- ca |> + dplyr::filter(pft == "woody perennial crop") +sf::st_write(ca_woody, + "data/ca_woody.gpkg", delete_layer = TRUE) +``` + +#### Create a subset for dev & test + +```{r eval=FALSE} +set.seed(25) +ca_woody_subset <- ca_woody |> + dplyr::sample_n(200) + +sf::st_write(ca_woody_subset, + "data/ca_woody_subset.gpkg", delete_layer = TRUE) +``` + +### Woody Crop Polygons that will be used for subsetting during development + +```{r} +woody_gpkg <- "data/ca_woody.gpkg" # use **ca_woody_subset.gpkg** for testing +ca_woody <- sf::st_read(woody_gpkg) +``` + +### SoilGrids + +#### Download Soilgrids for California + +#### Code to download soilgrids + +```{r eval=FALSE} + +download_soilgrids_raster( + variables = c("clay", "sand"), + depths = c("0-5", "5-15"), + polygon = yolo_county_convex_hull, + output_dir = "~/soilgrids_out/" +) + +``` + +#### Load Prepared Soilgrids GeoTIFF + +Using already prepared SoilGrids layers + +```{r} +soilgrids_north_america_clay_tif <- '/projectnb/dietzelab/dongchen/anchorSites/NA_runs/soil_nc/soilgrids_250m/clay/clay_0-5cm_mean/clay/clay_0-5cm_mean.tif' +soilgrids_north_america_ocd_tif <- '/projectnb/dietzelab/dongchen/anchorSites/NA_runs/soil_nc/soilgrids_250m/ocd/ocd_0-5cm_mean/ocd/ocd_0-5cm_mean.tif' +## if we want to clip to CA +## use terra to read in that file and clip to california +# soilgrids_california <- terra::crop(soilgrids_north_america, yolo_bbox) + +# convert polygons to points +ca_woody_pts <- ca_woody |> + sf::st_centroid() + +# read in the file +soilgrids_north_america_rast <- terra::rast(soilgrids_north_america_tif) +``` + +#### Extract clay from SoilGrids + +```{r} +ca_woody_sg <- extract_raster_values( + raster_path = soilgrids_north_america_tif, + spatial_data = ca_woody_pts +) |> +dplyr::rename(clay = raster_value) |> + dplyr::mutate(clay = clay/10) + +``` + +### Topographic Wetness Index + +```{r} +twi_tiff <- '/projectnb/dietzelab/dongchen/anchorSites/downscale/TWI/TWI_resample.tiff' + +twi_raster <- terra::rast(twi_tiff) +twi <- twi_raster[['na_twi_500m']] +twi <- terra::extract(twi, vect(ca_woody_sg_twi_cr))[,2] +ca_woody_sg_twi <- ca_woody_sg |> + dplyr::mutate(twi = twi) + +``` + +### Cal-Adapt Climate Regions + +```{r} +ca_albers_crs <- 3310 # use California Albers project (EPSG:3310) for speed, + # and so units are in meters + + +ca_climregions <- caladaptr::ca_aoipreset_geom("climregions") |> + sf::st_transform(crs = ca_albers_crs) |> + dplyr::rename(climregion_id = id, + climregion_name = name) +``` + +```{r join_climregions} +ca_woody_sg_twi_cr <- ca_woody_sg |> + sf::st_transform(crs = ca_albers_crs) |> + sf::st_join(ca_climregions, join = st_intersects, left = TRUE) +# cache for dev. 
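# (sketch) later sessions can skip the extraction and join steps above by
# reloading this cache when it already exists:
# if (file.exists("data/ca_woody_sg_twi_cr.rda")) load("data/ca_woody_sg_twi_cr.rda")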
+save(ca_woody_sg_twi_cr, file = "data/ca_woody_sg_twi_cr.rda") +``` + +### GridMet + +```{r} +gridmet_dir <- "/projectnb/dietzelab/dongchen/anchorSites/NA_runs/GridMET/" + +# List all ERA5_met_*.tiff files for years 2012-2021 +raster_files <- list.files( + path = gridmet_dir, + pattern = "^ERA5_met_\\d{4}\\.tiff$", + full.names = TRUE +) + +plan(multisession, workers = 3) # Adjust the number of workers as needed + +# Read all rasters into a list of SpatRaster objects +rasters_list <- map( + raster_files, + ~ rast(.x)) + + +years <- map_chr(rasters_list, ~ { + source_path <- terra::sources(.x)[1] + str_extract(source_path, "\\d{4}") +}) |> + as.integer() + +names(rasters_list) <- years + +extract_temp_prec <- function(raster, year, points_sf) { + # Extract temperature and precipitation layers + temp <- raster[["temp"]] + prec <- raster[["prec"]] + + # Extract values for each point + # terra::extract returns a data frame with ID and extracted values + temp_vals <- terra::extract(temp, vect(points_sf))[,2] + prec_vals <- terra::extract(prec, vect(points_sf))[,2] + + # Combine into a tibble + tibble( + id = points_sf$id, + year = year, + mean_temp = temp_vals, + total_prec = prec_vals + ) +} + +climate_df <- rasters_list |> + imap_dfr(~ extract_temp_prec(.x, .y, ca_woody_pts)) + +climate_df2 <- climate_df |> + dplyr::mutate( + precip = PEcAn.utils::ud_convert(total_prec, "second-1", "year-1") +) |> + dplyr::group_by(id) |> + dplyr::summarise( + mean_temp = mean(mean_temp), + precip = mean(precip) + ) |> + dplyr::left_join(ca_woody_sg_twi_cr, by = "id") + +ca_woody_sg_twi_cr_climate <- climate_df2 |> + dplyr::select(id, mean_temp, precip, crop, clay, climregion_id) +save(ca_woody_sg_twi_cr_climate, file = "data/ca_woody_sg_twi_cr_climate.rda") +``` + +```` + + +### Cal-Adapt Climate + +#### Cal-Adapt Catalog + +See `ca_catalog_search('30yavg_ens32avg_historical')` + +## make a paginated table + +```{r} +library(dplyr) +ca_catalog_search('ens32avg') |> + reactable::reactable(searchable = TRUE, filterable = TRUE, defaultPageSize = 20) +```` + +##### Download Cal-Adapt Climate Rasters + +LOCA (CMIP5-based) + +```{r eval = FALSE} +#source(here::here("../pecan/modules/data.land/R/download_caladapt.R")) + +#gcm = "ens32avg" +gcm = "HadGEM2-ES" +scenario = "historical" +period = "year" +start_year = 1961 +end_year = 1990 +out_dir = "data/caladapt/" +polygon = ca_state_polygon <- sf::st_read("data/ca_convex_hull_reduced.geojson") + +vars <- c("pr", "tasmin", "tasmax") + +num_workers <- parallel::detectCores() +future::plan(future::multisession, + workers = 3)# num_workers +### Not working because Rasters are all 0???? 
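# One way to chase down the all-zero rasters noted above (a sketch, reusing the
# same arguments as the parallel call below): fetch a single variable serially
# and inspect the cell values, so any API or polygon problem is visible outside
# the furrr workers.
# test_fetch <- ca_fetch_raster_polygon(
#   polygon = polygon, var = "pr", gcm = gcm, scenario = scenario,
#   period = period, start_year = start_year, end_year = end_year,
#   out_dir = out_dir)
# terra::global(terra::rast(test_fetch$raster), "max", na.rm = TRUE)  # should not be 0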
+caladapt_rasters <- furrr::future_map( + vars, + ~ { + gcms <- gcms # required for parallel + set.seed(1) + ca_fetch_raster_polygon( + polygon = polygon, + var = .x, + gcm = gcm, + scenario = scenario, + period = period, + start_year = start_year, + end_year = end_year, + out_dir = out_dir + ) + }, + .options = furrr_options(seed = 123) +) + +# combine into a dataframe +caladapt_rasters_df <- caladapt_rasters |> + purrr::imap_dfr(~ mutate(.x, variable = .y)) |> + dplyr::group_by(variable) |> + dplyr::summarise(raster = list(terra::vrt(raster))) + +z <- caladapt_rasters_df |> + dplyr::group_by(variable) |> + dplyr::summarise(raster = extract_raster_values( + raster_path = raster, + spatial_data = ca_woody_pts + )) +ca_woody_pts_sg_cr_ca <-z + +extract_raster_values( + raster_path = caladapt_rasters_df$raster[1], + spatial_data = ca_woody_pts) + +climate_df <- furrr::future_map( + caladapt_rasters_df$raster, + ~ extract_raster_values( + raster_input = .x, # Pass SpatRaster object + spatial_data = ca_woody_pts, # Your spatial points sf object + value_colname_prefix = paste0("value_", .y), # Dynamic prefix based on variable + scaling_factor = 1 # Adjust if necessary + ), + .options = furrr_options(seed = 123) +) + + +``` + +##### Cal-Adapt 3.0 + +```{r} +my_data <- read_3km_hourly_climate_terra( + data_dir = "local_folder", + variable = "T2", + domain = "d03", + year = 2015, + n_cores = 8 + ) +``` diff --git a/downscale/01_cluster_and_select_anchorsites.qmd b/downscale/01_cluster_and_select_anchorsites.qmd new file mode 100644 index 0000000..d27417c --- /dev/null +++ b/downscale/01_cluster_and_select_anchorsites.qmd @@ -0,0 +1,82 @@ +--- +title: "Downscale Anchor Sites" +--- + +# Overview + +This workflow will: + +- Read in a dataset of site environmental data +- Perform K-means clustering to identify clusters +- Select anchor sites for each cluster + +## Setup + +```{r} +knitr::opts_chunk$set(echo = TRUE, message = FALSE, warning = FALSE) +library(sf) +library(terra) +library(tidyverse) +``` + +## Load Site Environmental Data + +```{r} + +load("data/ca_woody_sg_twi_cr_climate.rda") +site_env <- ca_woody_sg_twi_cr_climate +``` + +## Anchor Site Selection + +### K-means Clustering + +K-means on the numeric columns (temp, precip, clay, possibly ignoring 'crop' +or treat 'crop' as categorical by some encoding if needed) +For demonstration, let's assume numeric + factor encoding for crop + +```{r} + +``` + +crop_ids <- site_env |> +distinct(crop) |> +mutate(crop_code = as.integer(as.factor(crop))) + +tmp <- site_env |> +left_join(crop_ids, by = "crop") |> +select(id, mean_temp, precip, clay, climregion_id, crop_code, twi) + +data_for_clust_with_ids <- tmp |> +na.omit() + +data_for_clust <- data_for_clust_with_ids |> +select(- id) + +```` + +```{r} +set.seed(123) + +km_result <- kmeans(data_for_clust, centers = 10) + +data_for_clust_with_ids$cluster <- km_result$cluster + +site_env_with_clusters <- data_for_clust_with_ids |> + select(id, cluster) |> + left_join(site_env, by = "id") +```` + +### Anchor Site Selection + +TODO: Come up with reasonable way to select anchor sites + +Anchor site selection (one anchor per cluster) +e.g., ??????? pick the polygon with median temp in each cluster + +```{r} +# select representative points from each cluster based on multiple environmental variables +anchor_sites <- site_env_with_clusters |> + group_by(cluster) |> + ??????? 
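  # One candidate rule to replace the placeholder above (a sketch, not a
  # decision): keep the field whose mean temperature sits closest to the
  # cluster median, e.g.
  #   slice_min(abs(mean_temp - median(mean_temp)), n = 1, with_ties = FALSE)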
+``` From d4a77536f391bbc8bc7acee7c6dccdf0f02dbb8c Mon Sep 17 00:00:00 2001 From: David LeBauer Date: Mon, 3 Feb 2025 02:27:06 -0500 Subject: [PATCH 03/49] major revisions to prepare and clustering workflows --- downscale/00-prepare.qmd | 280 ++++++++++------ .../01_cluster_and_select_anchorsites.qmd | 82 ----- .../01_cluster_and_select_design_points.qmd | 302 ++++++++++++++++++ 3 files changed, 482 insertions(+), 182 deletions(-) delete mode 100644 downscale/01_cluster_and_select_anchorsites.qmd create mode 100644 downscale/01_cluster_and_select_design_points.qmd diff --git a/downscale/00-prepare.qmd b/downscale/00-prepare.qmd index c71a48f..8ee6a5e 100644 --- a/downscale/00-prepare.qmd +++ b/downscale/00-prepare.qmd @@ -19,6 +19,8 @@ date: sys.Date() - California Albers EPSG:33110 for joins - WGS84 EPSG:4326 for plotting, subsetting rasters? - Clean up domain code +- Create a bunch of tables and join all at once at the end +- Disambiguate the use of 'ca' in object names; currently refers to both California and Cal-Adapt ## Install & Load PEcAn @@ -33,7 +35,7 @@ options(repos = c( # install.packages("PEcAn.all") ## Required until https://github.com/UCANR-IGIS/caladaptr/pull/3 is merged -remotes::install_github("dlebauer/caladaptr") +# remotes::install_github("dlebauer/caladaptr") library(PEcAn.all) library(tidyverse) @@ -48,6 +50,14 @@ devtools::load_all("../pecan/modules/data.land") ## Check available compute resources benchmarkme::get_ram() benchmarkme::get_cpu() +no_cores <- parallel::detectCores(logical = FALSE) +future_multi <- ifelse(future::supportsMulticore(), + future::plan(future::multicore), + future::plan(future::multisession)) +future::plan(future_multi, + workers = 16) + + ``` @@ -55,17 +65,30 @@ benchmarkme::get_cpu() ### Domain Polygons +Here we are generating domain polygons that will be used for subsetting. +These are converted to convex hulls and simplified for computational efficiency. +The Yolo County domain is a smaller domain that can be used for testing and debugging. 
+ +These include: + +- caladapt_domain_convex_hull: a convex hull for the Cal-Adapt domain +- ca_convex_hull: a convex hull around CA +- ca_state_polygon_simplified: a simplified convex hull for CA +- ca_state_polygon: a convex hull for CA + - ca_convex_hull_reduced: a simplified convex hull for CA - yolo_bbox: a smaller domain limited to Yolo County ```{r eval = FALSE} # remotes::install_github("ucanr-igis/caladaptr") +## Cal-Adapt Domain caladapt_domain <- caladaptr::ca_aoipreset_geom("counties") |> sf::st_transform(4326) |> sf::st_union() |> sf::st_convex_hull() st_write(caladapt_domain, "data/caladapt_domain_convex_hull.geojson") +## California State ca_counties_polygons <- ca_aoipreset_geom("counties") |> dplyr::filter(state_name == "California") |> dplyr::select(state_name, county_name = name, geom) |> @@ -79,6 +102,7 @@ ca_state_polygon_simplified <- sf::st_simplify(ca_state_polygon, dTolerance = 50 file.remove("data/ca_state_polygon_simplified.geojson") sf::st_write(ca_state_polygon_simplified, "data/ca_state_polygon_simplified.geojson") +## Yolo County yolo_county_polygon <- ca_counties_polygons |> filter(county_name=='Yolo') @@ -87,6 +111,7 @@ sf::st_write(yolo_county_polygon_simplified, "data/yolo_county_polygon_simplifie yolo_county_convex_hull <- sf::st_convex_hull(yolo_county_polygon_simplified) # check if it is sufficiently simple to avoid unnecessary computational expensse # st_coordinates(yolo_county_convex_hull) + ca_state_polygon <- sf::st_read("data/ca_convex_hull_reduced.geojson") ``` @@ -98,34 +123,33 @@ yolo_county_polygon <- sf::st_read("data/yolo_county_polygon_simplified.geojson" ### LandIQ Woody Polygons -##### Convert LandIQ to standard +The first step is to convert LandIQ to a open, standard format. + +We will use a GeoPackage file to store geospatial information and an associated CSV file with attributes. + +The landiq2std function will be added to the PEcAn.data.land package, and has been implemented in a Pull Request https://github.com/PecanProject/pecan/pull/3423. The function is a work in progress. Two key work to be done. First `landiq2std` does not currently perform all steps to get from the original LandIQ format to the standard format - some steps related to harmonizing LandIQ across years have been completed manually. Second, the PEcAn 'standard' for such data is under development as we migrate from a Postgres database to a more portable GeoPackage + CSV format. ```{r eval=FALSE} input_file = 'data/i15_Crop_Mapping_2016_SHP/i15_Crop_Mapping_2016.shp' -#input_file = "data/landiq_polygons.gpkg" output_gpkg = 'data/ca_fields.gpkg' output_csv = 'data/ca_field_attributes.csv' -debugonce(landiq2std) -#PEcAn.data.land:: + landiq2std(input_file, output_gpkg, output_csv) ``` -##### Subset Fields +##### Subset Woody Perennial Crop Fields -```{r eval=FALSE} -## Subset woody fields +Phase 1 focuses on Woody Perennial Crop fields. -# for development lets work with a subset -#con <- DBI::dbConnect(RSQLite::SQLite(), 'data/ca_fields.gpkg') -#ca_fields <- dplyr::tbl(con, "sites") -#query = "select * from sites where pft = 'woody perennial crop'" -#ca_fields <- dplyr::tbl(con, dbplyr::sql(query)) +Next, we will subset the LandIQ data to only include woody perennial crop fields. +At the same time we will calculate the total percent of California Croplands that are woody perennial crop. 
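
Before the join below, a quick consistency check can catch fields that are present in one file but not the other. This is a sketch that assumes both files were written by `landiq2std()` above and share an `id` column.

```{r eval=FALSE}
# Sketch: confirm the geometry layer and the attribute table describe the same
# set of fields before relying on a left join by id.
gpkg_ids <- sf::st_read("data/ca_fields.gpkg", quiet = TRUE)$id
csv_ids <- readr::read_csv("data/ca_field_attributes.csv", show_col_types = FALSE)$id
setdiff(gpkg_ids, csv_ids) # geometry without attributes (expect none)
setdiff(csv_ids, gpkg_ids) # attributes without geometry (expect none)
```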
+```{r eval=FALSE} ca_fields <- sf::st_read("data/ca_fields.gpkg") ca_attributes <- readr::read_csv("data/ca_field_attributes.csv") ca <- ca_fields |> - select(-lat, -lon) |> + dplyr::select(-lat, -lon) |> dplyr::left_join(ca_attributes, by = "id") ca_woody <- ca |> @@ -134,7 +158,26 @@ sf::st_write(ca_woody, "data/ca_woody.gpkg", delete_layer = TRUE) ``` -#### Create a subset for dev & test +Now, calculate percent of California croplands that are woody perennial crops, in order to estimate the +number of design points that will be selected in the clustering step. + + +```{r} +system.time( +pft_area <- ca |> + dplyr::select(id, pft, area_ha) |> + dtplyr::lazy_dt() |> + dplyr::mutate(woody_indicator = ifelse(pft == "woody perennial crop", 1L, 0L)) |> + dplyr::group_by(woody_indicator) |> + dplyr::summarize(pft_area = sum(area_ha)) +) + + +``` + +### Subset California Woody Crop Fields for development & testting + +Now, create a subset of the California Woody Crop Fields for development & testting ```{r eval=FALSE} set.seed(25) @@ -145,30 +188,48 @@ sf::st_write(ca_woody_subset, "data/ca_woody_subset.gpkg", delete_layer = TRUE) ``` -### Woody Crop Polygons that will be used for subsetting during development +### Convert Points to Polygons. + +For Phase 1, we will use points to query raster data. +In later phases we will evaluate the performance of polygons and how querying environmental data using polygons will affect the performance of clustering and downscaling algorithms. ```{r} -woody_gpkg <- "data/ca_woody.gpkg" # use **ca_woody_subset.gpkg** for testing -ca_woody <- sf::st_read(woody_gpkg) +woody_gpkg <- "data/ca_woody.gpkg" # use **ca_woody_subset.gpkg** for development and testing + +# load fields, convert polygons to points, and subset +ca_woody_pts <- sf::st_read(woody_gpkg) |> + sf::st_centroid() |> + # and keep only the columns we need + dplyr::select(id, crop, pft, geom) + ``` -### SoilGrids +## Anchor Sites -#### Download Soilgrids for California +```{r} +# Anchor sites from UC Davis, UC Riverside, and Ameriflux. 
+anchor_sites <- readr::read_csv("data/anchor_sites.csv") -#### Code to download soilgrids +anchor_sites_pts <- anchor_sites |> + sf::st_as_sf(coords = c("lon", "lat"), crs = 4326) -```{r eval=FALSE} +# Join with ca_fields: keep only the rows associated with anchor sites +# takes ~ 1 min on BU cluster w/ "Intel(R) Xeon(R) CPU E5-2670 0 @ 2.60GHz" +anchor_sites_with_ids <- anchor_sites_pts |> + # spatial join find ca_fields that contain anchor site points + sf::st_join(ca_fields, join = sf::st_within) -download_soilgrids_raster( - variables = c("clay", "sand"), - depths = c("0-5", "5-15"), - polygon = yolo_county_convex_hull, - output_dir = "~/soilgrids_out/" -) +sf::st_write(anchor_sites_with_ids |> + dplyr::select(id, lat, lon, location, site_name, crops, pft), + dsn = "data/anchor_sites_ids.csv", + delete_layer = TRUE) ``` +## Environmental Covariates + +### SoilGrids + #### Load Prepared Soilgrids GeoTIFF Using already prepared SoilGrids layers @@ -180,23 +241,33 @@ soilgrids_north_america_ocd_tif <- '/projectnb/dietzelab/dongchen/anchorSites/NA ## use terra to read in that file and clip to california # soilgrids_california <- terra::crop(soilgrids_north_america, yolo_bbox) -# convert polygons to points -ca_woody_pts <- ca_woody |> - sf::st_centroid() - # read in the file -soilgrids_north_america_rast <- terra::rast(soilgrids_north_america_tif) +# read two layers +soilgrids_north_america_clay_rast <- terra::rast(soilgrids_north_america_clay_tif) +soilgrids_north_america_ocd_rast <- terra::rast(soilgrids_north_america_ocd_tif) + ``` #### Extract clay from SoilGrids ```{r} -ca_woody_sg <- extract_raster_values( - raster_path = soilgrids_north_america_tif, - spatial_data = ca_woody_pts -) |> -dplyr::rename(clay = raster_value) |> - dplyr::mutate(clay = clay/10) + +clay <- (terra::extract(soilgrids_north_america_clay_rast, terra::vect(ca_woody_pts)) |> + dplyr::select(-ID)) |> + dplyr::pull()/ 10 + +ocd <- terra::extract(soilgrids_north_america_ocd_rast, terra::vect(ca_woody_pts_clay)) |> + dplyr::select(-ID) |> + dplyr::pull() + +ca_woody_pts_clay_ocd <- cbind(ca_woody_pts, + clay = clay, + ocd = ocd) + +required_cols <- c("id", "crop", "pft", "clay", "ocd", "geom") +assertthat::assert_that( + all(required_cols %in% colnames(ca_woody_pts_clay_ocd)), + msg = "ca_woody_pts_clay_ocd is missing expected columns") ``` @@ -204,34 +275,38 @@ dplyr::rename(clay = raster_value) |> ```{r} twi_tiff <- '/projectnb/dietzelab/dongchen/anchorSites/downscale/TWI/TWI_resample.tiff' +twi_rast <- terra::rast(twi_tiff) + +twi <- terra::extract(twi_rast, vect(ca_woody_pts)) |> + select(-ID) |> + dplyr::pull() -twi_raster <- terra::rast(twi_tiff) -twi <- twi_raster[['na_twi_500m']] -twi <- terra::extract(twi, vect(ca_woody_sg_twi_cr))[,2] -ca_woody_sg_twi <- ca_woody_sg |> - dplyr::mutate(twi = twi) +ca_woody_pts_clay_ocd_twi <- cbind(ca_woody_pts_clay_ocd, twi = twi) ``` ### Cal-Adapt Climate Regions -```{r} +```{r get_climregions} + ca_albers_crs <- 3310 # use California Albers project (EPSG:3310) for speed, # and so units are in meters - + # required that crs(x) == crs(y) for st_join ca_climregions <- caladaptr::ca_aoipreset_geom("climregions") |> sf::st_transform(crs = ca_albers_crs) |> dplyr::rename(climregion_id = id, climregion_name = name) +save(ca_climregions, file = "data/ca_climregions.rda") ``` ```{r join_climregions} -ca_woody_sg_twi_cr <- ca_woody_sg |> - sf::st_transform(crs = ca_albers_crs) |> +ca_woody_pts_clay_ocd_twi_cr <- ca_woody_pts_clay_ocd_twi |> + sf::st_transform(., crs = 
ca_albers_crs) |> sf::st_join(ca_climregions, join = st_intersects, left = TRUE) -# cache for dev. -save(ca_woody_sg_twi_cr, file = "data/ca_woody_sg_twi_cr.rda") + +# convenience cache. +save(ca_woody_pts_clay_ocd_twi_cr, file = "ca_woody_pts_clay_ocd_twi_cr.rda") ``` ### GridMet @@ -246,14 +321,11 @@ raster_files <- list.files( full.names = TRUE ) -plan(multisession, workers = 3) # Adjust the number of workers as needed - # Read all rasters into a list of SpatRaster objects rasters_list <- map( raster_files, ~ rast(.x)) - years <- map_chr(rasters_list, ~ { source_path <- terra::sources(.x)[1] str_extract(source_path, "\\d{4}") @@ -262,46 +334,67 @@ years <- map_chr(rasters_list, ~ { names(rasters_list) <- years -extract_temp_prec <- function(raster, year, points_sf) { - # Extract temperature and precipitation layers - temp <- raster[["temp"]] - prec <- raster[["prec"]] - - # Extract values for each point - # terra::extract returns a data frame with ID and extracted values - temp_vals <- terra::extract(temp, vect(points_sf))[,2] - prec_vals <- terra::extract(prec, vect(points_sf))[,2] - - # Combine into a tibble - tibble( - id = points_sf$id, - year = year, - mean_temp = temp_vals, - total_prec = prec_vals - ) +extract_clim <- function(raster, points_sf) { + terra::extract(raster, points_sf) |> + tibble::as_tibble() |> + select(-ID) |> + mutate(id = points_sf$id) |> + select(id, temp, prec, srad, vapr) } -climate_df <- rasters_list |> - imap_dfr(~ extract_temp_prec(.x, .y, ca_woody_pts)) +.tmp <- list(rasters_list) |> + furrr::future_map_dfr( + ~ extract_clim(.x, ca_woody_pts), + .id = "year", + .options = furrr::furrr_options(seed = 123)) -climate_df2 <- climate_df |> + +clim_summaries <- .tmp |> dplyr::mutate( - precip = PEcAn.utils::ud_convert(total_prec, "second-1", "year-1") + precip = PEcAn.utils::ud_convert(prec, "second-1", "year-1") ) |> dplyr::group_by(id) |> dplyr::summarise( - mean_temp = mean(mean_temp), - precip = mean(precip) - ) |> - dplyr::left_join(ca_woody_sg_twi_cr, by = "id") - -ca_woody_sg_twi_cr_climate <- climate_df2 |> - dplyr::select(id, mean_temp, precip, crop, clay, climregion_id) -save(ca_woody_sg_twi_cr_climate, file = "data/ca_woody_sg_twi_cr_climate.rda") + mean_temp = mean(temp), + precip = mean(precip), + srad = mean(srad), + vapr = mean(vapr) + ) +``` + +## Prepare Dataset for Clustering + +First, we will turn crop names into IDs to support hierarchical clustering + +```{r} +crop_ids <- ca_woody_pts |> + distinct(crop) |> + mutate(crop_id = as.integer(as.factor(crop))) |> + write_csv("data/crop_ids.csv") ``` -```` +```{r join_and_subset} +.all <- clim_summaries |> + dplyr::left_join(ca_woody_pts_clay_ocd_twi_cr, by = "id") |> + dplyr::left_join(crop_ids, by = "crop") +assertthat::assert_that(nrow(.all) == nrow(clim_summaries) && nrow(.all) == nrow(ca_woody_pts_clay_ocd_twi_cr), + msg = "join was not 1:1 as expected") +assertthat::assert_that(ncol(.all) == ncol(ca_woody_pts_clay_ocd_twi_cr) + ncol(clim_summaries) - 1, + msg = "lost some columns in the join") + +glimpse(.all) +skimr::skim(.all) + +data_for_clust_with_ids <- .all |> + dplyr::select(id, mean_temp, precip, crop_id, clay, ocd, twi, climregion_id) |> + na.omit() |> + mutate(across(where(is.numeric), ~ signif(., digits = 3))) + +save(data_for_clust_with_ids, file = "data/data_for_clust_with_ids.rda") +``` + +# Additional Code Unused ### Cal-Adapt Climate @@ -315,7 +408,7 @@ See `ca_catalog_search('30yavg_ens32avg_historical')` library(dplyr) ca_catalog_search('ens32avg') |> 
reactable::reactable(searchable = TRUE, filterable = TRUE, defaultPageSize = 20) -```` +``` ##### Download Cal-Adapt Climate Rasters @@ -336,14 +429,13 @@ polygon = ca_state_polygon <- sf::st_read("data/ca_convex_hull_reduced.geojson") vars <- c("pr", "tasmin", "tasmax") num_workers <- parallel::detectCores() -future::plan(future::multisession, +future::plan(future_multi, workers = 3)# num_workers ### Not working because Rasters are all 0???? caladapt_rasters <- furrr::future_map( vars, ~ { gcms <- gcms # required for parallel - set.seed(1) ca_fetch_raster_polygon( polygon = polygon, var = .x, @@ -366,24 +458,12 @@ caladapt_rasters_df <- caladapt_rasters |> z <- caladapt_rasters_df |> dplyr::group_by(variable) |> - dplyr::summarise(raster = extract_raster_values( - raster_path = raster, - spatial_data = ca_woody_pts - )) + dplyr::summarise(raster = terra::extract(raster, terra::vect(ca_woody_pts))) ca_woody_pts_sg_cr_ca <-z -extract_raster_values( - raster_path = caladapt_rasters_df$raster[1], - spatial_data = ca_woody_pts) - climate_df <- furrr::future_map( caladapt_rasters_df$raster, - ~ extract_raster_values( - raster_input = .x, # Pass SpatRaster object - spatial_data = ca_woody_pts, # Your spatial points sf object - value_colname_prefix = paste0("value_", .y), # Dynamic prefix based on variable - scaling_factor = 1 # Adjust if necessary - ), + ~ terra::extract(.x, terra::vect(ca_woody_pts)), .options = furrr_options(seed = 123) ) diff --git a/downscale/01_cluster_and_select_anchorsites.qmd b/downscale/01_cluster_and_select_anchorsites.qmd deleted file mode 100644 index d27417c..0000000 --- a/downscale/01_cluster_and_select_anchorsites.qmd +++ /dev/null @@ -1,82 +0,0 @@ ---- -title: "Downscale Anchor Sites" ---- - -# Overview - -This workflow will: - -- Read in a dataset of site environmental data -- Perform K-means clustering to identify clusters -- Select anchor sites for each cluster - -## Setup - -```{r} -knitr::opts_chunk$set(echo = TRUE, message = FALSE, warning = FALSE) -library(sf) -library(terra) -library(tidyverse) -``` - -## Load Site Environmental Data - -```{r} - -load("data/ca_woody_sg_twi_cr_climate.rda") -site_env <- ca_woody_sg_twi_cr_climate -``` - -## Anchor Site Selection - -### K-means Clustering - -K-means on the numeric columns (temp, precip, clay, possibly ignoring 'crop' -or treat 'crop' as categorical by some encoding if needed) -For demonstration, let's assume numeric + factor encoding for crop - -```{r} - -``` - -crop_ids <- site_env |> -distinct(crop) |> -mutate(crop_code = as.integer(as.factor(crop))) - -tmp <- site_env |> -left_join(crop_ids, by = "crop") |> -select(id, mean_temp, precip, clay, climregion_id, crop_code, twi) - -data_for_clust_with_ids <- tmp |> -na.omit() - -data_for_clust <- data_for_clust_with_ids |> -select(- id) - -```` - -```{r} -set.seed(123) - -km_result <- kmeans(data_for_clust, centers = 10) - -data_for_clust_with_ids$cluster <- km_result$cluster - -site_env_with_clusters <- data_for_clust_with_ids |> - select(id, cluster) |> - left_join(site_env, by = "id") -```` - -### Anchor Site Selection - -TODO: Come up with reasonable way to select anchor sites - -Anchor site selection (one anchor per cluster) -e.g., ??????? pick the polygon with median temp in each cluster - -```{r} -# select representative points from each cluster based on multiple environmental variables -anchor_sites <- site_env_with_clusters |> - group_by(cluster) |> - ??????? 
-``` diff --git a/downscale/01_cluster_and_select_design_points.qmd b/downscale/01_cluster_and_select_design_points.qmd new file mode 100644 index 0000000..5179f87 --- /dev/null +++ b/downscale/01_cluster_and_select_design_points.qmd @@ -0,0 +1,302 @@ +--- +title: "Cluster and Select Design Points" +--- + +# Overview + +This workflow will: + +- Read in a dataset of site environmental data +- Perform K-means clustering to identify clusters +- Select anchor sites for each cluster + +## Setup + +```{r} +knitr::opts_chunk$set(echo = TRUE, message = FALSE, warning = FALSE) +# general utilities +library(tidyverse) + +# spatial +library(sf) +library(terra) + +# parallel computing +library(cluster) +library(factoextra) +library(pathviewr) +library(furrr) +library(doParallel) +library(dplyr) + +# Set up parallel processing with a safe number of cores +no_cores <- parallel::detectCores(logical = FALSE) +plan(multicore, workers = no_cores - 2) +options(future.globals.maxSize = benchmarkme::get_ram() * 0.9) +``` + +## Load Site Environmental Data + +Environmental data was pre-processed in the previous workflow 00-prepare.qmd. + +```{r} + +load("data/data_for_clust_with_ids.rda", verbose = TRUE) +``` + +Below are summary statistics of the dataset + +- id: Unique identifier for each LandIQ polygon +- temp: Mean Annual Temperature from ERA5 +- precip: Mean Annual Precipitation from ERA5 +- clay: Clay content from SoilGrids +- ocd: Organic Carbon content from SoilGrids +- twi: Topographic Wetness Index +- crop_id: identifier for crop type, see table in crop_ids.csv +- climregion_id: Climate Regions as defined by CalAdapt identifier for climate region, see table in climregion_ids.csv + +```{r} +skimr::skim(data_for_clust) +``` + +## Anchor Site Selection + +```{r} +## Get Anchor Sites from UC Davis, UC Riverside, and Ameriflux. +woody_anchor_sites <- readr::read_csv("data/anchor_sites_ids.csv") |> + dplyr::filter(pft == "woody perennial crop") +anchorsites_for_clust <- + data_for_clust_with_ids |> + dplyr::filter(id %in% woody_anchor_sites$id) + +message("Anchor sites included in final selection:") +knitr::kable(woody_anchor_sites |> dplyr::left_join(anchorsites_for_clust, by = 'id')) +``` + +### Subset LandIQ fields for clustering + +K-means on the numeric columns (temp, precip, clay, possibly ignoring 'crop' +or treat 'crop' as categorical by some encoding if needed). + +```{r} +set.seed(42) # Set seed for random number generator for reproducibility + +# subsample for testing (full dataset exceeds available Resources) +sample_size <- 20000 +data_for_clust <- data_for_clust_with_ids |> + # remove anchor sites + dplyr::filter(!id %in% anchorsites_for_clust$id) |> + sample_n(sample_size - nrow(anchorsites_for_clust)) |> + # row bind anchorsites_for_clust + bind_rows(anchorsites_for_clust) |> + dplyr::mutate(crop_id = factor(crop_id), + climregion_id = factor(climregion_id)) +assertthat::assert_that(nrow(data_for_clust) == sample_size) + +``` + +### K-means Clustering + +First, create a function to perform hierarchical k-means and find optimal clusters. 
+ +```{r} + +perform_clustering <- function(data) { + # Select numeric variables for clustering + clust_data <- data |> select(where(is.numeric)) + + # Standardize data + clust_data_scaled <- scale(clust_data) + + # Determine optimal number of clusters using elbow method + k_range <- 3:12 + tot.withinss <- future_map_dbl(k_range, function(k) { + model <- hkmeans(clust_data_scaled, k) + model$tot.withinss + }, .options = furrr_options(seed = TRUE)) + + # Find elbow point + elbow_df <- data.frame(k = k_range, tot.withinss = tot.withinss) + optimal_k <- find_curve_elbow(elbow_df) + message("Optimal number of clusters determined: ", optimal_k) + + # Plot elbow method results + elbow_plot <- ggplot(elbow_df, aes(x = k, y = tot.withinss)) + + geom_line() + + geom_point() + + labs(title = "Elbow Method for Optimal k", x = "Number of Clusters", y = "Total Within-Cluster Sum of Squares") + print(elbow_plot) + + # Compute silhouette scores to validate clustering quality + silhouette_scores <- future_map_dbl(k_range, function(k) { + model <- hkmeans(clust_data_scaled, k) + mean(silhouette(model$cluster, dist(clust_data_scaled))[, 3]) + }, .options = furrr_options(seed = TRUE)) + + silhouette_df <- data.frame(k = k_range, silhouette = silhouette_scores) + + message("Silhouette scores computed. Higher values indicate better-defined clusters.") + print(silhouette_df) + + silhouette_plot <- ggplot(silhouette_df, aes(x = k, y = silhouette)) + + geom_line(color = "red") + + geom_point(color = "red") + + labs(title = "Silhouette Scores for Optimal k", x = "Number of Clusters", y = "Silhouette Score") + print(silhouette_plot) + + # Perform hierarchical k-means clustering with optimal k + final_hkmeans <- hkmeans(clust_data_scaled, optimal_k) + data$cluster <- final_hkmeans$cluster + + return(data) +} +``` + +```{r} +# Apply clustering function to the sampled dataset in parallel +data_clustered <- perform_clustering(data_for_clust) +save(data_clustered, file = "cache/data_clustered.rda") +``` + +### Check Clustering + +```{r} +#load("cache/data_clustered.rda") +# Summarize clusters +cluster_summary <- data_clustered |> + group_by(cluster) |> + summarise(across(where(is.numeric), mean, na.rm = TRUE)) +# use ggplot to plot all pairwise numeric variables + +library(GGally) +data_clustered |> + sample_n(1000) |> + ggpairs(columns=c(1,2,4,5,6)+1, + mapping = aes(color = as.factor(cluster), alpha = 0.8))+ + theme_minimal() + +ggplot(data = cluster_summary, aes(x = cluster)) + + geom_line(aes(y = temp, color = "temp")) + + geom_line(aes(y = precip, color = "precip")) + + geom_line(aes(y = clay, color = "clay")) + + geom_line(aes(y = ocd, color = "ocd")) + + geom_line(aes(y = twi, color = "twi")) + + labs(x = "Cluster", y = "Value", color = "Variable") + +knitr::kable(cluster_summary |> round(0)) + +``` + +```{r} +# Check stratification of clusters by categorical factors + +# cols should be character, factor +crop_ids <- read_csv("data/crop_ids.csv", + col_types = cols( + crop_id = col_factor(), + crop = col_character())) +climregion_ids <- read_csv("data/climregion_ids.csv", + col_types = cols( + climregion_id = col_factor(), + climregion_name = col_character() + )) +data_clustered2 <- data_clustered |> + left_join(crop_ids, by = "crop_id") |> + left_join(climregion_ids, by = "climregion_id") + +factor_stratification <- list( + crop_id = table(data_clustered2$cluster, data_clustered2$crop), + climregion_id = table(data_clustered2$cluster, data_clustered2$climregion_name)) + +lapply(factor_stratification, 
knitr::kable) +# Shut down parallel backend +plan(sequential) +``` + +## Design Point Selection + +For phase 1b we need to supply design points for SIPNET runs. For development we will use 100 design points from the clustered dataset that are _not_ already anchor sites. + +For the final high resolution runs we expect to use approximately 10,000 design points. +For woody croplands, we will start with a number proportional to the total number of sites with woody perennial pfts. + +```{r} +# From the clustered data, remove anchor sites to avoid duplicates in design point selection. +set.seed(19871023) +design_points_ids <- data_clustered |> + filter(!id %in% woody_anchor_sites$id) |> + select(id) |> + sample_n(100 - nrow(woody_anchor_sites)) |> + select(id) + +anchor_site_ids <- woody_anchor_sites |> + select(id) + +if(!exists("ca_fields")) { + ca_fields <- sf::st_read("data/ca_fields.gpkg") +} + +final_design_points <- bind_rows(design_points_ids, + anchor_site_ids) |> + left_join(ca_fields, by = "id") + +final_design_points |> + as_tibble() |> + select(id, lat, lon) |> + write_csv("data/final_design_points.csv") + +``` + +Now some analysis of how these design points are distributed + +```{r} +# plot map of california and climregions +load("data/ca_climregions.rda") +final_design_points_clust <- final_design_points |> + left_join(data_clustered, by = "id") |> + select(lat, lon, cluster) |> + mutate(cluster = as.factor(cluster)) |> + st_as_sf(coords = c("lon", "lat"), crs = 4326) + +ca_fields_pts <- ca_fields |> + st_as_sf(coords = c("lon", "lat"), crs = 4326) +ggplot() + + geom_sf(data = ca_climregions, aes(fill = climregion_name), alpha = 0.5) + + labs(color = "Climregion") + + theme_minimal() + + geom_sf(data = final_design_points_clust, aes(shape = cluster)) + + geom_sf(data = ca_fields_pts, fill = 'black', color = "grey", alpha = 0.5) + + + +``` + +## Woody Cropland Proportion + +Here we calculate percent of California croplands that are woody perennial crops, in order to estimate the number of design points that will be selected in the clustering step + +```{r} +ca <- sf::st_read("data/ca_fields.gpkg") |> + dplyr::select(-lat, -lon) |> + dplyr::left_join(readr::read_csv("data/ca_field_attributes.csv"), by = "id") + +system.time( +pft_area <- ca |> + dplyr::sample_n(200) |> + dplyr::select(id, pft, area_ha) |> + dtplyr::lazy_dt() |> + dplyr::mutate(woody_indicator = ifelse(pft == "woody perennial crop", 1L, 0L)) |> + dplyr::group_by(woody_indicator) |> + dplyr::summarize(pft_area = sum(area_ha)) +) +# now calculate sum of pft_area and the proportion of woody perennial crops +pft_area <- pft_area |> + dplyr::mutate(total_area = sum(pft_area)) |> + dplyr::mutate(area_pct = round(100 * pft_area / total_area)) |> + select(-total_area, -pft_area) |> + dplyr::rename("Woody Crops" = woody_indicator, "Area %" = area_pct) |> + kableExtra::kable() + +``` + +Approximately `r pft_area|> filter(`Woody Crops` == 1) |> pull(`Area %`)` of California croplands are woody perennial crops. 
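
A rough way to turn that share into a design-point budget (a sketch only: the 10,000 total is the planning figure mentioned above, and `woody_area_pct` is a placeholder to be replaced by the value computed from `pft_area`, kept as a data frame rather than piped into `kable()`):

```{r eval=FALSE}
total_design_points <- 10000 # statewide planning figure from the text above
woody_area_pct <- 35 # placeholder; substitute the computed woody area share
n_woody_design_points <- round(total_design_points * woody_area_pct / 100)
n_woody_design_points
```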
From 4369a2f6af833281993ba329c51310dc881391f9 Mon Sep 17 00:00:00 2001 From: David LeBauer Date: Mon, 3 Feb 2025 02:29:52 -0500 Subject: [PATCH 04/49] very rough draft for design point simulations --- downscale/02_design_point_simulations.qmd | 11 +++++++++++ 1 file changed, 11 insertions(+) create mode 100644 downscale/02_design_point_simulations.qmd diff --git a/downscale/02_design_point_simulations.qmd b/downscale/02_design_point_simulations.qmd new file mode 100644 index 0000000..3773c51 --- /dev/null +++ b/downscale/02_design_point_simulations.qmd @@ -0,0 +1,11 @@ +--- +title: "Untitled" +format: html +--- + +# Overview + +This workflow will: + +- Use SIPNET to simulate SOC and biomass for each design point. +- generate a dataframe with site_id, lat, lon, soil carbon, biomass From 5b205123f96f95b1e0ec60d4c55377a470c40697 Mon Sep 17 00:00:00 2001 From: David LeBauer Date: Wed, 5 Feb 2025 04:13:21 -0500 Subject: [PATCH 05/49] Updated data preparation and clustering workflows so that they now compile --- downscale/00-prepare.qmd | 50 +++-- .../01_cluster_and_select_design_points.qmd | 173 ++++++++++++------ 2 files changed, 146 insertions(+), 77 deletions(-) diff --git a/downscale/00-prepare.qmd b/downscale/00-prepare.qmd index 8ee6a5e..84bcaa8 100644 --- a/downscale/00-prepare.qmd +++ b/downscale/00-prepare.qmd @@ -1,10 +1,9 @@ --- title: "Workflow Setup and Data Preparation" -format: html -author: David LeBauer -date: sys.Date() +author: "David LeBauer" --- + # Overview - Prepare Inputs @@ -42,20 +41,19 @@ library(tidyverse) library(caladaptr) library(sf) library(terra) - ## Required until PR 3423is merged https://github.com/PecanProject/pecan/pull/3423 # check if PR is merged -devtools::load_all("../pecan/modules/data.land") +devtools::load_all(here::here("../pecan/modules/data.land")) ## Check available compute resources benchmarkme::get_ram() -benchmarkme::get_cpu() +ncpu <- benchmarkme::get_cpu()$no_of_cores no_cores <- parallel::detectCores(logical = FALSE) future_multi <- ifelse(future::supportsMulticore(), future::plan(future::multicore), future::plan(future::multisession)) future::plan(future_multi, - workers = 16) + workers = ncpu - 2) @@ -144,7 +142,7 @@ Phase 1 focuses on Woody Perennial Crop fields. Next, we will subset the LandIQ data to only include woody perennial crop fields. At the same time we will calculate the total percent of California Croplands that are woody perennial crop. -```{r eval=FALSE} +```{r} ca_fields <- sf::st_read("data/ca_fields.gpkg") ca_attributes <- readr::read_csv("data/ca_field_attributes.csv") @@ -188,12 +186,12 @@ sf::st_write(ca_woody_subset, "data/ca_woody_subset.gpkg", delete_layer = TRUE) ``` -### Convert Points to Polygons. +### Convert Polygons to Points. For Phase 1, we will use points to query raster data. In later phases we will evaluate the performance of polygons and how querying environmental data using polygons will affect the performance of clustering and downscaling algorithms. -```{r} +```{r polygons-to-points} woody_gpkg <- "data/ca_woody.gpkg" # use **ca_woody_subset.gpkg** for development and testing # load fields, convert polygons to points, and subset @@ -206,7 +204,7 @@ ca_woody_pts <- sf::st_read(woody_gpkg) |> ## Anchor Sites -```{r} +```{r anchor-sites} # Anchor sites from UC Davis, UC Riverside, and Ameriflux. 
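# anchor_sites.csv is expected to provide at least lon and lat (WGS84, matching
# the st_as_sf() call below) plus location, site_name, crops, and pft, which
# are carried through to data/anchor_sites_ids.csv at the end of this chunk.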
anchor_sites <- readr::read_csv("data/anchor_sites.csv") @@ -234,7 +232,7 @@ sf::st_write(anchor_sites_with_ids |> Using already prepared SoilGrids layers -```{r} +```{r load-soilgrids} soilgrids_north_america_clay_tif <- '/projectnb/dietzelab/dongchen/anchorSites/NA_runs/soil_nc/soilgrids_250m/clay/clay_0-5cm_mean/clay/clay_0-5cm_mean.tif' soilgrids_north_america_ocd_tif <- '/projectnb/dietzelab/dongchen/anchorSites/NA_runs/soil_nc/soilgrids_250m/ocd/ocd_0-5cm_mean/ocd/ocd_0-5cm_mean.tif' ## if we want to clip to CA @@ -250,13 +248,13 @@ soilgrids_north_america_ocd_rast <- terra::rast(soilgrids_north_america_ocd_tif) #### Extract clay from SoilGrids -```{r} +```{r sg-clay-ocd} clay <- (terra::extract(soilgrids_north_america_clay_rast, terra::vect(ca_woody_pts)) |> dplyr::select(-ID)) |> dplyr::pull()/ 10 -ocd <- terra::extract(soilgrids_north_america_ocd_rast, terra::vect(ca_woody_pts_clay)) |> +ocd <- terra::extract(soilgrids_north_america_ocd_rast, terra::vect(ca_woody_pts)) |> dplyr::select(-ID) |> dplyr::pull() @@ -273,7 +271,7 @@ assertthat::assert_that( ### Topographic Wetness Index -```{r} +```{r twi} twi_tiff <- '/projectnb/dietzelab/dongchen/anchorSites/downscale/TWI/TWI_resample.tiff' twi_rast <- terra::rast(twi_tiff) @@ -287,7 +285,7 @@ ca_woody_pts_clay_ocd_twi <- cbind(ca_woody_pts_clay_ocd, twi = twi) ### Cal-Adapt Climate Regions -```{r get_climregions} +```{r caladapt_climregions} ca_albers_crs <- 3310 # use California Albers project (EPSG:3310) for speed, # and so units are in meters @@ -342,7 +340,7 @@ extract_clim <- function(raster, points_sf) { select(id, temp, prec, srad, vapr) } -.tmp <- list(rasters_list) |> +.tmp <- rasters_list |> furrr::future_map_dfr( ~ extract_clim(.x, ca_woody_pts), .id = "year", @@ -355,7 +353,7 @@ clim_summaries <- .tmp |> ) |> dplyr::group_by(id) |> dplyr::summarise( - mean_temp = mean(temp), + temp = mean(temp), precip = mean(precip), srad = mean(srad), vapr = mean(vapr) @@ -380,21 +378,20 @@ crop_ids <- ca_woody_pts |> assertthat::assert_that(nrow(.all) == nrow(clim_summaries) && nrow(.all) == nrow(ca_woody_pts_clay_ocd_twi_cr), msg = "join was not 1:1 as expected") -assertthat::assert_that(ncol(.all) == ncol(ca_woody_pts_clay_ocd_twi_cr) + ncol(clim_summaries) - 1, - msg = "lost some columns in the join") glimpse(.all) skimr::skim(.all) data_for_clust_with_ids <- .all |> - dplyr::select(id, mean_temp, precip, crop_id, clay, ocd, twi, climregion_id) |> + #dplyr::select(-c(climregion_name)) |> na.omit() |> mutate(across(where(is.numeric), ~ signif(., digits = 3))) -save(data_for_clust_with_ids, file = "data/data_for_clust_with_ids.rda") +save(data_for_clust_with_ids, file = "cache/data_for_clust_with_ids.rda") ``` -# Additional Code Unused + + diff --git a/downscale/01_cluster_and_select_design_points.qmd b/downscale/01_cluster_and_select_design_points.qmd index 5179f87..5108f5a 100644 --- a/downscale/01_cluster_and_select_design_points.qmd +++ b/downscale/01_cluster_and_select_design_points.qmd @@ -1,5 +1,6 @@ --- title: "Cluster and Select Design Points" +author: "David LeBauer" --- # Overview @@ -12,8 +13,7 @@ This workflow will: ## Setup -```{r} -knitr::opts_chunk$set(echo = TRUE, message = FALSE, warning = FALSE) +```{r setup} # general utilities library(tidyverse) @@ -33,37 +33,71 @@ library(dplyr) no_cores <- parallel::detectCores(logical = FALSE) plan(multicore, workers = no_cores - 2) options(future.globals.maxSize = benchmarkme::get_ram() * 0.9) + +# load climate regions for mapping +load("data/ca_climregions.rda") +# 
environmental covariates +load("cache/data_for_clust_with_ids.rda") +if('mean_temp' %in% names(data_for_clust_with_ids)){ + data_for_clust_with_ids <- data_for_clust_with_ids |> + rename(temp = mean_temp) + PEcAn.logger::logger.warn("you should", + "change mean_temp --> temp in data_for_clust_with_ids", + "when it is created in 00-prepare.qmd and then delete", + "this conditional chunk") +} + +# set coordinate reference system +ca_albers_crs <- 3310 # California Albers EPSG + ``` ## Load Site Environmental Data Environmental data was pre-processed in the previous workflow 00-prepare.qmd. -```{r} +Below is a sumary of the covariates dataset -load("data/data_for_clust_with_ids.rda", verbose = TRUE) -``` - -Below are summary statistics of the dataset - -- id: Unique identifier for each LandIQ polygon +- id: Unique identifier for each polygon - temp: Mean Annual Temperature from ERA5 - precip: Mean Annual Precipitation from ERA5 +- srad: Solar Radiation +- vapr: Vapor pressure deficit - clay: Clay content from SoilGrids - ocd: Organic Carbon content from SoilGrids - twi: Topographic Wetness Index - crop_id: identifier for crop type, see table in crop_ids.csv - climregion_id: Climate Regions as defined by CalAdapt identifier for climate region, see table in climregion_ids.csv -```{r} -skimr::skim(data_for_clust) -``` ## Anchor Site Selection -```{r} -## Get Anchor Sites from UC Davis, UC Riverside, and Ameriflux. -woody_anchor_sites <- readr::read_csv("data/anchor_sites_ids.csv") |> +Load Anchor Sites from UC Davis, UC Riverside, and Ameriflux. + +```{r anchor-sites-selection} +anchor_sites <- readr::read_csv("data/anchor_sites.csv") +ca_woody <- sf::st_read("data/ca_woody.gpkg") |> + select(-pft) # duplicates pft column in anchor_sites + +anchor_sites_pts <- anchor_sites |> + sf::st_as_sf(coords = c("lon", "lat"), crs = 4326) |> + sf::st_join(ca_woody, join = sf::st_within) |> + dplyr::select(id, lat, lon, location, site_name, crops, pft) + +# print a nice table of anchor sites +knitr::kable(anchor_sites) +# create map of anchor sites +anchor_sites_pts |> + sf::st_transform(., crs = ca_albers_crs) |> + ggplot() + + geom_sf(data = ca_climregions, aes(fill = climregion_name), alpha = 0.25) + + labs(color = "Climate Region") + + geom_sf(aes(color = pft)) + + scale_color_brewer(palette = "Dark2") + + labs(color = "PFT") + + theme_minimal() + +woody_anchor_sites <- anchor_sites_pts |> dplyr::filter(pft == "woody perennial crop") anchorsites_for_clust <- data_for_clust_with_ids |> @@ -75,31 +109,38 @@ knitr::kable(woody_anchor_sites |> dplyr::left_join(anchorsites_for_clust, by = ### Subset LandIQ fields for clustering -K-means on the numeric columns (temp, precip, clay, possibly ignoring 'crop' -or treat 'crop' as categorical by some encoding if needed). 
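One way to "treat 'crop' as categorical by some encoding", sketched here with a toy data frame rather than the workflow objects, is to one-hot encode the factor with `model.matrix()` and bind it to the scaled numeric columns before k-means:

```{r crop-encoding-sketch, eval=FALSE}
# Toy illustration only: one-hot encode 'crop' so it can enter k-means
# alongside scaled numeric covariates.
toy <- data.frame(
  temp   = c(15.2, 16.8, 14.9, 17.5),
  precip = c(450, 900, 620, 710),
  clay   = c(18, 24, 30, 22),
  crop   = factor(c("Almonds", "Grapes", "Almonds", "Walnuts"))
)

crop_dummies <- model.matrix(~ crop - 1, data = toy)   # one 0/1 column per crop level
clust_matrix <- cbind(scale(toy[, c("temp", "precip", "clay")]), crop_dummies)

set.seed(1)
kmeans(clust_matrix, centers = 2)$cluster
```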
+The following code does: +- Read in a dataset of site environmental data +- Removes anchor sites from the dataset that will be used for clustering +- Subsample the dataset - 80GB RAM too small to cluster 100k rows +- Bind anchor sites back to the dataset -```{r} +```{r subset-for-clustering} set.seed(42) # Set seed for random number generator for reproducibility - # subsample for testing (full dataset exceeds available Resources) sample_size <- 20000 + data_for_clust <- data_for_clust_with_ids |> # remove anchor sites dplyr::filter(!id %in% anchorsites_for_clust$id) |> sample_n(sample_size - nrow(anchorsites_for_clust)) |> # row bind anchorsites_for_clust bind_rows(anchorsites_for_clust) |> - dplyr::mutate(crop_id = factor(crop_id), + dplyr::mutate(crop = factor(crop), climregion_id = factor(climregion_id)) assertthat::assert_that(nrow(data_for_clust) == sample_size) - +assertthat::assert_that('temp'%in% colnames(data_for_clust)) +skimr::skim(data_for_clust) ``` ### K-means Clustering -First, create a function to perform hierarchical k-means and find optimal clusters. +First, create a function `perform_clustering` to perform hierarchical k-means and find optimal clusters. -```{r} +K-means on the numeric columns (temp, precip, clay, possibly ignoring 'crop' +or treat 'crop' as categorical by some encoding if needed). + +```{r k-means-clustering-function} perform_clustering <- function(data) { # Select numeric variables for clustering @@ -152,20 +193,30 @@ perform_clustering <- function(data) { } ``` -```{r} -# Apply clustering function to the sampled dataset in parallel +Apply clustering function to the sampled dataset. + +```{r clustering, eval=FALSE} + data_clustered <- perform_clustering(data_for_clust) save(data_clustered, file = "cache/data_clustered.rda") ``` ### Check Clustering -```{r} -#load("cache/data_clustered.rda") +```{r check-clustering} +load("cache/data_clustered.rda") # Summarize clusters cluster_summary <- data_clustered |> group_by(cluster) |> summarise(across(where(is.numeric), mean, na.rm = TRUE)) +if('mean_temp' %in% names(cluster_summary)){ + cluster_summary <- cluster_summary |> + rename(temp = mean_temp) + PEcAn.logger::logger.warn("you should", + "change mean_temp --> temp in cluster_summary", + "when it is created upstream and then delete this", + "conditional chunk") +} # use ggplot to plot all pairwise numeric variables library(GGally) @@ -187,7 +238,9 @@ knitr::kable(cluster_summary |> round(0)) ``` -```{r} +#### Stratification by Crops and Climate Regions + +```{r check-stratification} # Check stratification of clusters by categorical factors # cols should be character, factor @@ -200,13 +253,10 @@ climregion_ids <- read_csv("data/climregion_ids.csv", climregion_id = col_factor(), climregion_name = col_character() )) -data_clustered2 <- data_clustered |> - left_join(crop_ids, by = "crop_id") |> - left_join(climregion_ids, by = "climregion_id") factor_stratification <- list( - crop_id = table(data_clustered2$cluster, data_clustered2$crop), - climregion_id = table(data_clustered2$cluster, data_clustered2$climregion_name)) + crop_id = table(data_clustered$cluster, data_clustered$crop), + climregion_id = table(data_clustered$cluster, data_clustered$climregion_name)) lapply(factor_stratification, knitr::kable) # Shut down parallel backend @@ -220,9 +270,29 @@ For phase 1b we need to supply design points for SIPNET runs. For development we For the final high resolution runs we expect to use approximately 10,000 design points. 
For woody croplands, we will start with a number proportional to the total number of sites with woody perennial pfts. -```{r} +```{r design-point-selection} # From the clustered data, remove anchor sites to avoid duplicates in design point selection. -set.seed(19871023) + +if(!exists("ca_fields")) { + ca_fields <- sf::st_read("data/ca_fields.gpkg") +} + +missing_anchor_sites <- woody_anchor_sites|> + as_tibble()|> + left_join(ca_fields, by = 'id') |> + filter(is.na(id)) |> + select(location, site_name, geometry) + +if(nrow(missing_anchor_sites) > 0){ + woody_anchor_sites <- woody_anchor_sites |> + drop_na(lat, lon) + # there is an anchor site that doesn't match the ca_fields; + # need to check on this. For now we will just remove it from the dataset. + PEcAn.logger::logger.warn("The following site(s) aren't within DWR crop fields:", + knitr::kable(missing_anchor_sites)) +} + +set.seed(2222222) design_points_ids <- data_clustered |> filter(!id %in% woody_anchor_sites$id) |> select(id) |> @@ -232,10 +302,6 @@ design_points_ids <- data_clustered |> anchor_site_ids <- woody_anchor_sites |> select(id) -if(!exists("ca_fields")) { - ca_fields <- sf::st_read("data/ca_fields.gpkg") -} - final_design_points <- bind_rows(design_points_ids, anchor_site_ids) |> left_join(ca_fields, by = "id") @@ -247,14 +313,17 @@ final_design_points |> ``` +### Design Point Map + Now some analysis of how these design points are distributed -```{r} +```{r design-point-map} # plot map of california and climregions -load("data/ca_climregions.rda") + final_design_points_clust <- final_design_points |> left_join(data_clustered, by = "id") |> - select(lat, lon, cluster) |> + select(id, lat, lon, cluster) |> + drop_na(lat, lon) |> mutate(cluster = as.factor(cluster)) |> st_as_sf(coords = c("lon", "lat"), crs = 4326) @@ -271,32 +340,34 @@ ggplot() + ``` + ## Woody Cropland Proportion Here we calculate percent of California croplands that are woody perennial crops, in order to estimate the number of design points that will be selected in the clustering step -```{r} -ca <- sf::st_read("data/ca_fields.gpkg") |> +```{r woody-proportion} +field_attributes <- read_csv("data/ca_field_attributes.csv") +ca <- ca_fields |> dplyr::select(-lat, -lon) |> - dplyr::left_join(readr::read_csv("data/ca_field_attributes.csv"), by = "id") + dplyr::left_join(field_attributes, by = "id") -system.time( +set.seed(5050) pft_area <- ca |> - dplyr::sample_n(200) |> + dplyr::sample_n(2000) |> dplyr::select(id, pft, area_ha) |> dtplyr::lazy_dt() |> dplyr::mutate(woody_indicator = ifelse(pft == "woody perennial crop", 1L, 0L)) |> dplyr::group_by(woody_indicator) |> dplyr::summarize(pft_area = sum(area_ha)) -) + # now calculate sum of pft_area and the proportion of woody perennial crops pft_area <- pft_area |> dplyr::mutate(total_area = sum(pft_area)) |> dplyr::mutate(area_pct = round(100 * pft_area / total_area)) |> select(-total_area, -pft_area) |> - dplyr::rename("Woody Crops" = woody_indicator, "Area %" = area_pct) |> + dplyr::rename("Woody Crops" = woody_indicator, "Area %" = area_pct) + +pft_area |> kableExtra::kable() ``` - -Approximately `r pft_area|> filter(`Woody Crops` == 1) |> pull(`Area %`)` of California croplands are woody perennial crops. 
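In addition to the design point map above, a quick tabulation gives a coverage check on the selected points. A sketch, assuming `final_design_points_clust` and `ca_climregions` from the earlier chunks are still in memory:

```{r design-point-coverage-sketch, eval=FALSE}
# Sketch: how many design points fall in each cluster and climate region?
table(final_design_points_clust$cluster)

final_design_points_clust |>
  sf::st_transform(sf::st_crs(ca_climregions)) |>
  sf::st_join(ca_climregions, join = sf::st_intersects) |>
  sf::st_drop_geometry() |>
  dplyr::count(climregion_name, cluster)
```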
From 4778094b3fed9bbd31cf472a9854af971f96f846 Mon Sep 17 00:00:00 2001 From: David LeBauer Date: Wed, 5 Feb 2025 04:14:32 -0500 Subject: [PATCH 06/49] Created new SIPNETWOPET model as a surrogate for design point runs in order to test downscaling --- downscale/02_design_point_simulations.qmd | 86 ++++++++++++++++++++++- 1 file changed, 84 insertions(+), 2 deletions(-) diff --git a/downscale/02_design_point_simulations.qmd b/downscale/02_design_point_simulations.qmd index 3773c51..c37d922 100644 --- a/downscale/02_design_point_simulations.qmd +++ b/downscale/02_design_point_simulations.qmd @@ -1,6 +1,6 @@ --- -title: "Untitled" -format: html +title: "Design Point Selection" +author: "David LeBauer" --- # Overview @@ -9,3 +9,85 @@ This workflow will: - Use SIPNET to simulate SOC and biomass for each design point. - generate a dataframe with site_id, lat, lon, soil carbon, biomass + +## SIPNETWOPET [surrogate model] + +Until SIPNET predictions are available, we will introduce a new model, SIPNETWOPET, the Simpler Photosynthesis and EvapoTranspiration model, WithOut Photosynthesis and EvapoTranspiration. + +```{r} +library(tidyverse) +``` + + +```{r} +# Define SIPNETWOPET function + +sipnetwopet <- function( + mean_temp, precip, clay, ocd, twi, seed = 8675.309 +) { + set.seed(seed) + # Manually scale inputs using predefined dataset statistics + # scaled = (x - mean(x)) / sd(x) + scaled_temp <- (mean_temp - 20) / 2 + scaled_precip <- (precip - 5000) / 2000 + scaled_clay <- (clay - 20) / 6 + scaled_ocd <- (ocd - 300) / 60 + scaled_twi <- (twi - 10) / 2 + + # Add stochastic variation = 10% * sd + scaled_temp <- scaled_temp * rnorm(1, 1, 0.1) + scaled_precip <- scaled_precip * rnorm(1, 1, 0.1) + scaled_clay <- scaled_clay * rnorm(1, 1, 0.1) + scaled_ocd <- scaled_ocd * rnorm(1, 1, 0.1) + scaled_twi <- scaled_twi * rnorm(1, 1, 0.1) + + # Simulate SOC with asymptotic bounds + .soc <- 80 + 15 * scaled_precip + 12 * scaled_temp + 50 * scaled_ocd + 15 * scaled_clay + 8 * scaled_twi + + rnorm(1, 0, 10) + soc <- 90 * (.soc / (100 + abs(.soc))) + rlnorm(1, meanlog = log(5), sdlog = 0.3) # Asymptotic upper and soft lower bound + + # Simulate AGB with soft lower bound constraint + .agb <- 120 + 25 * scaled_temp + 35 * scaled_precip + 10 * scaled_clay - + 8 * scaled_twi + rnorm(1, 0, 15) + agb <- 450 * (.agb / (500 + abs(.agb))) + rlnorm(1, meanlog = log(20), sdlog = 0.4) # Asymptotic upper and soft lower bound + + return(tibble::tibble(soc = soc, agb = agb)) +} + +``` + +### SIPNETWOPET Demonstration + +```{r sipnetwopet-demo} +# Example dataset +n <- 100 +set.seed(77.77) +example_sites <- tibble::tibble( + mean_temp = rnorm(n, 16, 2), + precip = rweibull(n, shape = 2, scale = 4000), + clay = 100 * rbeta(n, shape1 = 2, shape2 = 5), + ocd = rweibull(n, shape = 2, scale = 320), + twi = rweibull(n, shape = 2, scale = 15) +) + +# Apply function using rowwise mapping +example_results <- example_sites |> + dplyr::rowwise() |> + dplyr::mutate(result = list(sipnetwopet(mean_temp, precip, clay, ocd, twi))) |> + tidyr::unnest(result) + +print(example_results) +pairs(example_results) +``` + +### Design Point Covariates + +```{r} + +design_points <- read_csv('data/final_design_points.csv') +covariates <- read_csv("data/data_for_clust_with_ids.rda") |> get() + +design_point_covs <- design_points |> + left_join(covariates, by = 'id') +model_inputs <- covariates +``` \ No newline at end of file From 6ca6d6da3076f4493665ce2f307a2ecd012d0a19 Mon Sep 17 00:00:00 2001 From: David LeBauer Date: Thu, 6 Feb 2025 22:42:13 -0500 
Subject: [PATCH 07/49] update readme and fix SIPNETWOPET --- README.md | 28 +++++++ downscale/02_design_point_simulations.qmd | 95 +++++++++++------------ 2 files changed, 73 insertions(+), 50 deletions(-) diff --git a/README.md b/README.md index aff3412..bd382b7 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,31 @@ # Workflow files for CCMMF deliverables More to come here presently. Meanwhile see the READMEs for individual workflow folders. + +This workflow is divided into three steps: + +- prparation, +- execution, +- analysis + +## Data + +Data is stored in the `data/` folder. + +It includes the following files: + +Workflow Inputs +├── ca_woody.gpkg +├── anchor_sites.csv + +data/ +├── ca_climregions.rda +├── ca_convex_hull.geojson +├── ca_convex_hull_reduced.geojson +├── ca_field_attributes.csv +├── ca_fields.gpkg +├── caladapt_domain_convex_hull.geojson +├── ca_woody.gpkg +└── yolo_county_polygon_simplified.geojson +cache/ +├── data_for_clust_with_ids.rda diff --git a/downscale/02_design_point_simulations.qmd b/downscale/02_design_point_simulations.qmd index c37d922..3967553 100644 --- a/downscale/02_design_point_simulations.qmd +++ b/downscale/02_design_point_simulations.qmd @@ -5,60 +5,67 @@ author: "David LeBauer" # Overview -This workflow will: +In the future, this workflow will: - Use SIPNET to simulate SOC and biomass for each design point. -- generate a dataframe with site_id, lat, lon, soil carbon, biomass +- Generate a dataframe with site_id, lat, lon, soil carbon, biomass +- (Maybe) use SIPNETWOPET to evaluate downscaling model skill? + +Curently, we will use a surrogate model, SIPNETWOPET, to simulate SOC and biomass for each design point. ## SIPNETWOPET [surrogate model] -Until SIPNET predictions are available, we will introduce a new model, SIPNETWOPET, the Simpler Photosynthesis and EvapoTranspiration model, WithOut Photosynthesis and EvapoTranspiration. + + +## SIPNETWOPET Simulation of Design Points + +We introduce a new model, SIPNETWOPET, the "Simpler Photosynthesis and EvapoTranspiration model, WithOut Photosynthesis and EvapoTranspiration". 
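The function is sourced from `downscale/sipnetwopet.R` in the next chunk; its interface is roughly as sketched below. Note that, as currently written, the loop over `seq_along(ensemble_size)` iterates only once, so a single ensemble member is returned unless that is changed to `seq_len(ensemble_size)`.

```{r sipnetwopet-interface-sketch, eval=FALSE}
# Sketch of a single-site call; run only after source("downscale/sipnetwopet.R").
# Returns a tibble with columns ensemble_id, soc, and agb.
one_site <- sipnetwopet(
  temp = 16, precip = 800, clay = 22, ocd = 310, twi = 11,
  ensemble_size = 10
)
one_site
```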
```{r} library(tidyverse) +source("downscale/sipnetwopet.R") ``` +### Join Design Points with Covariates + ```{r} -# Define SIPNETWOPET function - -sipnetwopet <- function( - mean_temp, precip, clay, ocd, twi, seed = 8675.309 -) { - set.seed(seed) - # Manually scale inputs using predefined dataset statistics - # scaled = (x - mean(x)) / sd(x) - scaled_temp <- (mean_temp - 20) / 2 - scaled_precip <- (precip - 5000) / 2000 - scaled_clay <- (clay - 20) / 6 - scaled_ocd <- (ocd - 300) / 60 - scaled_twi <- (twi - 10) / 2 - - # Add stochastic variation = 10% * sd - scaled_temp <- scaled_temp * rnorm(1, 1, 0.1) - scaled_precip <- scaled_precip * rnorm(1, 1, 0.1) - scaled_clay <- scaled_clay * rnorm(1, 1, 0.1) - scaled_ocd <- scaled_ocd * rnorm(1, 1, 0.1) - scaled_twi <- scaled_twi * rnorm(1, 1, 0.1) - - # Simulate SOC with asymptotic bounds - .soc <- 80 + 15 * scaled_precip + 12 * scaled_temp + 50 * scaled_ocd + 15 * scaled_clay + 8 * scaled_twi + - rnorm(1, 0, 10) - soc <- 90 * (.soc / (100 + abs(.soc))) + rlnorm(1, meanlog = log(5), sdlog = 0.3) # Asymptotic upper and soft lower bound - - # Simulate AGB with soft lower bound constraint - .agb <- 120 + 25 * scaled_temp + 35 * scaled_precip + 10 * scaled_clay - - 8 * scaled_twi + rnorm(1, 0, 15) - agb <- 450 * (.agb / (500 + abs(.agb))) + rlnorm(1, meanlog = log(20), sdlog = 0.4) # Asymptotic upper and soft lower bound - - return(tibble::tibble(soc = soc, agb = agb)) -} +design_points <- read_csv('data/final_design_points.csv') +covariates <- load("data/data_for_clust_with_ids.rda") |> get() +design_point_covs <- design_points |> + left_join(sf::st_drop_geometry(covariates), by = 'id') ``` -### SIPNETWOPET Demonstration +### Run SIPNETWOPET + +```{r} +set.seed(8675.309) +design_point_results <- design_point_covs |> + dplyr::rowwise() |> + dplyr::mutate(result = list(sipnetwopet(temp, precip, clay, ocd, twi))) |> + tidyr::unnest(result) |> + dplyr::select(id, lat, lon, soc, agb, ensemble_id) + + +ensemble_data <- design_point_results |> + dplyr::group_by(ensemble_id) |> + dplyr::summarize( + SOC = list(soc), + AGB = list(agb), + .groups = "drop" + ) + +saveRDS(design_point_results, 'cache/design_point_results.rds') -```{r sipnetwopet-demo} +class(covariates) +write_csv(design_point_results, 'cache/sipnetwopet_design_point_results.csv') +``` + + +### SIPNETWOPET Example + +```{r sipnetwopet-demo, eval=FALSE} # Example dataset n <- 100 set.seed(77.77) @@ -78,16 +85,4 @@ example_results <- example_sites |> print(example_results) pairs(example_results) -``` - -### Design Point Covariates - -```{r} - -design_points <- read_csv('data/final_design_points.csv') -covariates <- read_csv("data/data_for_clust_with_ids.rda") |> get() - -design_point_covs <- design_points |> - left_join(covariates, by = 'id') -model_inputs <- covariates ``` \ No newline at end of file From 8cde4b9d91b705119b2b756d078be365a3bf962d Mon Sep 17 00:00:00 2001 From: David LeBauer Date: Thu, 6 Feb 2025 22:43:28 -0500 Subject: [PATCH 08/49] move sipnetwopet to standalone script --- downscale/sipnetwopet.R | 55 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 55 insertions(+) create mode 100644 downscale/sipnetwopet.R diff --git a/downscale/sipnetwopet.R b/downscale/sipnetwopet.R new file mode 100644 index 0000000..56ddeb5 --- /dev/null +++ b/downscale/sipnetwopet.R @@ -0,0 +1,55 @@ +#' SIPNETWOPET, the "Simpler Photosynthesis and EvapoTranspiration model, +#' WithOut Photosynthesis and EvapoTranspiration" +#' +#' This function simulates soil organic carbon (SOC) and 
aboveground +#' biomass (AGB) using the SIPNETWOPET model. It is a surrogate for +#' SIPNET, a process-based model that simulates the carbon and water. +#' It can generate ensemble predictions for SOC and AGB and has its own +#' internal stochastic model. SIPNETWOPET promises rough +#' relationships between environmental variables and SOC and AGB. +#' +#' @param temp Mean annual temperature (C) +#' @param precip Mean annual precipitation (mm) +#' @param clay Clay content (%) +#' @param ocd Organic carbon density (g/cm^3) +#' @param twi Topographic wetness index +#' @param ensemble_size Number of ensemble predictions to generate (default 10) +#' +#' +sipnetwopet <- function( + temp, precip, clay, ocd, twi, ensemble_size = 10) { + ensemble_results <- list() + for (i in seq_along(ensemble_size)) { + # Manually scale inputs using predefined dataset statistics + # scaled = (x - mean(x)) / sd(x) + scaled_temp <- (temp - 20) / 2 + scaled_precip <- (precip - 5000) / 2000 + scaled_clay <- (clay - 20) / 6 + scaled_ocd <- (ocd - 300) / 60 + scaled_twi <- (twi - 10) / 2 + + # Add stochastic variation = 10% * sd + scaled_temp <- scaled_temp * rnorm(1, 1, 0.1) + scaled_precip <- scaled_precip * rnorm(1, 1, 0.1) + scaled_clay <- scaled_clay * rnorm(1, 1, 0.1) + scaled_ocd <- scaled_ocd * rnorm(1, 1, 0.1) + scaled_twi <- scaled_twi * rnorm(1, 1, 0.1) + + # Simulate SOC with various env effects and asymptotic bounds + .soc <- 80 + 15 * scaled_precip + 12 * scaled_temp + 50 * scaled_ocd + 15 * scaled_clay + 8 * scaled_twi + + rnorm(1, 0, 10) + soc <- max(90 * (.soc / (100 + abs(.soc))), rlnorm(1, meanlog = log(50), sdlog = 0.3)) # Asymptotic upper and soft lower bound + + # Simulate AGB with various env effects and soft lower and asymptotic upper bound constraint + .agb <- 120 + 25 * scaled_temp + 35 * scaled_precip + 10 * scaled_clay - + 8 * scaled_twi + rnorm(1, 0, 15) + agb <- max(450 * (.agb / (500 + abs(.agb))), rlnorm(1, meanlog = log(20), sdlog = 0.4)) # Asymptotic upper and soft lower bound + + # Add to ensemble results + ensemble_results[[i]] <- tibble::tibble(soc = soc, agb = agb) + } + + # Combine all ensemble members into a data frame + ensemble_data <- dplyr::bind_rows(ensemble_results, .id = "ensemble_id") + return(ensemble_data) +} From 191c895245129b285d052b3ab09941fbd8e5d42b Mon Sep 17 00:00:00 2001 From: David LeBauer Date: Thu, 6 Feb 2025 22:46:11 -0500 Subject: [PATCH 09/49] converted qmd to R script --- downscale/00-prepare.R | 481 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 481 insertions(+) create mode 100644 downscale/00-prepare.R diff --git a/downscale/00-prepare.R b/downscale/00-prepare.R new file mode 100644 index 0000000..417b9dc --- /dev/null +++ b/downscale/00-prepare.R @@ -0,0 +1,481 @@ +#' --- +#' title: "Workflow Setup and Data Preparation" +#' author: "David LeBauer" +#' --- +#' +#' +#' # Overview +#' +#' - Prepare Inputs +#' - Harmonized LandIQ dataset of woody California cropland from 2016-2023 +#' - SoilGrids soil properties (clay, ?) +#' - CalAdapt climatology (mean annual temperature, mean annual precipitation) +#' - Use LandIQ to query covariates from SoilGrids and CalAdapt and create a table that includes crop type, soil properties, and climatology for each woody crop field +#' +#' ## TODO +#' +#' - Use consistent projection(s): +#' - California Albers EPSG:33110 for joins +#' - WGS84 EPSG:4326 for plotting, subsetting rasters? 
+#' - Clean up domain code +#' - Create a bunch of tables and join all at once at the end +#' - Disambiguate the use of 'ca' in object names; currently refers to both California and Cal-Adapt +#' +#' ## Install & Load PEcAn +#' +#' See https://pecanproject.github.io/documentation/develop/ +#' +## ----------------------------------------------------------------------------- + +options(repos = c( + pecanproject = 'https://pecanproject.r-universe.dev', + ropensci = 'https://ropensci.r-universe.dev', + CRAN = 'https://cloud.r-project.org')) + +# install.packages("PEcAn.all") +## Required until https://github.com/UCANR-IGIS/caladaptr/pull/3 is merged +# remotes::install_github("dlebauer/caladaptr") +library(PEcAn.all) +library(tidyverse) + +library(caladaptr) +library(sf) +library(terra) +## Required until PR 3423is merged https://github.com/PecanProject/pecan/pull/3423 +# check if PR is merged +devtools::load_all(here::here("../pecan/modules/data.land")) + +## Check available compute resources +benchmarkme::get_ram() +ncpu <- benchmarkme::get_cpu()$no_of_cores +no_cores <- parallel::detectCores(logical = FALSE) +future_multi <- ifelse(future::supportsMulticore(), + future::plan(future::multicore), + future::plan(future::multisession)) +future::plan(future_multi, + workers = ncpu - 2) + + + + +#' +#' ## Organize Input Data +#' +#' ### Domain Polygons +#' +#' Here we are generating domain polygons that will be used for subsetting. +#' These are converted to convex hulls and simplified for computational efficiency. +#' The Yolo County domain is a smaller domain that can be used for testing and debugging. +#' +#' These include: +#' +#' - caladapt_domain_convex_hull: a convex hull for the Cal-Adapt domain +#' - ca_convex_hull: a convex hull around CA +#' - ca_state_polygon_simplified: a simplified convex hull for CA +#' - ca_state_polygon: a convex hull for CA +#' +#' - ca_convex_hull_reduced: a simplified convex hull for CA +#' - yolo_bbox: a smaller domain limited to Yolo County +#' +## ----eval = FALSE------------------------------------------------------------- +# # remotes::install_github("ucanr-igis/caladaptr") +# ## Cal-Adapt Domain +# caladapt_domain <- caladaptr::ca_aoipreset_geom("counties") |> +# sf::st_transform(4326) |> +# sf::st_union() |> +# sf::st_convex_hull() +# st_write(caladapt_domain, "data/caladapt_domain_convex_hull.geojson") +# +# ## California State +# ca_counties_polygons <- ca_aoipreset_geom("counties") |> +# dplyr::filter(state_name == "California") |> +# dplyr::select(state_name, county_name = name, geom) |> +# sf::st_transform(4326) +# +# ca_state_polygon <- ca_counties_polygons |> +# group_by(state_name) |> +# mutate(geom = sf::st_union(geom)) +# +# ca_state_polygon_simplified <- sf::st_simplify(ca_state_polygon, dTolerance = 5000) +# file.remove("data/ca_state_polygon_simplified.geojson") +# sf::st_write(ca_state_polygon_simplified, "data/ca_state_polygon_simplified.geojson") +# +# ## Yolo County +# yolo_county_polygon <- ca_counties_polygons |> +# filter(county_name=='Yolo') +# +# yolo_county_polygon_simplified <- sf::st_simplify(yolo_county_polygon, dTolerance = 5000) +# sf::st_write(yolo_county_polygon_simplified, "data/yolo_county_polygon_simplified.geojson") +# yolo_county_convex_hull <- sf::st_convex_hull(yolo_county_polygon_simplified) +# # check if it is sufficiently simple to avoid unnecessary computational expensse +# # st_coordinates(yolo_county_convex_hull) +# +# ca_state_polygon <- sf::st_read("data/ca_convex_hull_reduced.geojson") +# + +#' +## 
----------------------------------------------------------------------------- +ca_state_polygon <- sf::st_read("data/ca_convex_hull_reduced.geojson") +yolo_county_polygon <- sf::st_read("data/yolo_county_polygon_simplified.geojson") + +#' +#' ### LandIQ Woody Polygons +#' +#' The first step is to convert LandIQ to a open, standard format. +#' +#' We will use a GeoPackage file to store geospatial information and an associated CSV file with attributes. +#' +#' The landiq2std function will be added to the PEcAn.data.land package, and has been implemented in a Pull Request https://github.com/PecanProject/pecan/pull/3423. The function is a work in progress. Two key work to be done. First `landiq2std` does not currently perform all steps to get from the original LandIQ format to the standard format - some steps related to harmonizing LandIQ across years have been completed manually. Second, the PEcAn 'standard' for such data is under development as we migrate from a Postgres database to a more portable GeoPackage + CSV format. +#' +## ----eval=FALSE--------------------------------------------------------------- +# input_file = 'data/i15_Crop_Mapping_2016_SHP/i15_Crop_Mapping_2016.shp' +# output_gpkg = 'data/ca_fields.gpkg' +# output_csv = 'data/ca_field_attributes.csv' +# +# landiq2std(input_file, output_gpkg, output_csv) + +#' +#' ##### Subset Woody Perennial Crop Fields +#' +#' Phase 1 focuses on Woody Perennial Crop fields. +#' +#' Next, we will subset the LandIQ data to only include woody perennial crop fields. +#' At the same time we will calculate the total percent of California Croplands that are woody perennial crop. +#' +## ----------------------------------------------------------------------------- +ca_fields <- sf::st_read("data/ca_fields.gpkg") +ca_attributes <- readr::read_csv("data/ca_field_attributes.csv") + +ca <- ca_fields |> + dplyr::select(-lat, -lon) |> + dplyr::left_join(ca_attributes, by = "id") + +ca_woody <- ca |> + dplyr::filter(pft == "woody perennial crop") +sf::st_write(ca_woody, + "data/ca_woody.gpkg", delete_layer = TRUE) + +#' +#' Now, calculate percent of California croplands that are woody perennial crops, in order to estimate the +#' number of design points that will be selected in the clustering step. +#' +#' +## ----------------------------------------------------------------------------- +system.time( +pft_area <- ca |> + dplyr::select(id, pft, area_ha) |> + dtplyr::lazy_dt() |> + dplyr::mutate(woody_indicator = ifelse(pft == "woody perennial crop", 1L, 0L)) |> + dplyr::group_by(woody_indicator) |> + dplyr::summarize(pft_area = sum(area_ha)) +) + + + +#' +#' ### Subset California Woody Crop Fields for development & testting +#' +#' Now, create a subset of the California Woody Crop Fields for development & testting +#' +## ----eval=FALSE--------------------------------------------------------------- +# set.seed(25) +# ca_woody_subset <- ca_woody |> +# dplyr::sample_n(200) +# +# sf::st_write(ca_woody_subset, +# "data/ca_woody_subset.gpkg", delete_layer = TRUE) + +#' +#' ### Convert Polygons to Points. +#' +#' For Phase 1, we will use points to query raster data. +#' In later phases we will evaluate the performance of polygons and how querying environmental data using polygons will affect the performance of clustering and downscaling algorithms. 
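#' As a point of reference for that later comparison, a polygon (zonal) summary
#' is a small change to the centroid-based extraction used below: pass the
#' polygons themselves and a summary function to terra::extract(). Sketch only,
#' using objects defined elsewhere in this script.
## ----polygon-extraction-sketch, eval=FALSE-------------------------------------
# clay_rast <- terra::rast(soilgrids_north_america_clay_tif)
# # centroid values (as used below) vs. per-polygon means
# clay_pts  <- terra::extract(clay_rast, terra::vect(sf::st_centroid(ca_woody)))
# clay_poly <- terra::extract(clay_rast, terra::vect(ca_woody), fun = mean, na.rm = TRUE)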
+#' +## ----polygons-to-points------------------------------------------------------- +woody_gpkg <- "data/ca_woody.gpkg" # use **ca_woody_subset.gpkg** for development and testing + +# load fields, convert polygons to points, and subset +ca_woody_pts <- sf::st_read(woody_gpkg) |> + sf::st_centroid() |> + # and keep only the columns we need + dplyr::select(id, crop, pft, geom) + + +#' +#' ## Anchor Sites +#' +## ----anchor-sites------------------------------------------------------------- +# Anchor sites from UC Davis, UC Riverside, and Ameriflux. +anchor_sites <- readr::read_csv("data/anchor_sites.csv") + +anchor_sites_pts <- anchor_sites |> + sf::st_as_sf(coords = c("lon", "lat"), crs = 4326) + +# Join with ca_fields: keep only the rows associated with anchor sites +# takes ~ 1 min on BU cluster w/ "Intel(R) Xeon(R) CPU E5-2670 0 @ 2.60GHz" +anchor_sites_with_ids <- anchor_sites_pts |> + # spatial join find ca_fields that contain anchor site points + sf::st_join(ca_fields, join = sf::st_within) + +sf::st_write(anchor_sites_with_ids |> + dplyr::select(id, lat, lon, location, site_name, crops, pft), + dsn = "data/anchor_sites_ids.csv", + delete_layer = TRUE) + + +#' +#' ## Environmental Covariates +#' +#' ### SoilGrids +#' +#' #### Load Prepared Soilgrids GeoTIFF +#' +#' Using already prepared SoilGrids layers +#' +## ----load-soilgrids----------------------------------------------------------- +soilgrids_north_america_clay_tif <- '/projectnb/dietzelab/dongchen/anchorSites/NA_runs/soil_nc/soilgrids_250m/clay/clay_0-5cm_mean/clay/clay_0-5cm_mean.tif' +soilgrids_north_america_ocd_tif <- '/projectnb/dietzelab/dongchen/anchorSites/NA_runs/soil_nc/soilgrids_250m/ocd/ocd_0-5cm_mean/ocd/ocd_0-5cm_mean.tif' +## if we want to clip to CA +## use terra to read in that file and clip to california +# soilgrids_california <- terra::crop(soilgrids_north_america, yolo_bbox) + +# read in the file +# read two layers +soilgrids_north_america_clay_rast <- terra::rast(soilgrids_north_america_clay_tif) +soilgrids_north_america_ocd_rast <- terra::rast(soilgrids_north_america_ocd_tif) + + +#' +#' #### Extract clay from SoilGrids +#' +## ----sg-clay-ocd-------------------------------------------------------------- + +clay <- (terra::extract(soilgrids_north_america_clay_rast, terra::vect(ca_woody_pts)) |> + dplyr::select(-ID)) |> + dplyr::pull()/ 10 + +ocd <- terra::extract(soilgrids_north_america_ocd_rast, terra::vect(ca_woody_pts)) |> + dplyr::select(-ID) |> + dplyr::pull() + +ca_woody_pts_clay_ocd <- cbind(ca_woody_pts, + clay = clay, + ocd = ocd) + +required_cols <- c("id", "crop", "pft", "clay", "ocd", "geom") +assertthat::assert_that( + all(required_cols %in% colnames(ca_woody_pts_clay_ocd)), + msg = "ca_woody_pts_clay_ocd is missing expected columns") + + +#' +#' ### Topographic Wetness Index +#' +## ----twi---------------------------------------------------------------------- +twi_tiff <- '/projectnb/dietzelab/dongchen/anchorSites/downscale/TWI/TWI_resample.tiff' +twi_rast <- terra::rast(twi_tiff) + +twi <- terra::extract(twi_rast, vect(ca_woody_pts)) |> + select(-ID) |> + dplyr::pull() + +ca_woody_pts_clay_ocd_twi <- cbind(ca_woody_pts_clay_ocd, twi = twi) + + +#' +#' ### Cal-Adapt Climate Regions +#' +## ----caladapt_climregions----------------------------------------------------- + +ca_albers_crs <- 3310 # use California Albers project (EPSG:3310) for speed, + # and so units are in meters + # required that crs(x) == crs(y) for st_join + +ca_climregions <- caladaptr::ca_aoipreset_geom("climregions") |> + 
sf::st_transform(crs = ca_albers_crs) |> + dplyr::rename(climregion_id = id, + climregion_name = name) +save(ca_climregions, file = "data/ca_climregions.rda") + +#' +## ----join_climregions--------------------------------------------------------- +ca_woody_pts_clay_ocd_twi_cr <- ca_woody_pts_clay_ocd_twi |> + sf::st_transform(., crs = ca_albers_crs) |> + sf::st_join(ca_climregions, join = st_intersects, left = TRUE) + +# convenience cache. +save(ca_woody_pts_clay_ocd_twi_cr, file = "ca_woody_pts_clay_ocd_twi_cr.rda") + +#' +#' ### GridMet +#' +## ----------------------------------------------------------------------------- +gridmet_dir <- "/projectnb/dietzelab/dongchen/anchorSites/NA_runs/GridMET/" + +# List all ERA5_met_*.tiff files for years 2012-2021 +raster_files <- list.files( + path = gridmet_dir, + pattern = "^ERA5_met_\\d{4}\\.tiff$", + full.names = TRUE +) + +# Read all rasters into a list of SpatRaster objects +rasters_list <- map( + raster_files, + ~ rast(.x)) + +years <- map_chr(rasters_list, ~ { + source_path <- terra::sources(.x)[1] + str_extract(source_path, "\\d{4}") +}) |> + as.integer() + +names(rasters_list) <- years + +extract_clim <- function(raster, points_sf) { + terra::extract(raster, points_sf) |> + tibble::as_tibble() |> + select(-ID) |> + mutate(id = points_sf$id) |> + select(id, temp, prec, srad, vapr) +} + +.tmp <- rasters_list |> + furrr::future_map_dfr( + ~ extract_clim(.x, ca_woody_pts), + .id = "year", + .options = furrr::furrr_options(seed = 123)) + + +clim_summaries <- .tmp |> + dplyr::mutate( + precip = PEcAn.utils::ud_convert(prec, "second-1", "year-1") +) |> + dplyr::group_by(id) |> + dplyr::summarise( + temp = mean(temp), + precip = mean(precip), + srad = mean(srad), + vapr = mean(vapr) + ) + +#' +#' ## Prepare Dataset for Clustering +#' +#' First, we will turn crop names into IDs to support hierarchical clustering +#' +## ----------------------------------------------------------------------------- +crop_ids <- ca_woody_pts |> + distinct(crop) |> + mutate(crop_id = as.integer(as.factor(crop))) |> + write_csv("data/crop_ids.csv") + +#' +## ----join_and_subset---------------------------------------------------------- +.all <- clim_summaries |> + dplyr::left_join(ca_woody_pts_clay_ocd_twi_cr, by = "id") |> + dplyr::left_join(crop_ids, by = "crop") + +assertthat::assert_that(nrow(.all) == nrow(clim_summaries) && nrow(.all) == nrow(ca_woody_pts_clay_ocd_twi_cr), + msg = "join was not 1:1 as expected") + +glimpse(.all) +skimr::skim(.all) + +data_for_clust_with_ids <- .all |> + #dplyr::select(-c(climregion_name)) |> + na.omit() |> + mutate(across(where(is.numeric), ~ signif(., digits = 3))) + +save(data_for_clust_with_ids, file = "cache/data_for_clust_with_ids.rda") + +#' +#' +#' From d3bb8f4e3326ebad74cb9fe04ab28e8631be2534 Mon Sep 17 00:00:00 2001 From: David LeBauer Date: Thu, 6 Feb 2025 23:05:02 -0500 Subject: [PATCH 10/49] rename 00-prepare.qmd --> 00-prepare.R --- downscale/00-prepare.R | 532 +++++++++++++++++++-------------------- downscale/00-prepare.qmd | 481 ----------------------------------- 2 files changed, 266 insertions(+), 747 deletions(-) delete mode 100644 downscale/00-prepare.qmd diff --git a/downscale/00-prepare.R b/downscale/00-prepare.R index 417b9dc..84bcaa8 100644 --- a/downscale/00-prepare.R +++ b/downscale/00-prepare.R @@ -1,31 +1,31 @@ -#' --- -#' title: "Workflow Setup and Data Preparation" -#' author: "David LeBauer" -#' --- -#' -#' -#' # Overview -#' -#' - Prepare Inputs -#' - Harmonized LandIQ dataset of woody California 
cropland from 2016-2023 -#' - SoilGrids soil properties (clay, ?) -#' - CalAdapt climatology (mean annual temperature, mean annual precipitation) -#' - Use LandIQ to query covariates from SoilGrids and CalAdapt and create a table that includes crop type, soil properties, and climatology for each woody crop field -#' -#' ## TODO -#' -#' - Use consistent projection(s): -#' - California Albers EPSG:33110 for joins -#' - WGS84 EPSG:4326 for plotting, subsetting rasters? -#' - Clean up domain code -#' - Create a bunch of tables and join all at once at the end -#' - Disambiguate the use of 'ca' in object names; currently refers to both California and Cal-Adapt -#' -#' ## Install & Load PEcAn -#' -#' See https://pecanproject.github.io/documentation/develop/ -#' -## ----------------------------------------------------------------------------- +--- +title: "Workflow Setup and Data Preparation" +author: "David LeBauer" +--- + + +# Overview + +- Prepare Inputs + - Harmonized LandIQ dataset of woody California cropland from 2016-2023 + - SoilGrids soil properties (clay, ?) + - CalAdapt climatology (mean annual temperature, mean annual precipitation) +- Use LandIQ to query covariates from SoilGrids and CalAdapt and create a table that includes crop type, soil properties, and climatology for each woody crop field + +## TODO + +- Use consistent projection(s): + - California Albers EPSG:33110 for joins + - WGS84 EPSG:4326 for plotting, subsetting rasters? +- Clean up domain code +- Create a bunch of tables and join all at once at the end +- Disambiguate the use of 'ca' in object names; currently refers to both California and Cal-Adapt + +## Install & Load PEcAn + +See https://pecanproject.github.io/documentation/develop/ + +```{r} options(repos = c( pecanproject = 'https://pecanproject.r-universe.dev', @@ -57,92 +57,92 @@ future::plan(future_multi, +``` + +## Organize Input Data + +### Domain Polygons + +Here we are generating domain polygons that will be used for subsetting. +These are converted to convex hulls and simplified for computational efficiency. +The Yolo County domain is a smaller domain that can be used for testing and debugging. + +These include: + +- caladapt_domain_convex_hull: a convex hull for the Cal-Adapt domain +- ca_convex_hull: a convex hull around CA +- ca_state_polygon_simplified: a simplified convex hull for CA +- ca_state_polygon: a convex hull for CA + +- ca_convex_hull_reduced: a simplified convex hull for CA +- yolo_bbox: a smaller domain limited to Yolo County + +```{r eval = FALSE} +# remotes::install_github("ucanr-igis/caladaptr") +## Cal-Adapt Domain +caladapt_domain <- caladaptr::ca_aoipreset_geom("counties") |> + sf::st_transform(4326) |> + sf::st_union() |> + sf::st_convex_hull() +st_write(caladapt_domain, "data/caladapt_domain_convex_hull.geojson") + +## California State +ca_counties_polygons <- ca_aoipreset_geom("counties") |> + dplyr::filter(state_name == "California") |> + dplyr::select(state_name, county_name = name, geom) |> + sf::st_transform(4326) + +ca_state_polygon <- ca_counties_polygons |> + group_by(state_name) |> + mutate(geom = sf::st_union(geom)) + +ca_state_polygon_simplified <- sf::st_simplify(ca_state_polygon, dTolerance = 5000) +file.remove("data/ca_state_polygon_simplified.geojson") +sf::st_write(ca_state_polygon_simplified, "data/ca_state_polygon_simplified.geojson") -#' -#' ## Organize Input Data -#' -#' ### Domain Polygons -#' -#' Here we are generating domain polygons that will be used for subsetting. 
-#' These are converted to convex hulls and simplified for computational efficiency. -#' The Yolo County domain is a smaller domain that can be used for testing and debugging. -#' -#' These include: -#' -#' - caladapt_domain_convex_hull: a convex hull for the Cal-Adapt domain -#' - ca_convex_hull: a convex hull around CA -#' - ca_state_polygon_simplified: a simplified convex hull for CA -#' - ca_state_polygon: a convex hull for CA -#' -#' - ca_convex_hull_reduced: a simplified convex hull for CA -#' - yolo_bbox: a smaller domain limited to Yolo County -#' -## ----eval = FALSE------------------------------------------------------------- -# # remotes::install_github("ucanr-igis/caladaptr") -# ## Cal-Adapt Domain -# caladapt_domain <- caladaptr::ca_aoipreset_geom("counties") |> -# sf::st_transform(4326) |> -# sf::st_union() |> -# sf::st_convex_hull() -# st_write(caladapt_domain, "data/caladapt_domain_convex_hull.geojson") -# -# ## California State -# ca_counties_polygons <- ca_aoipreset_geom("counties") |> -# dplyr::filter(state_name == "California") |> -# dplyr::select(state_name, county_name = name, geom) |> -# sf::st_transform(4326) -# -# ca_state_polygon <- ca_counties_polygons |> -# group_by(state_name) |> -# mutate(geom = sf::st_union(geom)) -# -# ca_state_polygon_simplified <- sf::st_simplify(ca_state_polygon, dTolerance = 5000) -# file.remove("data/ca_state_polygon_simplified.geojson") -# sf::st_write(ca_state_polygon_simplified, "data/ca_state_polygon_simplified.geojson") -# -# ## Yolo County -# yolo_county_polygon <- ca_counties_polygons |> -# filter(county_name=='Yolo') -# -# yolo_county_polygon_simplified <- sf::st_simplify(yolo_county_polygon, dTolerance = 5000) -# sf::st_write(yolo_county_polygon_simplified, "data/yolo_county_polygon_simplified.geojson") -# yolo_county_convex_hull <- sf::st_convex_hull(yolo_county_polygon_simplified) -# # check if it is sufficiently simple to avoid unnecessary computational expensse -# # st_coordinates(yolo_county_convex_hull) -# -# ca_state_polygon <- sf::st_read("data/ca_convex_hull_reduced.geojson") -# - -#' -## ----------------------------------------------------------------------------- +## Yolo County +yolo_county_polygon <- ca_counties_polygons |> + filter(county_name=='Yolo') + +yolo_county_polygon_simplified <- sf::st_simplify(yolo_county_polygon, dTolerance = 5000) +sf::st_write(yolo_county_polygon_simplified, "data/yolo_county_polygon_simplified.geojson") +yolo_county_convex_hull <- sf::st_convex_hull(yolo_county_polygon_simplified) +# check if it is sufficiently simple to avoid unnecessary computational expensse +# st_coordinates(yolo_county_convex_hull) + +ca_state_polygon <- sf::st_read("data/ca_convex_hull_reduced.geojson") + +``` + +```{r} ca_state_polygon <- sf::st_read("data/ca_convex_hull_reduced.geojson") yolo_county_polygon <- sf::st_read("data/yolo_county_polygon_simplified.geojson") +``` + +### LandIQ Woody Polygons + +The first step is to convert LandIQ to a open, standard format. + +We will use a GeoPackage file to store geospatial information and an associated CSV file with attributes. + +The landiq2std function will be added to the PEcAn.data.land package, and has been implemented in a Pull Request https://github.com/PecanProject/pecan/pull/3423. The function is a work in progress. Two key work to be done. First `landiq2std` does not currently perform all steps to get from the original LandIQ format to the standard format - some steps related to harmonizing LandIQ across years have been completed manually. 
Second, the PEcAn 'standard' for such data is under development as we migrate from a Postgres database to a more portable GeoPackage + CSV format. -#' -#' ### LandIQ Woody Polygons -#' -#' The first step is to convert LandIQ to a open, standard format. -#' -#' We will use a GeoPackage file to store geospatial information and an associated CSV file with attributes. -#' -#' The landiq2std function will be added to the PEcAn.data.land package, and has been implemented in a Pull Request https://github.com/PecanProject/pecan/pull/3423. The function is a work in progress. Two key work to be done. First `landiq2std` does not currently perform all steps to get from the original LandIQ format to the standard format - some steps related to harmonizing LandIQ across years have been completed manually. Second, the PEcAn 'standard' for such data is under development as we migrate from a Postgres database to a more portable GeoPackage + CSV format. -#' -## ----eval=FALSE--------------------------------------------------------------- -# input_file = 'data/i15_Crop_Mapping_2016_SHP/i15_Crop_Mapping_2016.shp' -# output_gpkg = 'data/ca_fields.gpkg' -# output_csv = 'data/ca_field_attributes.csv' -# -# landiq2std(input_file, output_gpkg, output_csv) - -#' -#' ##### Subset Woody Perennial Crop Fields -#' -#' Phase 1 focuses on Woody Perennial Crop fields. -#' -#' Next, we will subset the LandIQ data to only include woody perennial crop fields. -#' At the same time we will calculate the total percent of California Croplands that are woody perennial crop. -#' -## ----------------------------------------------------------------------------- +```{r eval=FALSE} +input_file = 'data/i15_Crop_Mapping_2016_SHP/i15_Crop_Mapping_2016.shp' +output_gpkg = 'data/ca_fields.gpkg' +output_csv = 'data/ca_field_attributes.csv' + +landiq2std(input_file, output_gpkg, output_csv) +``` + +##### Subset Woody Perennial Crop Fields + +Phase 1 focuses on Woody Perennial Crop fields. + +Next, we will subset the LandIQ data to only include woody perennial crop fields. +At the same time we will calculate the total percent of California Croplands that are woody perennial crop. + +```{r} ca_fields <- sf::st_read("data/ca_fields.gpkg") ca_attributes <- readr::read_csv("data/ca_field_attributes.csv") @@ -154,13 +154,13 @@ ca_woody <- ca |> dplyr::filter(pft == "woody perennial crop") sf::st_write(ca_woody, "data/ca_woody.gpkg", delete_layer = TRUE) +``` -#' -#' Now, calculate percent of California croplands that are woody perennial crops, in order to estimate the -#' number of design points that will be selected in the clustering step. -#' -#' -## ----------------------------------------------------------------------------- +Now, calculate percent of California croplands that are woody perennial crops, in order to estimate the +number of design points that will be selected in the clustering step. 
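The chunk below sums area by a woody/non-woody indicator; converting those sums to percentages is a short follow-on step, sketched here (assumes `pft_area` as produced by the next chunk):

```{r pft-area-pct-sketch, eval=FALSE}
# Sketch: express the grouped area sums as percentages of total cropland area.
pft_area |>
  tibble::as_tibble() |>
  dplyr::mutate(area_pct = round(100 * pft_area / sum(pft_area), 1))
```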
+ + +```{r} system.time( pft_area <- ca |> dplyr::select(id, pft, area_ha) |> @@ -171,27 +171,27 @@ pft_area <- ca |> ) +``` + +### Subset California Woody Crop Fields for development & testting + +Now, create a subset of the California Woody Crop Fields for development & testting -#' -#' ### Subset California Woody Crop Fields for development & testting -#' -#' Now, create a subset of the California Woody Crop Fields for development & testting -#' -## ----eval=FALSE--------------------------------------------------------------- -# set.seed(25) -# ca_woody_subset <- ca_woody |> -# dplyr::sample_n(200) -# -# sf::st_write(ca_woody_subset, -# "data/ca_woody_subset.gpkg", delete_layer = TRUE) - -#' -#' ### Convert Polygons to Points. -#' -#' For Phase 1, we will use points to query raster data. -#' In later phases we will evaluate the performance of polygons and how querying environmental data using polygons will affect the performance of clustering and downscaling algorithms. -#' -## ----polygons-to-points------------------------------------------------------- +```{r eval=FALSE} +set.seed(25) +ca_woody_subset <- ca_woody |> + dplyr::sample_n(200) + +sf::st_write(ca_woody_subset, + "data/ca_woody_subset.gpkg", delete_layer = TRUE) +``` + +### Convert Polygons to Points. + +For Phase 1, we will use points to query raster data. +In later phases we will evaluate the performance of polygons and how querying environmental data using polygons will affect the performance of clustering and downscaling algorithms. + +```{r polygons-to-points} woody_gpkg <- "data/ca_woody.gpkg" # use **ca_woody_subset.gpkg** for development and testing # load fields, convert polygons to points, and subset @@ -200,11 +200,11 @@ ca_woody_pts <- sf::st_read(woody_gpkg) |> # and keep only the columns we need dplyr::select(id, crop, pft, geom) +``` + +## Anchor Sites -#' -#' ## Anchor Sites -#' -## ----anchor-sites------------------------------------------------------------- +```{r anchor-sites} # Anchor sites from UC Davis, UC Riverside, and Ameriflux. 
anchor_sites <- readr::read_csv("data/anchor_sites.csv") @@ -222,17 +222,17 @@ sf::st_write(anchor_sites_with_ids |> dsn = "data/anchor_sites_ids.csv", delete_layer = TRUE) +``` + +## Environmental Covariates + +### SoilGrids + +#### Load Prepared Soilgrids GeoTIFF + +Using already prepared SoilGrids layers -#' -#' ## Environmental Covariates -#' -#' ### SoilGrids -#' -#' #### Load Prepared Soilgrids GeoTIFF -#' -#' Using already prepared SoilGrids layers -#' -## ----load-soilgrids----------------------------------------------------------- +```{r load-soilgrids} soilgrids_north_america_clay_tif <- '/projectnb/dietzelab/dongchen/anchorSites/NA_runs/soil_nc/soilgrids_250m/clay/clay_0-5cm_mean/clay/clay_0-5cm_mean.tif' soilgrids_north_america_ocd_tif <- '/projectnb/dietzelab/dongchen/anchorSites/NA_runs/soil_nc/soilgrids_250m/ocd/ocd_0-5cm_mean/ocd/ocd_0-5cm_mean.tif' ## if we want to clip to CA @@ -244,11 +244,11 @@ soilgrids_north_america_ocd_tif <- '/projectnb/dietzelab/dongchen/anchorSites/NA soilgrids_north_america_clay_rast <- terra::rast(soilgrids_north_america_clay_tif) soilgrids_north_america_ocd_rast <- terra::rast(soilgrids_north_america_ocd_tif) +``` -#' -#' #### Extract clay from SoilGrids -#' -## ----sg-clay-ocd-------------------------------------------------------------- +#### Extract clay from SoilGrids + +```{r sg-clay-ocd} clay <- (terra::extract(soilgrids_north_america_clay_rast, terra::vect(ca_woody_pts)) |> dplyr::select(-ID)) |> @@ -267,11 +267,11 @@ assertthat::assert_that( all(required_cols %in% colnames(ca_woody_pts_clay_ocd)), msg = "ca_woody_pts_clay_ocd is missing expected columns") +``` + +### Topographic Wetness Index -#' -#' ### Topographic Wetness Index -#' -## ----twi---------------------------------------------------------------------- +```{r twi} twi_tiff <- '/projectnb/dietzelab/dongchen/anchorSites/downscale/TWI/TWI_resample.tiff' twi_rast <- terra::rast(twi_tiff) @@ -281,11 +281,11 @@ twi <- terra::extract(twi_rast, vect(ca_woody_pts)) |> ca_woody_pts_clay_ocd_twi <- cbind(ca_woody_pts_clay_ocd, twi = twi) +``` + +### Cal-Adapt Climate Regions -#' -#' ### Cal-Adapt Climate Regions -#' -## ----caladapt_climregions----------------------------------------------------- +```{r caladapt_climregions} ca_albers_crs <- 3310 # use California Albers project (EPSG:3310) for speed, # and so units are in meters @@ -296,20 +296,20 @@ ca_climregions <- caladaptr::ca_aoipreset_geom("climregions") |> dplyr::rename(climregion_id = id, climregion_name = name) save(ca_climregions, file = "data/ca_climregions.rda") +``` -#' -## ----join_climregions--------------------------------------------------------- +```{r join_climregions} ca_woody_pts_clay_ocd_twi_cr <- ca_woody_pts_clay_ocd_twi |> sf::st_transform(., crs = ca_albers_crs) |> sf::st_join(ca_climregions, join = st_intersects, left = TRUE) # convenience cache. 
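# (written to the working directory as-is; the other intermediate cache in this
#  workflow, data_for_clust_with_ids.rda, is saved under cache/)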
save(ca_woody_pts_clay_ocd_twi_cr, file = "ca_woody_pts_clay_ocd_twi_cr.rda") +``` -#' -#' ### GridMet -#' -## ----------------------------------------------------------------------------- +### GridMet + +```{r} gridmet_dir <- "/projectnb/dietzelab/dongchen/anchorSites/NA_runs/GridMET/" # List all ERA5_met_*.tiff files for years 2012-2021 @@ -358,20 +358,20 @@ clim_summaries <- .tmp |> srad = mean(srad), vapr = mean(vapr) ) +``` + +## Prepare Dataset for Clustering -#' -#' ## Prepare Dataset for Clustering -#' -#' First, we will turn crop names into IDs to support hierarchical clustering -#' -## ----------------------------------------------------------------------------- +First, we will turn crop names into IDs to support hierarchical clustering + +```{r} crop_ids <- ca_woody_pts |> distinct(crop) |> mutate(crop_id = as.integer(as.factor(crop))) |> write_csv("data/crop_ids.csv") +``` -#' -## ----join_and_subset---------------------------------------------------------- +```{r join_and_subset} .all <- clim_summaries |> dplyr::left_join(ca_woody_pts_clay_ocd_twi_cr, by = "id") |> dplyr::left_join(crop_ids, by = "crop") @@ -388,94 +388,94 @@ data_for_clust_with_ids <- .all |> mutate(across(where(is.numeric), ~ signif(., digits = 3))) save(data_for_clust_with_ids, file = "cache/data_for_clust_with_ids.rda") +``` + + -#' -#' -#' diff --git a/downscale/00-prepare.qmd b/downscale/00-prepare.qmd deleted file mode 100644 index 84bcaa8..0000000 --- a/downscale/00-prepare.qmd +++ /dev/null @@ -1,481 +0,0 @@ ---- -title: "Workflow Setup and Data Preparation" -author: "David LeBauer" ---- - - -# Overview - -- Prepare Inputs - - Harmonized LandIQ dataset of woody California cropland from 2016-2023 - - SoilGrids soil properties (clay, ?) - - CalAdapt climatology (mean annual temperature, mean annual precipitation) -- Use LandIQ to query covariates from SoilGrids and CalAdapt and create a table that includes crop type, soil properties, and climatology for each woody crop field - -## TODO - -- Use consistent projection(s): - - California Albers EPSG:33110 for joins - - WGS84 EPSG:4326 for plotting, subsetting rasters? -- Clean up domain code -- Create a bunch of tables and join all at once at the end -- Disambiguate the use of 'ca' in object names; currently refers to both California and Cal-Adapt - -## Install & Load PEcAn - -See https://pecanproject.github.io/documentation/develop/ - -```{r} - -options(repos = c( - pecanproject = 'https://pecanproject.r-universe.dev', - ropensci = 'https://ropensci.r-universe.dev', - CRAN = 'https://cloud.r-project.org')) - -# install.packages("PEcAn.all") -## Required until https://github.com/UCANR-IGIS/caladaptr/pull/3 is merged -# remotes::install_github("dlebauer/caladaptr") -library(PEcAn.all) -library(tidyverse) - -library(caladaptr) -library(sf) -library(terra) -## Required until PR 3423is merged https://github.com/PecanProject/pecan/pull/3423 -# check if PR is merged -devtools::load_all(here::here("../pecan/modules/data.land")) - -## Check available compute resources -benchmarkme::get_ram() -ncpu <- benchmarkme::get_cpu()$no_of_cores -no_cores <- parallel::detectCores(logical = FALSE) -future_multi <- ifelse(future::supportsMulticore(), - future::plan(future::multicore), - future::plan(future::multisession)) -future::plan(future_multi, - workers = ncpu - 2) - - - -``` - -## Organize Input Data - -### Domain Polygons - -Here we are generating domain polygons that will be used for subsetting. 
-These are converted to convex hulls and simplified for computational efficiency. -The Yolo County domain is a smaller domain that can be used for testing and debugging. - -These include: - -- caladapt_domain_convex_hull: a convex hull for the Cal-Adapt domain -- ca_convex_hull: a convex hull around CA -- ca_state_polygon_simplified: a simplified convex hull for CA -- ca_state_polygon: a convex hull for CA - -- ca_convex_hull_reduced: a simplified convex hull for CA -- yolo_bbox: a smaller domain limited to Yolo County - -```{r eval = FALSE} -# remotes::install_github("ucanr-igis/caladaptr") -## Cal-Adapt Domain -caladapt_domain <- caladaptr::ca_aoipreset_geom("counties") |> - sf::st_transform(4326) |> - sf::st_union() |> - sf::st_convex_hull() -st_write(caladapt_domain, "data/caladapt_domain_convex_hull.geojson") - -## California State -ca_counties_polygons <- ca_aoipreset_geom("counties") |> - dplyr::filter(state_name == "California") |> - dplyr::select(state_name, county_name = name, geom) |> - sf::st_transform(4326) - -ca_state_polygon <- ca_counties_polygons |> - group_by(state_name) |> - mutate(geom = sf::st_union(geom)) - -ca_state_polygon_simplified <- sf::st_simplify(ca_state_polygon, dTolerance = 5000) -file.remove("data/ca_state_polygon_simplified.geojson") -sf::st_write(ca_state_polygon_simplified, "data/ca_state_polygon_simplified.geojson") - -## Yolo County -yolo_county_polygon <- ca_counties_polygons |> - filter(county_name=='Yolo') - -yolo_county_polygon_simplified <- sf::st_simplify(yolo_county_polygon, dTolerance = 5000) -sf::st_write(yolo_county_polygon_simplified, "data/yolo_county_polygon_simplified.geojson") -yolo_county_convex_hull <- sf::st_convex_hull(yolo_county_polygon_simplified) -# check if it is sufficiently simple to avoid unnecessary computational expensse -# st_coordinates(yolo_county_convex_hull) - -ca_state_polygon <- sf::st_read("data/ca_convex_hull_reduced.geojson") - -``` - -```{r} -ca_state_polygon <- sf::st_read("data/ca_convex_hull_reduced.geojson") -yolo_county_polygon <- sf::st_read("data/yolo_county_polygon_simplified.geojson") -``` - -### LandIQ Woody Polygons - -The first step is to convert LandIQ to a open, standard format. - -We will use a GeoPackage file to store geospatial information and an associated CSV file with attributes. - -The landiq2std function will be added to the PEcAn.data.land package, and has been implemented in a Pull Request https://github.com/PecanProject/pecan/pull/3423. The function is a work in progress. Two key work to be done. First `landiq2std` does not currently perform all steps to get from the original LandIQ format to the standard format - some steps related to harmonizing LandIQ across years have been completed manually. Second, the PEcAn 'standard' for such data is under development as we migrate from a Postgres database to a more portable GeoPackage + CSV format. - -```{r eval=FALSE} -input_file = 'data/i15_Crop_Mapping_2016_SHP/i15_Crop_Mapping_2016.shp' -output_gpkg = 'data/ca_fields.gpkg' -output_csv = 'data/ca_field_attributes.csv' - -landiq2std(input_file, output_gpkg, output_csv) -``` - -##### Subset Woody Perennial Crop Fields - -Phase 1 focuses on Woody Perennial Crop fields. - -Next, we will subset the LandIQ data to only include woody perennial crop fields. -At the same time we will calculate the total percent of California Croplands that are woody perennial crop. 
- -```{r} -ca_fields <- sf::st_read("data/ca_fields.gpkg") -ca_attributes <- readr::read_csv("data/ca_field_attributes.csv") - -ca <- ca_fields |> - dplyr::select(-lat, -lon) |> - dplyr::left_join(ca_attributes, by = "id") - -ca_woody <- ca |> - dplyr::filter(pft == "woody perennial crop") -sf::st_write(ca_woody, - "data/ca_woody.gpkg", delete_layer = TRUE) -``` - -Now, calculate percent of California croplands that are woody perennial crops, in order to estimate the -number of design points that will be selected in the clustering step. - - -```{r} -system.time( -pft_area <- ca |> - dplyr::select(id, pft, area_ha) |> - dtplyr::lazy_dt() |> - dplyr::mutate(woody_indicator = ifelse(pft == "woody perennial crop", 1L, 0L)) |> - dplyr::group_by(woody_indicator) |> - dplyr::summarize(pft_area = sum(area_ha)) -) - - -``` - -### Subset California Woody Crop Fields for development & testting - -Now, create a subset of the California Woody Crop Fields for development & testting - -```{r eval=FALSE} -set.seed(25) -ca_woody_subset <- ca_woody |> - dplyr::sample_n(200) - -sf::st_write(ca_woody_subset, - "data/ca_woody_subset.gpkg", delete_layer = TRUE) -``` - -### Convert Polygons to Points. - -For Phase 1, we will use points to query raster data. -In later phases we will evaluate the performance of polygons and how querying environmental data using polygons will affect the performance of clustering and downscaling algorithms. - -```{r polygons-to-points} -woody_gpkg <- "data/ca_woody.gpkg" # use **ca_woody_subset.gpkg** for development and testing - -# load fields, convert polygons to points, and subset -ca_woody_pts <- sf::st_read(woody_gpkg) |> - sf::st_centroid() |> - # and keep only the columns we need - dplyr::select(id, crop, pft, geom) - -``` - -## Anchor Sites - -```{r anchor-sites} -# Anchor sites from UC Davis, UC Riverside, and Ameriflux. 
-anchor_sites <- readr::read_csv("data/anchor_sites.csv") - -anchor_sites_pts <- anchor_sites |> - sf::st_as_sf(coords = c("lon", "lat"), crs = 4326) - -# Join with ca_fields: keep only the rows associated with anchor sites -# takes ~ 1 min on BU cluster w/ "Intel(R) Xeon(R) CPU E5-2670 0 @ 2.60GHz" -anchor_sites_with_ids <- anchor_sites_pts |> - # spatial join find ca_fields that contain anchor site points - sf::st_join(ca_fields, join = sf::st_within) - -sf::st_write(anchor_sites_with_ids |> - dplyr::select(id, lat, lon, location, site_name, crops, pft), - dsn = "data/anchor_sites_ids.csv", - delete_layer = TRUE) - -``` - -## Environmental Covariates - -### SoilGrids - -#### Load Prepared Soilgrids GeoTIFF - -Using already prepared SoilGrids layers - -```{r load-soilgrids} -soilgrids_north_america_clay_tif <- '/projectnb/dietzelab/dongchen/anchorSites/NA_runs/soil_nc/soilgrids_250m/clay/clay_0-5cm_mean/clay/clay_0-5cm_mean.tif' -soilgrids_north_america_ocd_tif <- '/projectnb/dietzelab/dongchen/anchorSites/NA_runs/soil_nc/soilgrids_250m/ocd/ocd_0-5cm_mean/ocd/ocd_0-5cm_mean.tif' -## if we want to clip to CA -## use terra to read in that file and clip to california -# soilgrids_california <- terra::crop(soilgrids_north_america, yolo_bbox) - -# read in the file -# read two layers -soilgrids_north_america_clay_rast <- terra::rast(soilgrids_north_america_clay_tif) -soilgrids_north_america_ocd_rast <- terra::rast(soilgrids_north_america_ocd_tif) - -``` - -#### Extract clay from SoilGrids - -```{r sg-clay-ocd} - -clay <- (terra::extract(soilgrids_north_america_clay_rast, terra::vect(ca_woody_pts)) |> - dplyr::select(-ID)) |> - dplyr::pull()/ 10 - -ocd <- terra::extract(soilgrids_north_america_ocd_rast, terra::vect(ca_woody_pts)) |> - dplyr::select(-ID) |> - dplyr::pull() - -ca_woody_pts_clay_ocd <- cbind(ca_woody_pts, - clay = clay, - ocd = ocd) - -required_cols <- c("id", "crop", "pft", "clay", "ocd", "geom") -assertthat::assert_that( - all(required_cols %in% colnames(ca_woody_pts_clay_ocd)), - msg = "ca_woody_pts_clay_ocd is missing expected columns") - -``` - -### Topographic Wetness Index - -```{r twi} -twi_tiff <- '/projectnb/dietzelab/dongchen/anchorSites/downscale/TWI/TWI_resample.tiff' -twi_rast <- terra::rast(twi_tiff) - -twi <- terra::extract(twi_rast, vect(ca_woody_pts)) |> - select(-ID) |> - dplyr::pull() - -ca_woody_pts_clay_ocd_twi <- cbind(ca_woody_pts_clay_ocd, twi = twi) - -``` - -### Cal-Adapt Climate Regions - -```{r caladapt_climregions} - -ca_albers_crs <- 3310 # use California Albers project (EPSG:3310) for speed, - # and so units are in meters - # required that crs(x) == crs(y) for st_join - -ca_climregions <- caladaptr::ca_aoipreset_geom("climregions") |> - sf::st_transform(crs = ca_albers_crs) |> - dplyr::rename(climregion_id = id, - climregion_name = name) -save(ca_climregions, file = "data/ca_climregions.rda") -``` - -```{r join_climregions} -ca_woody_pts_clay_ocd_twi_cr <- ca_woody_pts_clay_ocd_twi |> - sf::st_transform(., crs = ca_albers_crs) |> - sf::st_join(ca_climregions, join = st_intersects, left = TRUE) - -# convenience cache. 
-save(ca_woody_pts_clay_ocd_twi_cr, file = "ca_woody_pts_clay_ocd_twi_cr.rda") -``` - -### GridMet - -```{r} -gridmet_dir <- "/projectnb/dietzelab/dongchen/anchorSites/NA_runs/GridMET/" - -# List all ERA5_met_*.tiff files for years 2012-2021 -raster_files <- list.files( - path = gridmet_dir, - pattern = "^ERA5_met_\\d{4}\\.tiff$", - full.names = TRUE -) - -# Read all rasters into a list of SpatRaster objects -rasters_list <- map( - raster_files, - ~ rast(.x)) - -years <- map_chr(rasters_list, ~ { - source_path <- terra::sources(.x)[1] - str_extract(source_path, "\\d{4}") -}) |> - as.integer() - -names(rasters_list) <- years - -extract_clim <- function(raster, points_sf) { - terra::extract(raster, points_sf) |> - tibble::as_tibble() |> - select(-ID) |> - mutate(id = points_sf$id) |> - select(id, temp, prec, srad, vapr) -} - -.tmp <- rasters_list |> - furrr::future_map_dfr( - ~ extract_clim(.x, ca_woody_pts), - .id = "year", - .options = furrr::furrr_options(seed = 123)) - - -clim_summaries <- .tmp |> - dplyr::mutate( - precip = PEcAn.utils::ud_convert(prec, "second-1", "year-1") -) |> - dplyr::group_by(id) |> - dplyr::summarise( - temp = mean(temp), - precip = mean(precip), - srad = mean(srad), - vapr = mean(vapr) - ) -``` - -## Prepare Dataset for Clustering - -First, we will turn crop names into IDs to support hierarchical clustering - -```{r} -crop_ids <- ca_woody_pts |> - distinct(crop) |> - mutate(crop_id = as.integer(as.factor(crop))) |> - write_csv("data/crop_ids.csv") -``` - -```{r join_and_subset} -.all <- clim_summaries |> - dplyr::left_join(ca_woody_pts_clay_ocd_twi_cr, by = "id") |> - dplyr::left_join(crop_ids, by = "crop") - -assertthat::assert_that(nrow(.all) == nrow(clim_summaries) && nrow(.all) == nrow(ca_woody_pts_clay_ocd_twi_cr), - msg = "join was not 1:1 as expected") - -glimpse(.all) -skimr::skim(.all) - -data_for_clust_with_ids <- .all |> - #dplyr::select(-c(climregion_name)) |> - na.omit() |> - mutate(across(where(is.numeric), ~ signif(., digits = 3))) - -save(data_for_clust_with_ids, file = "cache/data_for_clust_with_ids.rda") -``` - - - From 2c8ddbf6c64585967bf37da8f7f87c354e7a96fd Mon Sep 17 00:00:00 2001 From: David LeBauer Date: Thu, 6 Feb 2025 23:14:22 -0500 Subject: [PATCH 11/49] convert 01 and 02 qmd to R --- ...ct_design_points.qmd => 01_cluster_and_select_design_points.R} | 0 ...design_point_simulations.qmd => 02_design_point_simulations.R} | 0 2 files changed, 0 insertions(+), 0 deletions(-) rename downscale/{01_cluster_and_select_design_points.qmd => 01_cluster_and_select_design_points.R} (100%) rename downscale/{02_design_point_simulations.qmd => 02_design_point_simulations.R} (100%) diff --git a/downscale/01_cluster_and_select_design_points.qmd b/downscale/01_cluster_and_select_design_points.R similarity index 100% rename from downscale/01_cluster_and_select_design_points.qmd rename to downscale/01_cluster_and_select_design_points.R diff --git a/downscale/02_design_point_simulations.qmd b/downscale/02_design_point_simulations.R similarity index 100% rename from downscale/02_design_point_simulations.qmd rename to downscale/02_design_point_simulations.R From 543264fbfb9dd3706b61536efca1df1caa2ab691 Mon Sep 17 00:00:00 2001 From: David LeBauer Date: Thu, 6 Feb 2025 23:18:25 -0500 Subject: [PATCH 12/49] Add downscaling and aggregation workflow for woody crop SOC stocks, purled qmd-->R --- downscale/00-prepare.R | 532 +++++++++--------- .../01_cluster_and_select_design_points.R | 196 +++---- downscale/02_design_point_simulations.R | 114 ++-- 
downscale/03_downscale_and_agregate.R | 101 ++++ 4 files changed, 522 insertions(+), 421 deletions(-) create mode 100644 downscale/03_downscale_and_agregate.R diff --git a/downscale/00-prepare.R b/downscale/00-prepare.R index 84bcaa8..417b9dc 100644 --- a/downscale/00-prepare.R +++ b/downscale/00-prepare.R @@ -1,31 +1,31 @@ ---- -title: "Workflow Setup and Data Preparation" -author: "David LeBauer" ---- - - -# Overview - -- Prepare Inputs - - Harmonized LandIQ dataset of woody California cropland from 2016-2023 - - SoilGrids soil properties (clay, ?) - - CalAdapt climatology (mean annual temperature, mean annual precipitation) -- Use LandIQ to query covariates from SoilGrids and CalAdapt and create a table that includes crop type, soil properties, and climatology for each woody crop field - -## TODO - -- Use consistent projection(s): - - California Albers EPSG:33110 for joins - - WGS84 EPSG:4326 for plotting, subsetting rasters? -- Clean up domain code -- Create a bunch of tables and join all at once at the end -- Disambiguate the use of 'ca' in object names; currently refers to both California and Cal-Adapt - -## Install & Load PEcAn - -See https://pecanproject.github.io/documentation/develop/ - -```{r} +#' --- +#' title: "Workflow Setup and Data Preparation" +#' author: "David LeBauer" +#' --- +#' +#' +#' # Overview +#' +#' - Prepare Inputs +#' - Harmonized LandIQ dataset of woody California cropland from 2016-2023 +#' - SoilGrids soil properties (clay, ?) +#' - CalAdapt climatology (mean annual temperature, mean annual precipitation) +#' - Use LandIQ to query covariates from SoilGrids and CalAdapt and create a table that includes crop type, soil properties, and climatology for each woody crop field +#' +#' ## TODO +#' +#' - Use consistent projection(s): +#' - California Albers EPSG:33110 for joins +#' - WGS84 EPSG:4326 for plotting, subsetting rasters? +#' - Clean up domain code +#' - Create a bunch of tables and join all at once at the end +#' - Disambiguate the use of 'ca' in object names; currently refers to both California and Cal-Adapt +#' +#' ## Install & Load PEcAn +#' +#' See https://pecanproject.github.io/documentation/develop/ +#' +## ----------------------------------------------------------------------------- options(repos = c( pecanproject = 'https://pecanproject.r-universe.dev', @@ -57,92 +57,92 @@ future::plan(future_multi, -``` - -## Organize Input Data - -### Domain Polygons - -Here we are generating domain polygons that will be used for subsetting. -These are converted to convex hulls and simplified for computational efficiency. -The Yolo County domain is a smaller domain that can be used for testing and debugging. 
- -These include: - -- caladapt_domain_convex_hull: a convex hull for the Cal-Adapt domain -- ca_convex_hull: a convex hull around CA -- ca_state_polygon_simplified: a simplified convex hull for CA -- ca_state_polygon: a convex hull for CA - -- ca_convex_hull_reduced: a simplified convex hull for CA -- yolo_bbox: a smaller domain limited to Yolo County - -```{r eval = FALSE} -# remotes::install_github("ucanr-igis/caladaptr") -## Cal-Adapt Domain -caladapt_domain <- caladaptr::ca_aoipreset_geom("counties") |> - sf::st_transform(4326) |> - sf::st_union() |> - sf::st_convex_hull() -st_write(caladapt_domain, "data/caladapt_domain_convex_hull.geojson") - -## California State -ca_counties_polygons <- ca_aoipreset_geom("counties") |> - dplyr::filter(state_name == "California") |> - dplyr::select(state_name, county_name = name, geom) |> - sf::st_transform(4326) - -ca_state_polygon <- ca_counties_polygons |> - group_by(state_name) |> - mutate(geom = sf::st_union(geom)) - -ca_state_polygon_simplified <- sf::st_simplify(ca_state_polygon, dTolerance = 5000) -file.remove("data/ca_state_polygon_simplified.geojson") -sf::st_write(ca_state_polygon_simplified, "data/ca_state_polygon_simplified.geojson") -## Yolo County -yolo_county_polygon <- ca_counties_polygons |> - filter(county_name=='Yolo') - -yolo_county_polygon_simplified <- sf::st_simplify(yolo_county_polygon, dTolerance = 5000) -sf::st_write(yolo_county_polygon_simplified, "data/yolo_county_polygon_simplified.geojson") -yolo_county_convex_hull <- sf::st_convex_hull(yolo_county_polygon_simplified) -# check if it is sufficiently simple to avoid unnecessary computational expensse -# st_coordinates(yolo_county_convex_hull) - -ca_state_polygon <- sf::st_read("data/ca_convex_hull_reduced.geojson") - -``` - -```{r} +#' +#' ## Organize Input Data +#' +#' ### Domain Polygons +#' +#' Here we are generating domain polygons that will be used for subsetting. +#' These are converted to convex hulls and simplified for computational efficiency. +#' The Yolo County domain is a smaller domain that can be used for testing and debugging. 
+#' +#' These include: +#' +#' - caladapt_domain_convex_hull: a convex hull for the Cal-Adapt domain +#' - ca_convex_hull: a convex hull around CA +#' - ca_state_polygon_simplified: a simplified convex hull for CA +#' - ca_state_polygon: a convex hull for CA +#' +#' - ca_convex_hull_reduced: a simplified convex hull for CA +#' - yolo_bbox: a smaller domain limited to Yolo County +#' +## ----eval = FALSE------------------------------------------------------------- +# # remotes::install_github("ucanr-igis/caladaptr") +# ## Cal-Adapt Domain +# caladapt_domain <- caladaptr::ca_aoipreset_geom("counties") |> +# sf::st_transform(4326) |> +# sf::st_union() |> +# sf::st_convex_hull() +# st_write(caladapt_domain, "data/caladapt_domain_convex_hull.geojson") +# +# ## California State +# ca_counties_polygons <- ca_aoipreset_geom("counties") |> +# dplyr::filter(state_name == "California") |> +# dplyr::select(state_name, county_name = name, geom) |> +# sf::st_transform(4326) +# +# ca_state_polygon <- ca_counties_polygons |> +# group_by(state_name) |> +# mutate(geom = sf::st_union(geom)) +# +# ca_state_polygon_simplified <- sf::st_simplify(ca_state_polygon, dTolerance = 5000) +# file.remove("data/ca_state_polygon_simplified.geojson") +# sf::st_write(ca_state_polygon_simplified, "data/ca_state_polygon_simplified.geojson") +# +# ## Yolo County +# yolo_county_polygon <- ca_counties_polygons |> +# filter(county_name=='Yolo') +# +# yolo_county_polygon_simplified <- sf::st_simplify(yolo_county_polygon, dTolerance = 5000) +# sf::st_write(yolo_county_polygon_simplified, "data/yolo_county_polygon_simplified.geojson") +# yolo_county_convex_hull <- sf::st_convex_hull(yolo_county_polygon_simplified) +# # check if it is sufficiently simple to avoid unnecessary computational expensse +# # st_coordinates(yolo_county_convex_hull) +# +# ca_state_polygon <- sf::st_read("data/ca_convex_hull_reduced.geojson") +# + +#' +## ----------------------------------------------------------------------------- ca_state_polygon <- sf::st_read("data/ca_convex_hull_reduced.geojson") yolo_county_polygon <- sf::st_read("data/yolo_county_polygon_simplified.geojson") -``` - -### LandIQ Woody Polygons - -The first step is to convert LandIQ to a open, standard format. - -We will use a GeoPackage file to store geospatial information and an associated CSV file with attributes. - -The landiq2std function will be added to the PEcAn.data.land package, and has been implemented in a Pull Request https://github.com/PecanProject/pecan/pull/3423. The function is a work in progress. Two key work to be done. First `landiq2std` does not currently perform all steps to get from the original LandIQ format to the standard format - some steps related to harmonizing LandIQ across years have been completed manually. Second, the PEcAn 'standard' for such data is under development as we migrate from a Postgres database to a more portable GeoPackage + CSV format. -```{r eval=FALSE} -input_file = 'data/i15_Crop_Mapping_2016_SHP/i15_Crop_Mapping_2016.shp' -output_gpkg = 'data/ca_fields.gpkg' -output_csv = 'data/ca_field_attributes.csv' - -landiq2std(input_file, output_gpkg, output_csv) -``` - -##### Subset Woody Perennial Crop Fields - -Phase 1 focuses on Woody Perennial Crop fields. - -Next, we will subset the LandIQ data to only include woody perennial crop fields. -At the same time we will calculate the total percent of California Croplands that are woody perennial crop. 
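# Join field geometries to their attribute table, keep only the woody perennial
# crop fields, and write them out; the area sums in the next chunk give the
# woody share used to scale the number of design points.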
- -```{r} +#' +#' ### LandIQ Woody Polygons +#' +#' The first step is to convert LandIQ to a open, standard format. +#' +#' We will use a GeoPackage file to store geospatial information and an associated CSV file with attributes. +#' +#' The landiq2std function will be added to the PEcAn.data.land package, and has been implemented in a Pull Request https://github.com/PecanProject/pecan/pull/3423. The function is a work in progress. Two key work to be done. First `landiq2std` does not currently perform all steps to get from the original LandIQ format to the standard format - some steps related to harmonizing LandIQ across years have been completed manually. Second, the PEcAn 'standard' for such data is under development as we migrate from a Postgres database to a more portable GeoPackage + CSV format. +#' +## ----eval=FALSE--------------------------------------------------------------- +# input_file = 'data/i15_Crop_Mapping_2016_SHP/i15_Crop_Mapping_2016.shp' +# output_gpkg = 'data/ca_fields.gpkg' +# output_csv = 'data/ca_field_attributes.csv' +# +# landiq2std(input_file, output_gpkg, output_csv) + +#' +#' ##### Subset Woody Perennial Crop Fields +#' +#' Phase 1 focuses on Woody Perennial Crop fields. +#' +#' Next, we will subset the LandIQ data to only include woody perennial crop fields. +#' At the same time we will calculate the total percent of California Croplands that are woody perennial crop. +#' +## ----------------------------------------------------------------------------- ca_fields <- sf::st_read("data/ca_fields.gpkg") ca_attributes <- readr::read_csv("data/ca_field_attributes.csv") @@ -154,13 +154,13 @@ ca_woody <- ca |> dplyr::filter(pft == "woody perennial crop") sf::st_write(ca_woody, "data/ca_woody.gpkg", delete_layer = TRUE) -``` -Now, calculate percent of California croplands that are woody perennial crops, in order to estimate the -number of design points that will be selected in the clustering step. - - -```{r} +#' +#' Now, calculate percent of California croplands that are woody perennial crops, in order to estimate the +#' number of design points that will be selected in the clustering step. +#' +#' +## ----------------------------------------------------------------------------- system.time( pft_area <- ca |> dplyr::select(id, pft, area_ha) |> @@ -171,27 +171,27 @@ pft_area <- ca |> ) -``` - -### Subset California Woody Crop Fields for development & testting - -Now, create a subset of the California Woody Crop Fields for development & testting -```{r eval=FALSE} -set.seed(25) -ca_woody_subset <- ca_woody |> - dplyr::sample_n(200) - -sf::st_write(ca_woody_subset, - "data/ca_woody_subset.gpkg", delete_layer = TRUE) -``` - -### Convert Polygons to Points. - -For Phase 1, we will use points to query raster data. -In later phases we will evaluate the performance of polygons and how querying environmental data using polygons will affect the performance of clustering and downscaling algorithms. - -```{r polygons-to-points} +#' +#' ### Subset California Woody Crop Fields for development & testting +#' +#' Now, create a subset of the California Woody Crop Fields for development & testting +#' +## ----eval=FALSE--------------------------------------------------------------- +# set.seed(25) +# ca_woody_subset <- ca_woody |> +# dplyr::sample_n(200) +# +# sf::st_write(ca_woody_subset, +# "data/ca_woody_subset.gpkg", delete_layer = TRUE) + +#' +#' ### Convert Polygons to Points. +#' +#' For Phase 1, we will use points to query raster data. 
+#' In later phases we will evaluate the performance of polygons and how querying environmental data using polygons will affect the performance of clustering and downscaling algorithms. +#' +## ----polygons-to-points------------------------------------------------------- woody_gpkg <- "data/ca_woody.gpkg" # use **ca_woody_subset.gpkg** for development and testing # load fields, convert polygons to points, and subset @@ -200,11 +200,11 @@ ca_woody_pts <- sf::st_read(woody_gpkg) |> # and keep only the columns we need dplyr::select(id, crop, pft, geom) -``` - -## Anchor Sites -```{r anchor-sites} +#' +#' ## Anchor Sites +#' +## ----anchor-sites------------------------------------------------------------- # Anchor sites from UC Davis, UC Riverside, and Ameriflux. anchor_sites <- readr::read_csv("data/anchor_sites.csv") @@ -222,17 +222,17 @@ sf::st_write(anchor_sites_with_ids |> dsn = "data/anchor_sites_ids.csv", delete_layer = TRUE) -``` - -## Environmental Covariates - -### SoilGrids - -#### Load Prepared Soilgrids GeoTIFF - -Using already prepared SoilGrids layers -```{r load-soilgrids} +#' +#' ## Environmental Covariates +#' +#' ### SoilGrids +#' +#' #### Load Prepared Soilgrids GeoTIFF +#' +#' Using already prepared SoilGrids layers +#' +## ----load-soilgrids----------------------------------------------------------- soilgrids_north_america_clay_tif <- '/projectnb/dietzelab/dongchen/anchorSites/NA_runs/soil_nc/soilgrids_250m/clay/clay_0-5cm_mean/clay/clay_0-5cm_mean.tif' soilgrids_north_america_ocd_tif <- '/projectnb/dietzelab/dongchen/anchorSites/NA_runs/soil_nc/soilgrids_250m/ocd/ocd_0-5cm_mean/ocd/ocd_0-5cm_mean.tif' ## if we want to clip to CA @@ -244,11 +244,11 @@ soilgrids_north_america_ocd_tif <- '/projectnb/dietzelab/dongchen/anchorSites/NA soilgrids_north_america_clay_rast <- terra::rast(soilgrids_north_america_clay_tif) soilgrids_north_america_ocd_rast <- terra::rast(soilgrids_north_america_ocd_tif) -``` -#### Extract clay from SoilGrids - -```{r sg-clay-ocd} +#' +#' #### Extract clay from SoilGrids +#' +## ----sg-clay-ocd-------------------------------------------------------------- clay <- (terra::extract(soilgrids_north_america_clay_rast, terra::vect(ca_woody_pts)) |> dplyr::select(-ID)) |> @@ -267,11 +267,11 @@ assertthat::assert_that( all(required_cols %in% colnames(ca_woody_pts_clay_ocd)), msg = "ca_woody_pts_clay_ocd is missing expected columns") -``` - -### Topographic Wetness Index -```{r twi} +#' +#' ### Topographic Wetness Index +#' +## ----twi---------------------------------------------------------------------- twi_tiff <- '/projectnb/dietzelab/dongchen/anchorSites/downscale/TWI/TWI_resample.tiff' twi_rast <- terra::rast(twi_tiff) @@ -281,11 +281,11 @@ twi <- terra::extract(twi_rast, vect(ca_woody_pts)) |> ca_woody_pts_clay_ocd_twi <- cbind(ca_woody_pts_clay_ocd, twi = twi) -``` - -### Cal-Adapt Climate Regions -```{r caladapt_climregions} +#' +#' ### Cal-Adapt Climate Regions +#' +## ----caladapt_climregions----------------------------------------------------- ca_albers_crs <- 3310 # use California Albers project (EPSG:3310) for speed, # and so units are in meters @@ -296,20 +296,20 @@ ca_climregions <- caladaptr::ca_aoipreset_geom("climregions") |> dplyr::rename(climregion_id = id, climregion_name = name) save(ca_climregions, file = "data/ca_climregions.rda") -``` -```{r join_climregions} +#' +## ----join_climregions--------------------------------------------------------- ca_woody_pts_clay_ocd_twi_cr <- ca_woody_pts_clay_ocd_twi |> sf::st_transform(., crs = 
ca_albers_crs) |> sf::st_join(ca_climregions, join = st_intersects, left = TRUE) # convenience cache. save(ca_woody_pts_clay_ocd_twi_cr, file = "ca_woody_pts_clay_ocd_twi_cr.rda") -``` -### GridMet - -```{r} +#' +#' ### GridMet +#' +## ----------------------------------------------------------------------------- gridmet_dir <- "/projectnb/dietzelab/dongchen/anchorSites/NA_runs/GridMET/" # List all ERA5_met_*.tiff files for years 2012-2021 @@ -358,20 +358,20 @@ clim_summaries <- .tmp |> srad = mean(srad), vapr = mean(vapr) ) -``` - -## Prepare Dataset for Clustering -First, we will turn crop names into IDs to support hierarchical clustering - -```{r} +#' +#' ## Prepare Dataset for Clustering +#' +#' First, we will turn crop names into IDs to support hierarchical clustering +#' +## ----------------------------------------------------------------------------- crop_ids <- ca_woody_pts |> distinct(crop) |> mutate(crop_id = as.integer(as.factor(crop))) |> write_csv("data/crop_ids.csv") -``` -```{r join_and_subset} +#' +## ----join_and_subset---------------------------------------------------------- .all <- clim_summaries |> dplyr::left_join(ca_woody_pts_clay_ocd_twi_cr, by = "id") |> dplyr::left_join(crop_ids, by = "crop") @@ -388,94 +388,94 @@ data_for_clust_with_ids <- .all |> mutate(across(where(is.numeric), ~ signif(., digits = 3))) save(data_for_clust_with_ids, file = "cache/data_for_clust_with_ids.rda") -``` - - +#' +#' +#' diff --git a/downscale/01_cluster_and_select_design_points.R b/downscale/01_cluster_and_select_design_points.R index 5108f5a..ea618d5 100644 --- a/downscale/01_cluster_and_select_design_points.R +++ b/downscale/01_cluster_and_select_design_points.R @@ -1,19 +1,19 @@ ---- -title: "Cluster and Select Design Points" -author: "David LeBauer" ---- - -# Overview - -This workflow will: - -- Read in a dataset of site environmental data -- Perform K-means clustering to identify clusters -- Select anchor sites for each cluster - -## Setup - -```{r setup} +#' --- +#' title: "Cluster and Select Design Points" +#' author: "David LeBauer" +#' --- +#' +#' # Overview +#' +#' This workflow will: +#' +#' - Read in a dataset of site environmental data +#' - Perform K-means clustering to identify clusters +#' - Select anchor sites for each cluster +#' +#' ## Setup +#' +## ----setup-------------------------------------------------------------------- # general utilities library(tidyverse) @@ -50,31 +50,31 @@ if('mean_temp' %in% names(data_for_clust_with_ids)){ # set coordinate reference system ca_albers_crs <- 3310 # California Albers EPSG -``` - -## Load Site Environmental Data - -Environmental data was pre-processed in the previous workflow 00-prepare.qmd. - -Below is a sumary of the covariates dataset -- id: Unique identifier for each polygon -- temp: Mean Annual Temperature from ERA5 -- precip: Mean Annual Precipitation from ERA5 -- srad: Solar Radiation -- vapr: Vapor pressure deficit -- clay: Clay content from SoilGrids -- ocd: Organic Carbon content from SoilGrids -- twi: Topographic Wetness Index -- crop_id: identifier for crop type, see table in crop_ids.csv -- climregion_id: Climate Regions as defined by CalAdapt identifier for climate region, see table in climregion_ids.csv - - -## Anchor Site Selection - -Load Anchor Sites from UC Davis, UC Riverside, and Ameriflux. - -```{r anchor-sites-selection} +#' +#' ## Load Site Environmental Data +#' +#' Environmental data was pre-processed in the previous workflow 00-prepare.qmd. 
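# The cached covariate table from 00-prepare can be reloaded directly; a
# minimal sketch (path as written by that workflow):
#   load("cache/data_for_clust_with_ids.rda")   # restores data_for_clust_with_ids
#   dplyr::glimpse(data_for_clust_with_ids)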
+#' +#' Below is a sumary of the covariates dataset +#' +#' - id: Unique identifier for each polygon +#' - temp: Mean Annual Temperature from ERA5 +#' - precip: Mean Annual Precipitation from ERA5 +#' - srad: Solar Radiation +#' - vapr: Vapor pressure deficit +#' - clay: Clay content from SoilGrids +#' - ocd: Organic Carbon content from SoilGrids +#' - twi: Topographic Wetness Index +#' - crop_id: identifier for crop type, see table in crop_ids.csv +#' - climregion_id: Climate Regions as defined by CalAdapt identifier for climate region, see table in climregion_ids.csv +#' +#' +#' ## Anchor Site Selection +#' +#' Load Anchor Sites from UC Davis, UC Riverside, and Ameriflux. +#' +## ----anchor-sites-selection--------------------------------------------------- anchor_sites <- readr::read_csv("data/anchor_sites.csv") ca_woody <- sf::st_read("data/ca_woody.gpkg") |> select(-pft) # duplicates pft column in anchor_sites @@ -105,17 +105,17 @@ anchorsites_for_clust <- message("Anchor sites included in final selection:") knitr::kable(woody_anchor_sites |> dplyr::left_join(anchorsites_for_clust, by = 'id')) -``` -### Subset LandIQ fields for clustering - -The following code does: -- Read in a dataset of site environmental data -- Removes anchor sites from the dataset that will be used for clustering -- Subsample the dataset - 80GB RAM too small to cluster 100k rows -- Bind anchor sites back to the dataset - -```{r subset-for-clustering} +#' +#' ### Subset LandIQ fields for clustering +#' +#' The following code does: +#' - Read in a dataset of site environmental data +#' - Removes anchor sites from the dataset that will be used for clustering +#' - Subsample the dataset - 80GB RAM too small to cluster 100k rows +#' - Bind anchor sites back to the dataset +#' +## ----subset-for-clustering---------------------------------------------------- set.seed(42) # Set seed for random number generator for reproducibility # subsample for testing (full dataset exceeds available Resources) sample_size <- 20000 @@ -131,16 +131,16 @@ data_for_clust <- data_for_clust_with_ids |> assertthat::assert_that(nrow(data_for_clust) == sample_size) assertthat::assert_that('temp'%in% colnames(data_for_clust)) skimr::skim(data_for_clust) -``` - -### K-means Clustering - -First, create a function `perform_clustering` to perform hierarchical k-means and find optimal clusters. -K-means on the numeric columns (temp, precip, clay, possibly ignoring 'crop' -or treat 'crop' as categorical by some encoding if needed). - -```{r k-means-clustering-function} +#' +#' ### K-means Clustering +#' +#' First, create a function `perform_clustering` to perform hierarchical k-means and find optimal clusters. +#' +#' K-means on the numeric columns (temp, precip, clay, possibly ignoring 'crop' +#' or treat 'crop' as categorical by some encoding if needed). +#' +## ----k-means-clustering-function---------------------------------------------- perform_clustering <- function(data) { # Select numeric variables for clustering @@ -191,19 +191,19 @@ perform_clustering <- function(data) { return(data) } -``` - -Apply clustering function to the sampled dataset. - -```{r clustering, eval=FALSE} - -data_clustered <- perform_clustering(data_for_clust) -save(data_clustered, file = "cache/data_clustered.rda") -``` -### Check Clustering - -```{r check-clustering} +#' +#' Apply clustering function to the sampled dataset. 
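# Illustrative sketch only (not the perform_clustering() definition above): the
# basic pattern is to standardize the numeric covariates and run k-means,
# scanning total within-cluster sum of squares to choose k. Column names are
# assumed to match the covariate summary above.
#   clust_vars <- c("temp", "precip", "srad", "vapr", "clay", "ocd", "twi")
#   X <- scale(as.data.frame(data_for_clust)[, clust_vars])
#   set.seed(42)
#   wss <- sapply(2:15, function(k) stats::kmeans(X, centers = k, nstart = 10)$tot.withinss)
#   plot(2:15, wss, type = "b", xlab = "k", ylab = "total within-cluster SS") # look for the elbow
#   km <- stats::kmeans(X, centers = 10, nstart = 25) # k = 10 is a placeholder
#   table(km$cluster)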
+#' +## ----clustering, eval=FALSE--------------------------------------------------- +# +# data_clustered <- perform_clustering(data_for_clust) +# save(data_clustered, file = "cache/data_clustered.rda") + +#' +#' ### Check Clustering +#' +## ----check-clustering--------------------------------------------------------- load("cache/data_clustered.rda") # Summarize clusters cluster_summary <- data_clustered |> @@ -236,11 +236,11 @@ ggplot(data = cluster_summary, aes(x = cluster)) + knitr::kable(cluster_summary |> round(0)) -``` - -#### Stratification by Crops and Climate Regions -```{r check-stratification} +#' +#' #### Stratification by Crops and Climate Regions +#' +## ----check-stratification----------------------------------------------------- # Check stratification of clusters by categorical factors # cols should be character, factor @@ -261,16 +261,16 @@ factor_stratification <- list( lapply(factor_stratification, knitr::kable) # Shut down parallel backend plan(sequential) -``` - -## Design Point Selection - -For phase 1b we need to supply design points for SIPNET runs. For development we will use 100 design points from the clustered dataset that are _not_ already anchor sites. -For the final high resolution runs we expect to use approximately 10,000 design points. -For woody croplands, we will start with a number proportional to the total number of sites with woody perennial pfts. - -```{r design-point-selection} +#' +#' ## Design Point Selection +#' +#' For phase 1b we need to supply design points for SIPNET runs. For development we will use 100 design points from the clustered dataset that are _not_ already anchor sites. +#' +#' For the final high resolution runs we expect to use approximately 10,000 design points. +#' For woody croplands, we will start with a number proportional to the total number of sites with woody perennial pfts. +#' +## ----design-point-selection--------------------------------------------------- # From the clustered data, remove anchor sites to avoid duplicates in design point selection. 
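# Illustrative sketch only (assumed, not the selection logic used here): one way
# to draw design points proportionally to cluster size after dropping anchor
# sites, assuming data_clustered carries `id` and `cluster` columns and
# n_design_points is the target (e.g. 100 for development):
#   candidates <- data_clustered |>
#     dplyr::filter(!id %in% anchorsites_for_clust$id)
#   design_sketch <- candidates |>
#     dplyr::group_by(cluster) |>
#     dplyr::slice_sample(prop = n_design_points / nrow(candidates)) |>
#     dplyr::ungroup()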
if(!exists("ca_fields")) { @@ -311,13 +311,13 @@ final_design_points |> select(id, lat, lon) |> write_csv("data/final_design_points.csv") -``` - -### Design Point Map -Now some analysis of how these design points are distributed - -```{r design-point-map} +#' +#' ### Design Point Map +#' +#' Now some analysis of how these design points are distributed +#' +## ----design-point-map--------------------------------------------------------- # plot map of california and climregions final_design_points_clust <- final_design_points |> @@ -338,14 +338,14 @@ ggplot() + -``` - -## Woody Cropland Proportion - -Here we calculate percent of California croplands that are woody perennial crops, in order to estimate the number of design points that will be selected in the clustering step - -```{r woody-proportion} +#' +#' +#' ## Woody Cropland Proportion +#' +#' Here we calculate percent of California croplands that are woody perennial crops, in order to estimate the number of design points that will be selected in the clustering step +#' +## ----woody-proportion--------------------------------------------------------- field_attributes <- read_csv("data/ca_field_attributes.csv") ca <- ca_fields |> dplyr::select(-lat, -lon) |> @@ -370,4 +370,4 @@ pft_area <- pft_area |> pft_area |> kableExtra::kable() -``` + diff --git a/downscale/02_design_point_simulations.R b/downscale/02_design_point_simulations.R index 3967553..2fdc12e 100644 --- a/downscale/02_design_point_simulations.R +++ b/downscale/02_design_point_simulations.R @@ -1,45 +1,45 @@ ---- -title: "Design Point Selection" -author: "David LeBauer" ---- - -# Overview - -In the future, this workflow will: - -- Use SIPNET to simulate SOC and biomass for each design point. -- Generate a dataframe with site_id, lat, lon, soil carbon, biomass -- (Maybe) use SIPNETWOPET to evaluate downscaling model skill? - -Curently, we will use a surrogate model, SIPNETWOPET, to simulate SOC and biomass for each design point. - -## SIPNETWOPET [surrogate model] - - - -## SIPNETWOPET Simulation of Design Points - -We introduce a new model, SIPNETWOPET, the "Simpler Photosynthesis and EvapoTranspiration model, WithOut Photosynthesis and EvapoTranspiration". - -```{r} +#' --- +#' title: "Design Point Selection" +#' author: "David LeBauer" +#' --- +#' +#' # Overview +#' +#' In the future, this workflow will: +#' +#' - Use SIPNET to simulate SOC and biomass for each design point. +#' - Generate a dataframe with site_id, lat, lon, soil carbon, biomass +#' - (Maybe) use SIPNETWOPET to evaluate downscaling model skill? +#' +#' Curently, we will use a surrogate model, SIPNETWOPET, to simulate SOC and biomass for each design point. +#' +#' ## SIPNETWOPET [surrogate model] +#' +#' +#' +#' ## SIPNETWOPET Simulation of Design Points +#' +#' We introduce a new model, SIPNETWOPET, the "Simpler Photosynthesis and EvapoTranspiration model, WithOut Photosynthesis and EvapoTranspiration". 
+#' +## ----------------------------------------------------------------------------- library(tidyverse) source("downscale/sipnetwopet.R") -``` - -### Join Design Points with Covariates - -```{r} +#' +#' +#' ### Join Design Points with Covariates +#' +## ----------------------------------------------------------------------------- design_points <- read_csv('data/final_design_points.csv') covariates <- load("data/data_for_clust_with_ids.rda") |> get() design_point_covs <- design_points |> left_join(sf::st_drop_geometry(covariates), by = 'id') -``` - -### Run SIPNETWOPET -```{r} +#' +#' ### Run SIPNETWOPET +#' +## ----------------------------------------------------------------------------- set.seed(8675.309) design_point_results <- design_point_covs |> dplyr::rowwise() |> @@ -60,29 +60,29 @@ saveRDS(design_point_results, 'cache/design_point_results.rds') class(covariates) write_csv(design_point_results, 'cache/sipnetwopet_design_point_results.csv') -``` +#' +#' +#' ### SIPNETWOPET Example +#' +## ----sipnetwopet-demo, eval=FALSE--------------------------------------------- +# # Example dataset +# n <- 100 +# set.seed(77.77) +# example_sites <- tibble::tibble( +# mean_temp = rnorm(n, 16, 2), +# precip = rweibull(n, shape = 2, scale = 4000), +# clay = 100 * rbeta(n, shape1 = 2, shape2 = 5), +# ocd = rweibull(n, shape = 2, scale = 320), +# twi = rweibull(n, shape = 2, scale = 15) +# ) +# +# # Apply function using rowwise mapping +# example_results <- example_sites |> +# dplyr::rowwise() |> +# dplyr::mutate(result = list(sipnetwopet(mean_temp, precip, clay, ocd, twi))) |> +# tidyr::unnest(result) +# +# print(example_results) +# pairs(example_results) -### SIPNETWOPET Example - -```{r sipnetwopet-demo, eval=FALSE} -# Example dataset -n <- 100 -set.seed(77.77) -example_sites <- tibble::tibble( - mean_temp = rnorm(n, 16, 2), - precip = rweibull(n, shape = 2, scale = 4000), - clay = 100 * rbeta(n, shape1 = 2, shape2 = 5), - ocd = rweibull(n, shape = 2, scale = 320), - twi = rweibull(n, shape = 2, scale = 15) -) - -# Apply function using rowwise mapping -example_results <- example_sites |> - dplyr::rowwise() |> - dplyr::mutate(result = list(sipnetwopet(mean_temp, precip, clay, ocd, twi))) |> - tidyr::unnest(result) - -print(example_results) -pairs(example_results) -``` \ No newline at end of file diff --git a/downscale/03_downscale_and_agregate.R b/downscale/03_downscale_and_agregate.R new file mode 100644 index 0000000..4e4d423 --- /dev/null +++ b/downscale/03_downscale_and_agregate.R @@ -0,0 +1,101 @@ +#' --- +#' Title: Downscale and Agregate Woody Crop SOC stocks +#' author: "David LeBauer" +#' --- +#' +## ----setup-------------------------------------------------------------------- +#remotes::install_github("dlebauer/pecan@ensemble_downscaling", subdir = "modules/assim.sequential", ref = "da96331") +library(tidyverse) +library(sf) +library(terra) + +library(PEcAnAssimSequential) + +#' +#' # Overview +#' +#' This workflow will: +#' +#' - Use environmental covariates to predict SIPNET estimated SOC for each woody crop field in the LandIQ dataset +#' - Uses Random Forest [maybe change to CNN later] trained on site-scale model runs. +#' - Build a model for each ensemble member +#' - Write out a table with predicted biomass and SOC to maintain ensemble structure, ensuring correct error propagation and spatial covariance. 
+#' - Aggregate County-level biomass and SOC inventories +#' +#' ## Get Site Level Outputs +#' +#' Here we we read in a table with site-level model outputs generated by SIPNETWOPET in 02_anchorsite_simulations.qmd. +#' +#' TODO: +#' - replace SIPNETWOPET outputs with SIPNET results +#' +## ----------------------------------------------------------------------------- +sipnetwopet_output <- read_csv("cache/sipnetwopet_design_point_results.csv") + +ensemble_data <- readRDS("cache/ensemble_data.rds") + + + + +#' +#' ### Random Forest using PEcAn downscale workflow +#' +#' +## ----------------------------------------------------------------------------- +site_attributes <- readr::read_csv("data/ca_field_attributes.csv") |> + filter(id %in% site_ids) + + +covariates <- load("data/data_for_clust_with_ids.rda") |> + get() # maybe there is a less convoluted way; maybe I just need to rename data_for_clust... + +covariates_points <- design_points |> left_join(covariates, by = "id") +covariates_vect <- covariates_points |> + vect(geom = c("lon", "lat"), crs = "EPSG:4326") + +raster::stack( + +) +downscale_output <- SDA_downscale( + preprocessed = preprocessed, + date = "2020-01-01", + carbon_pool = "SOC", + covariates = covariates_stack, # replace with your covariates SpatRaster stack + model_type = "rf", # or "cnn" for Convolutional Neural Network + seed = 123 +) + +metrics <- SDA_downscale_metrics(downscale_output, carbon_pool = "SOC") +print(metrics) + +#' +#' +#' ## Aggregate to County Level +#' +## ----------------------------------------------------------------------------- +library(sf) +library(dplyr) + +# Load CA county boundaries +# These are provided by Cal-Adapt as 'Areas of Interest' +county_boundaries <- st_read("data/counties.gpkg") + +# check if attributes has county name +# Append county name to predicted table +grid_with_counties <- st_join(ca_grid, county_boundaries, join = st_intersects) + +# Calculate county-level mean, median, and standard deviation. +county_aggregates <- grid_with_counties |> + st_drop_geometry() |> # drop geometry for faster summarization + group_by(county_name) |> # replace with your actual county identifier + summarize( + mean_biomass = mean(predicted_biomass, na.rm = TRUE), + median_biomass = median(predicted_biomass, na.rm = TRUE), + sd_biomass = sd(predicted_biomass, na.rm = TRUE) + ) + +print(county_aggregates) + +# For state-level, do the same but don't group_by county + +#' ```` From c4cd51d3bdb90e5bbbc9132afe4290730ae898d0 Mon Sep 17 00:00:00 2001 From: David LeBauer Date: Wed, 12 Feb 2025 00:22:05 -0500 Subject: [PATCH 13/49] Refactor SIPNETWOPET workflow: streamline data processing and still fiddling with downscaling --- downscale/02_design_point_simulations.R | 101 ++++++++++-------------- downscale/03_downscale_and_agregate.R | 100 +++++++++++++---------- downscale/sipnetwopet.R | 25 +++--- 3 files changed, 112 insertions(+), 114 deletions(-) diff --git a/downscale/02_design_point_simulations.R b/downscale/02_design_point_simulations.R index 2fdc12e..8f82314 100644 --- a/downscale/02_design_point_simulations.R +++ b/downscale/02_design_point_simulations.R @@ -2,87 +2,66 @@ #' title: "Design Point Selection" #' author: "David LeBauer" #' --- -#' +#' #' # Overview -#' +#' #' In the future, this workflow will: -#' +#' #' - Use SIPNET to simulate SOC and biomass for each design point. #' - Generate a dataframe with site_id, lat, lon, soil carbon, biomass #' - (Maybe) use SIPNETWOPET to evaluate downscaling model skill? 
-#' +#' #' Curently, we will use a surrogate model, SIPNETWOPET, to simulate SOC and biomass for each design point. -#' -#' ## SIPNETWOPET [surrogate model] -#' -#' -#' +#' +#' ## SIPNETWOPET [surrogate model] +#' +#' +#' #' ## SIPNETWOPET Simulation of Design Points -#' +#' #' We introduce a new model, SIPNETWOPET, the "Simpler Photosynthesis and EvapoTranspiration model, WithOut Photosynthesis and EvapoTranspiration". -#' +#' ## ----------------------------------------------------------------------------- library(tidyverse) source("downscale/sipnetwopet.R") -#' -#' +#' +#' #' ### Join Design Points with Covariates -#' +#' ## ----------------------------------------------------------------------------- -design_points <- read_csv('data/final_design_points.csv') +design_points <- read_csv("data/final_design_points.csv") covariates <- load("data/data_for_clust_with_ids.rda") |> get() -design_point_covs <- design_points |> - left_join(sf::st_drop_geometry(covariates), by = 'id') +# Remove duplicate entries using 'id' +covariates_df <- sf::st_drop_geometry(covariates) -#' + +design_point_covs <- design_points |> + left_join(covariates_df, by = "id") + +#' #' ### Run SIPNETWOPET -#' +#' ## ----------------------------------------------------------------------------- set.seed(8675.309) -design_point_results <- design_point_covs |> - dplyr::rowwise() |> - dplyr::mutate(result = list(sipnetwopet(temp, precip, clay, ocd, twi))) |> - tidyr::unnest(result) |> - dplyr::select(id, lat, lon, soc, agb, ensemble_id) - - -ensemble_data <- design_point_results |> - dplyr::group_by(ensemble_id) |> - dplyr::summarize( - SOC = list(soc), - AGB = list(agb), - .groups = "drop" - ) - -saveRDS(design_point_results, 'cache/design_point_results.rds') +design_point_results <- design_point_covs |> + dplyr::rowwise() |> + dplyr::mutate(result = list(sipnetwopet(temp, precip, clay, ocd, twi))) |> + tidyr::unnest(result) |> + dplyr::select(id, ensemble_id, SOC = soc, AGB = agb) -class(covariates) -write_csv(design_point_results, 'cache/sipnetwopet_design_point_results.csv') +# Transform long to wide format, unwrapping SOC list to numeric value +design_point_wide <- design_point_results |> + tidyr::pivot_wider( + id_cols = id, + names_from = ensemble_id, + values_from = SOC, + names_prefix = "ensemble" + ) |> + dplyr::mutate(across(starts_with("ensemble"), ~ unlist(.))) -#' -#' -#' ### SIPNETWOPET Example -#' -## ----sipnetwopet-demo, eval=FALSE--------------------------------------------- -# # Example dataset -# n <- 100 -# set.seed(77.77) -# example_sites <- tibble::tibble( -# mean_temp = rnorm(n, 16, 2), -# precip = rweibull(n, shape = 2, scale = 4000), -# clay = 100 * rbeta(n, shape1 = 2, shape2 = 5), -# ocd = rweibull(n, shape = 2, scale = 320), -# twi = rweibull(n, shape = 2, scale = 15) -# ) -# -# # Apply function using rowwise mapping -# example_results <- example_sites |> -# dplyr::rowwise() |> -# dplyr::mutate(result = list(sipnetwopet(mean_temp, precip, clay, ocd, twi))) |> -# tidyr::unnest(result) -# -# print(example_results) -# pairs(example_results) +# Save ensemble_data as a list with date naming, matching the expected shape +ensemble_data <- list("2020-01-01" = design_point_wide) +saveRDS(ensemble_data, "cache/ensemble_data.rds") diff --git a/downscale/03_downscale_and_agregate.R b/downscale/03_downscale_and_agregate.R index 4e4d423..833fa16 100644 --- a/downscale/03_downscale_and_agregate.R +++ b/downscale/03_downscale_and_agregate.R @@ -2,76 +2,90 @@ #' Title: Downscale and Agregate Woody Crop SOC stocks 
#' author: "David LeBauer" #' --- -#' -## ----setup-------------------------------------------------------------------- -#remotes::install_github("dlebauer/pecan@ensemble_downscaling", subdir = "modules/assim.sequential", ref = "da96331") -library(tidyverse) -library(sf) -library(terra) - -library(PEcAnAssimSequential) +#' -#' +#' #' # Overview -#' +#' #' This workflow will: -#' +#' #' - Use environmental covariates to predict SIPNET estimated SOC for each woody crop field in the LandIQ dataset #' - Uses Random Forest [maybe change to CNN later] trained on site-scale model runs. #' - Build a model for each ensemble member #' - Write out a table with predicted biomass and SOC to maintain ensemble structure, ensuring correct error propagation and spatial covariance. #' - Aggregate County-level biomass and SOC inventories -#' +#' +## ----setup-------------------------------------------------------------------- +# remotes::install_github("dlebauer/pecan@ensemble_downscaling", subdir = "modules/assim.sequential", ref = "da96331") +library(tidyverse) +library(sf) +library(terra) +devtools::load_all(here::here("../pecan/modules/assim.sequential/")) +# library(PEcAnAssimSequential) + +options(readr.show_col_types = FALSE) + #' ## Get Site Level Outputs -#' -#' Here we we read in a table with site-level model outputs generated by SIPNETWOPET in 02_anchorsite_simulations.qmd. -#' +#' +#' Next we read in site-level model outputs generated by SIPNETWOPET in 02_anchorsite_simulations.R. +#' #' TODO: -#' - replace SIPNETWOPET outputs with SIPNET results -#' +#' - replace SIPNETWOPET outputs with SIPNET results following regional runs +#' ## ----------------------------------------------------------------------------- -sipnetwopet_output <- read_csv("cache/sipnetwopet_design_point_results.csv") ensemble_data <- readRDS("cache/ensemble_data.rds") - - - -#' #' ### Random Forest using PEcAn downscale workflow -#' -#' +#' +#' ## ----------------------------------------------------------------------------- -site_attributes <- readr::read_csv("data/ca_field_attributes.csv") |> - filter(id %in% site_ids) - +site_coords <- readr::read_csv("data/ca_field_attributes.csv") |> + filter(pft == "woody perennial crop") |> + select(id, lon, lat, year) covariates <- load("data/data_for_clust_with_ids.rda") |> get() # maybe there is a less convoluted way; maybe I just need to rename data_for_clust... 
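# One less convoluted option (a sketch): load the .rda into a scratch
# environment and return its single object by name.
#   load_rda <- function(path) {
#     e <- new.env()
#     object_name <- load(path, envir = e)
#     get(object_name, envir = e)
#   }
#   covariates <- load_rda("data/data_for_clust_with_ids.rda")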
-covariates_points <- design_points |> left_join(covariates, by = "id") -covariates_vect <- covariates_points |> - vect(geom = c("lon", "lat"), crs = "EPSG:4326") +covariates_points <- site_coords |> + left_join(covariates, by = "id") +covariates_sf <- covariates_points |> + sf::st_as_sf(coords = c("lon", "lat"), crs = "EPSG:4326") + +# Preprocess using renamed objects and updated ensemble data with date +preprocessed <- SDA_downscale_preprocess( + ensemble_data = ensemble_data, + site_coords = site_coords, + date = "2020-01-01", + carbon_pool = "SOC" +) -raster::stack( +####################### Start Here ############################ +## Next steps +debugonce(SDA_downscale) +## Stop at randomForest and see what is expected vs what is provided +## Error in model.frame.default(formula = formula, data = train_data, na.action = function (object, : +## invalid type (list) for variable 'ensemble1' -) + +# Downscale the data downscale_output <- SDA_downscale( preprocessed = preprocessed, - date = "2020-01-01", - carbon_pool = "SOC", - covariates = covariates_stack, # replace with your covariates SpatRaster stack - model_type = "rf", # or "cnn" for Convolutional Neural Network - seed = 123 + carbon_pool = "SOC", + covariates = covariates_vect, + model_type = "rf", + seed = 123 ) + + metrics <- SDA_downscale_metrics(downscale_output, carbon_pool = "SOC") print(metrics) -#' -#' +#' +#' #' ## Aggregate to County Level -#' +#' ## ----------------------------------------------------------------------------- library(sf) library(dplyr) @@ -85,9 +99,9 @@ county_boundaries <- st_read("data/counties.gpkg") grid_with_counties <- st_join(ca_grid, county_boundaries, join = st_intersects) # Calculate county-level mean, median, and standard deviation. -county_aggregates <- grid_with_counties |> - st_drop_geometry() |> # drop geometry for faster summarization - group_by(county_name) |> # replace with your actual county identifier +county_aggregates <- grid_with_counties |> + st_drop_geometry() |> # drop geometry for faster summarization + group_by(county_name) |> # replace with your actual county identifier summarize( mean_biomass = mean(predicted_biomass, na.rm = TRUE), median_biomass = median(predicted_biomass, na.rm = TRUE), diff --git a/downscale/sipnetwopet.R b/downscale/sipnetwopet.R index 56ddeb5..64cbc9a 100644 --- a/downscale/sipnetwopet.R +++ b/downscale/sipnetwopet.R @@ -8,48 +8,53 @@ #' internal stochastic model. SIPNETWOPET promises rough #' relationships between environmental variables and SOC and AGB. #' -#' @param temp Mean annual temperature (C) +#' @param temp Mean annual temperature (degrees Celsius) #' @param precip Mean annual precipitation (mm) #' @param clay Clay content (%) #' @param ocd Organic carbon density (g/cm^3) #' @param twi Topographic wetness index #' @param ensemble_size Number of ensemble predictions to generate (default 10) #' +#' @return A tibble with columns \code{ensemble_id}, \code{soc}, and \code{agb} +#' for each ensemble member. Unspecified units. 
+#' +#' @author David LeBauer +#' +#' @examples +#' @examples +#' sipnetwopet(temp = 22, precip = 4500, clay = 25, ocd = 0.3, twi = 9, ensemble_size = 5) #' sipnetwopet <- function( temp, precip, clay, ocd, twi, ensemble_size = 10) { ensemble_results <- list() - for (i in seq_along(ensemble_size)) { + for (i in seq_len(ensemble_size)) { # Manually scale inputs using predefined dataset statistics - # scaled = (x - mean(x)) / sd(x) scaled_temp <- (temp - 20) / 2 scaled_precip <- (precip - 5000) / 2000 scaled_clay <- (clay - 20) / 6 scaled_ocd <- (ocd - 300) / 60 scaled_twi <- (twi - 10) / 2 - # Add stochastic variation = 10% * sd + # Add stochastic variation scaled_temp <- scaled_temp * rnorm(1, 1, 0.1) scaled_precip <- scaled_precip * rnorm(1, 1, 0.1) scaled_clay <- scaled_clay * rnorm(1, 1, 0.1) scaled_ocd <- scaled_ocd * rnorm(1, 1, 0.1) scaled_twi <- scaled_twi * rnorm(1, 1, 0.1) - # Simulate SOC with various env effects and asymptotic bounds + # Simulate SOC with environmental effects and bounds .soc <- 80 + 15 * scaled_precip + 12 * scaled_temp + 50 * scaled_ocd + 15 * scaled_clay + 8 * scaled_twi + rnorm(1, 0, 10) - soc <- max(90 * (.soc / (100 + abs(.soc))), rlnorm(1, meanlog = log(50), sdlog = 0.3)) # Asymptotic upper and soft lower bound + soc <- max(90 * (.soc / (100 + abs(.soc))), rlnorm(1, meanlog = log(50), sdlog = 0.3)) - # Simulate AGB with various env effects and soft lower and asymptotic upper bound constraint + # Simulate AGB with environmental effects and bounds .agb <- 120 + 25 * scaled_temp + 35 * scaled_precip + 10 * scaled_clay - 8 * scaled_twi + rnorm(1, 0, 15) - agb <- max(450 * (.agb / (500 + abs(.agb))), rlnorm(1, meanlog = log(20), sdlog = 0.4)) # Asymptotic upper and soft lower bound + agb <- max(450 * (.agb / (500 + abs(.agb))), rlnorm(1, meanlog = log(20), sdlog = 0.4)) - # Add to ensemble results ensemble_results[[i]] <- tibble::tibble(soc = soc, agb = agb) } - # Combine all ensemble members into a data frame ensemble_data <- dplyr::bind_rows(ensemble_results, .id = "ensemble_id") return(ensemble_data) } From bf672b0c511d0055ad68e62479cf73e0acf5bf74 Mon Sep 17 00:00:00 2001 From: David LeBauer Date: Tue, 18 Feb 2025 22:48:06 -0500 Subject: [PATCH 14/49] Initial draft w/ targets package --- _targets.R | 42 +++++++++++++++++++ downscale/00-prepare.R | 8 +++- .../01_cluster_and_select_design_points.R | 2 + downscale/02_design_point_simulations.R | 2 + downscale/03_downscale_and_agregate.R | 6 ++- 5 files changed, 58 insertions(+), 2 deletions(-) create mode 100644 _targets.R diff --git a/_targets.R b/_targets.R new file mode 100644 index 0000000..9082ca4 --- /dev/null +++ b/_targets.R @@ -0,0 +1,42 @@ +library(targets) +library(tarchetypes) + +tar_option_set( + packages = c( + "tidyverse", "dplyr", "sf", "terra", + "randomForest", "keras3", "PEcAn.all", "caladaptr" + ) +) + +list( + tar_target(prepare_data, { + source("downscale/00-prepare.R") + data_for_clust_with_ids # output from 00-prepare.R + }), + tar_target(cluster_sites, + { + source("downscale/01_cluster_and_select_design_points.R") + cluster_output # output from 01_cluster_and_select_design_points.R + }, + deps = prepare_data + ), + tar_target(simulations, + { + source("downscale/02_design_point_simulations.R") + design_point_wide # output from 02-design_point_simulations.R + }, + deps = cluster_sites + ), + tar_target(downscale, + { + source("downscale/03_downscale_and_agregate.R") + ensemble_data # output from 03_downscale_and_agregate.R + }, + deps = simulations + ), + tar_quarto( + analysis_report, 
+ path = "04-analysis.qmd", + deps = list(simulations, downscale) + ) +) diff --git a/downscale/00-prepare.R b/downscale/00-prepare.R index 417b9dc..ca73bdd 100644 --- a/downscale/00-prepare.R +++ b/downscale/00-prepare.R @@ -389,6 +389,12 @@ data_for_clust_with_ids <- .all |> save(data_for_clust_with_ids, file = "cache/data_for_clust_with_ids.rda") +# Final output for targets; if not in targets, suppress return +if (exists("IN_TARGETS") && IN_TARGETS) { + data_for_clust_with_ids +} else { + invisible(data_for_clust_with_ids) +} #' #' -#' +#' diff --git a/downscale/01_cluster_and_select_design_points.R b/downscale/01_cluster_and_select_design_points.R index ea618d5..7ebed98 100644 --- a/downscale/01_cluster_and_select_design_points.R +++ b/downscale/01_cluster_and_select_design_points.R @@ -370,4 +370,6 @@ pft_area <- pft_area |> pft_area |> kableExtra::kable() +cluster_output # final output from clustering and design point selection + diff --git a/downscale/02_design_point_simulations.R b/downscale/02_design_point_simulations.R index 8f82314..1137727 100644 --- a/downscale/02_design_point_simulations.R +++ b/downscale/02_design_point_simulations.R @@ -65,3 +65,5 @@ design_point_wide <- design_point_results |> ensemble_data <- list("2020-01-01" = design_point_wide) saveRDS(ensemble_data, "cache/ensemble_data.rds") + +design_point_wide # final simulated design point output diff --git a/downscale/03_downscale_and_agregate.R b/downscale/03_downscale_and_agregate.R index 833fa16..b14dc51 100644 --- a/downscale/03_downscale_and_agregate.R +++ b/downscale/03_downscale_and_agregate.R @@ -53,6 +53,9 @@ covariates_sf <- covariates_points |> sf::st_as_sf(coords = c("lon", "lat"), crs = "EPSG:4326") # Preprocess using renamed objects and updated ensemble data with date + +####################### Start Here ############################ +# Find example inputs: `find "/projectnb/dietzelab/jploshay/" -name "*.rds"` preprocessed <- SDA_downscale_preprocess( ensemble_data = ensemble_data, site_coords = site_coords, @@ -60,7 +63,7 @@ preprocessed <- SDA_downscale_preprocess( carbon_pool = "SOC" ) -####################### Start Here ############################ + ## Next steps debugonce(SDA_downscale) ## Stop at randomForest and see what is expected vs what is provided @@ -82,6 +85,7 @@ downscale_output <- SDA_downscale( metrics <- SDA_downscale_metrics(downscale_output, carbon_pool = "SOC") print(metrics) + #' #' #' ## Aggregate to County Level From 968f153e1f33997aae32825ee7103c646b86ad1f Mon Sep 17 00:00:00 2001 From: David LeBauer Date: Tue, 18 Feb 2025 22:48:06 -0500 Subject: [PATCH 15/49] Initial draft w/ targets package --- _targets.R | 42 ++++++++ data/anchor_sites_ids.csv | 26 +++++ data/design_points.csv | 101 ++++++++++++++++++ downscale/00-prepare.R | 8 +- .../01_cluster_and_select_design_points.R | 50 ++++++--- downscale/02_design_point_simulations.R | 2 + downscale/03_downscale_and_agregate.R | 6 +- 7 files changed, 218 insertions(+), 17 deletions(-) create mode 100644 _targets.R create mode 100644 data/anchor_sites_ids.csv create mode 100644 data/design_points.csv diff --git a/_targets.R b/_targets.R new file mode 100644 index 0000000..9082ca4 --- /dev/null +++ b/_targets.R @@ -0,0 +1,42 @@ +library(targets) +library(tarchetypes) + +tar_option_set( + packages = c( + "tidyverse", "dplyr", "sf", "terra", + "randomForest", "keras3", "PEcAn.all", "caladaptr" + ) +) + +list( + tar_target(prepare_data, { + source("downscale/00-prepare.R") + data_for_clust_with_ids # output from 00-prepare.R + }), + 
tar_target(cluster_sites, + { + source("downscale/01_cluster_and_select_design_points.R") + cluster_output # output from 01_cluster_and_select_design_points.R + }, + deps = prepare_data + ), + tar_target(simulations, + { + source("downscale/02_design_point_simulations.R") + design_point_wide # output from 02-design_point_simulations.R + }, + deps = cluster_sites + ), + tar_target(downscale, + { + source("downscale/03_downscale_and_agregate.R") + ensemble_data # output from 03_downscale_and_agregate.R + }, + deps = simulations + ), + tar_quarto( + analysis_report, + path = "04-analysis.qmd", + deps = list(simulations, downscale) + ) +) diff --git a/data/anchor_sites_ids.csv b/data/anchor_sites_ids.csv new file mode 100644 index 0000000..028f6fd --- /dev/null +++ b/data/anchor_sites_ids.csv @@ -0,0 +1,26 @@ +id,lat,lon,location,site_name,distance,geometry +2e11530b93cd492f,36.177799888824005,-120.2041070387766,US-ASH,San Joaquin Valley Almond High Salinity,0,"c(-18200.4533456107, -204351.896427268)" +3747151a269036b7,36.94728213363369,-120.10533383829694,US-ASL,San Joaquin Valley Almond Low Salinity,0,"c(-9106.72692414989, -118913.170740419)" +2e11530b93cd492f,36.177799888824005,-120.2041070387766,US-ASM,San Joaquin Valley Almond Medium Salinity,0,"c(-18200.4533456107, -204351.896427268)" +e07417e56c78efbc,36.82778534752612,-120.13964104692187,US-PSH,San Joaquin Valley Pistachio High Salinity,0,"c(-12443.4766403388, -132135.913488504)" +e07417e56c78efbc,36.82778534752612,-120.13964104692187,US-PSL,San Joaquin Valley Pistachio Low Salinity,0,"c(-12443.4766403388, -132135.913488504)" +278cc612aa84983a,38.099349553216186,-121.5015478519803,US-Bi1,Bouldin Island Alfalfa,0,"c(-131305.841205359, 10246.5206530676)" +dce435941adb4679,38.109586695737626,-121.53611176665588,US-Bi2,Bouldin Island Corn,0,"c(-134423.030434314, 11397.1114464211)" +3cea434df9ea5a61,32.81280501889075,-115.44206224718269,US-Dea,UC Desert REC Alfalfa,0,"c(427219.032719703, -567399.727086836)" +aec9f214083b8f69,38.131541484694665,-121.55191049337776,US-DS1,Staten Corn 1,0,"c(-134720.473528729, 14114.7427923167)" +713f08eb3322d4df,38.134598947289916,-121.51170668226486,US-DS2,Staten Corn 2,0,"c(-132525.126666226, 14524.1023560269)" +175358d15d27e2f2,38.12345747466536,-121.54982768148254,US-DS3,Staten Rice 1,0,"c(-135613.932821504, 13017.4923524545)" +bfee4301f7c99060,36.35733308587667,-119.09282519975774,US-Lin,Lindcove Orchard,0,"c(81360.1815020208, -184101.87965719)" +86e23fd30a7c0360,39.577102324150175,-121.86224204465502,US-RGB,Butte County Rice Farm,0,"c(-159486.330544182, 175158.013580008)" +e34fd28a6e4fabee,37.69947277766893,-121.1391060770639,US-RGF,Stanislaus Forage Farm,0,"c(-100102.043437352, -34624.8600835721)" +62b121132228b0f8,39.59613673857021,-122.02261183095081,US-RGG,Glenn County Rice Farm,0,"c(-173815.733795438, 177251.993701875)" +7a561525ca4933d3,39.678693479807265,-122.00275036599245,US-RGO,Glenn County Organic Rice,1.48947665615348,"c(-171665.812216293, 186775.251166129)" +528fcb9581143cf5,38.167462870628164,-121.50580831669866,US-Si1,Staten Island Fallow,0,"c(-131645.501113361, 18645.981438884)" +38f1738eb5750e06,38.16651300978058,-121.52120421290903,US-Si2,Staten Island Flooded,0,"c(-132693.326336371, 18784.9819894633)" +5628e37756df9d41,38.10355405637426,-121.6381351773307,US-Tw2,Twitchell Corn,0,"c(-143324.768603639, 10189.6513305134)" +66de78917c6222b0,38.1157201973455,-121.64507432545884,US-Tw3,Twitchell Alfalfa,0,"c(-144200.168244178, 12239.4602416055)" 
+0fb8904477a3d0ce,38.10812386794265,-121.65528241872406,US-Twt,Twitchell Island,0,"c(-144755.572864808, 11526.4416088737)" +93d63eb396c4fcf9,38.54295166338029,-121.87235457852974,russell-ranch,Russell Ranch,0.20145926374138,"c(-163142.459970614, 60154.8849130729)" +106ced48ab2599e0,36.34077318361296,-120.10544017924985,west-side,West Side Research Center,55.7382484621095,"c(-9681.32377587362, -186108.326980472)" +04681ffea182d977,38.50242041442077,-121.97641993262302,wolfskill,Wolfskill Experimental Orchards,0,"c(-172200.763852519, 55891.621553943)" +9c562bb603f3b156,33.96726803332844,-117.34048733581272,ucr-citrus,UC Riverside Citrus Research,0,"c(245813.938552732, -446378.780685794)" diff --git a/data/design_points.csv b/data/design_points.csv new file mode 100644 index 0000000..7964aba --- /dev/null +++ b/data/design_points.csv @@ -0,0 +1,101 @@ +id,lat,lon +84389254458b3a58,39.15317154882152,-120.95862855945161 +9e3b1749179a4a68,37.99925059252099,-121.20417111702339 +729a6ef989c5a277,35.96903914573576,-118.98552352403604 +8c23aa17d5b211ed,37.23624485456503,-120.325416895509 +57ea98e608333b76,36.9069695937337,-121.68079166613182 +6c1e7a1eb4c36ea5,36.376933120111964,-119.45050526430943 +5b18c5e0bebe449b,34.472437079253254,-119.22014978387774 +058d96dea56318e8,39.845445833262076,-121.99765987838188 +df703141000c59c9,33.90118525984266,-117.4062351725988 +b8b2dd94a7c44662,33.353055572072144,-117.19182032970713 +dbb2c95a5cb30b5d,35.05617905596573,-119.27349643104922 +32b66a2605ee700d,36.62790357625534,-119.55109778115332 +4c3498f22c47aa3f,36.473246624392274,-119.58999372486372 +6def26440f10476c,36.55241300729422,-119.45881344136238 +59d94ec22335aba5,40.112945252187885,-122.12625218266525 +04268ba7e72d7bfb,36.04216376334651,-119.36168176985615 +f0c5457e0db29788,36.18711363649504,-118.99794513559745 +69151056c07813d8,36.01650482108036,-119.12213181961096 +096c9eebd28d0280,34.91295471748269,-120.40345170635351 +ebb783e86d2ac6fb,33.5784705873275,-116.03157176577903 +cf0be21778f4dfa5,36.64072986620738,-119.42861772753282 +b68880e61d448b93,36.628935713055334,-119.6304967981404 +1cb62b1edd188368,37.62449120008097,-120.54849914677672 +92e5e100e1a3a472,39.685053412615225,-122.3378033660457 +148e91e1a4352adb,39.883929087235735,-122.17230280623504 +e664ce5ac2fb2b6f,39.91336805758695,-122.11623004432802 +b33f7db23b64a377,37.95408699615648,-121.21347717890643 +268a52869b01fe54,36.703002551108156,-120.23482871624192 +bd841534f8883a00,36.79087105078594,-120.42721632875711 +60f846ca525da31e,36.557282635126995,-119.2223647492762 +e5d633da3758c491,38.698241380798265,-122.82083682384803 +ce81d5b03685a33e,36.48936289567316,-119.44392501284031 +ae81f35c7ed7c3f1,37.58450083241338,-121.28798414625811 +01609970faa3c5b9,38.247486225652,-122.1132785251325 +08da82da7c956979,38.10501623512874,-121.2107110261319 +d1dd970659a71943,34.37258472945189,-119.03318351437721 +39b7dba5debef188,36.643806231195825,-119.28748125011076 +8b27eb5fc60d506c,36.25395494422166,-119.26318761718049 +29baab2f12e2fd80,36.62869595451381,-119.55927338546941 +7f1efab0660af0a9,38.29195260046744,-121.54984763089621 +9c19b1bcbd391e00,36.665696469841485,-120.07237592351424 +462a123c3840df4e,36.88372637787987,-119.96643408398732 +2bd084fb5a916420,37.52907547714805,-121.1363874822378 +a05ea26a797f4bad,36.74201351767665,-119.66029084888854 +6fe9d9756f962d11,38.999929717176904,-122.85197002568268 +8c37c59148a2b480,36.950873550214546,-121.79638001291717 +cf9c2d9309d78058,35.656293805156665,-119.9791657782526 
+765ad4a2cb597b04,37.61727674898043,-120.85985117950787 +d69343ed38ea49c7,35.93301364948056,-119.0188799294081 +c8ca707e38904560,37.430698407931075,-120.825633670933 +984a04e7d14a80a3,37.378490673407306,-120.70411638617661 +cf1c223d4e408116,37.07745477447745,-120.44392158740519 +11aa82e47c15b8df,34.38595949634103,-118.81445533388944 +f5cde95be18bb83d,39.61336729923421,-121.84504941875572 +2c9f5faf24bff9fd,36.139246198074844,-119.10524906004886 +30d7e9a739591579,38.694890771466596,-122.04115169522713 +889d6cb7440babd8,39.14835505068194,-121.78504578064799 +7afa2a6c097c1f5b,36.362595802193916,-119.10984333853257 +7b2e85c30ecfcbe8,36.036952558710894,-118.96561932450625 +ecc585936d3203a5,36.63049097515202,-119.58155869596567 +a860cdc63cfc7cd1,35.98506301531852,-119.01458795848025 +17b056351d1484f8,36.738759687243714,-120.03573971347572 +e968e9c8f8574cb2,36.383833240819285,-119.65554268083451 +000f186bcf4b2be5,37.29108282118807,-121.04882363442482 +54d2ebadd8e4e8e2,37.491197984797665,-121.0085531562697 +6616b64433b71c3c,35.772843204255615,-119.09087891682833 +1114b0b499d4854c,36.51520247850828,-119.42062406057501 +4032fbd64a0b5147,36.76356458613677,-119.50957323217338 +6328b22167f31aad,33.86884047977172,-117.40837842690881 +9a28cf783448dc03,34.29707712580956,-119.06013978221256 +25d92d00fe5f31ea,35.99612889983196,-119.13104338547221 +254f091dee798a22,39.867933961656206,-122.16830730681002 +f87888e9df3fab14,39.70189739031584,-121.79205467074456 +05ce4008f973ff67,36.49534520970872,-119.5917993311476 +132df615713a9102,36.96032567057031,-121.37833422008488 +64cf20b7075f16bd,36.28704555525085,-119.12755106080951 +8c469dfef3eac688,39.73425797216054,-122.13900242983709 +d4aa2f4b173e52d9,39.894655731543544,-122.218282577988 +7a8855e8f27579dd,36.82200408868894,-120.01066959023188 +02ed5f046523985e,37.66867911623095,-121.71245595264062 +3bde8bc179b8fcae,36.90691122790127,-119.67871974793037 +5e21d98f69f14285,35.49646210616489,-119.7232326291538 +5aabe8a4b61ac4b1,37.70219947002282,-120.98628447816145 +ba5921d796f688e5,39.54046659749015,-122.22839029392931 +bf27f77fb0ee40e0,39.14372601708733,-121.57068428157207 +4c49ef413147e623,38.50692937324865,-121.97470253705194 +427606829eff9a98,36.60051373911106,-119.25963820359836 +24766015396f3dde,36.63384726465495,-119.49151662644384 +95af7876982dad47,39.667825249028695,-122.25810998713746 +e3daa31507eccfd0,39.16472984057072,-121.76550564744215 +b609ae97085420fb,37.87291873963135,-121.30336862647805 +1e113c95a1054df5,38.01749742979522,-121.19637437982693 +d6ce58a349148d32,36.20587304779096,-120.25336201321787 +2e11530b93cd492f,36.177799888824005,-120.2041070387766 +3747151a269036b7,36.94728213363369,-120.10533383829694 +2e11530b93cd492f,36.177799888824005,-120.2041070387766 +e07417e56c78efbc,36.82778534752612,-120.13964104692187 +e07417e56c78efbc,36.82778534752612,-120.13964104692187 +bfee4301f7c99060,36.35733308587667,-119.09282519975774 +9c562bb603f3b156,33.96726803332844,-117.34048733581272 diff --git a/downscale/00-prepare.R b/downscale/00-prepare.R index 417b9dc..ca73bdd 100644 --- a/downscale/00-prepare.R +++ b/downscale/00-prepare.R @@ -389,6 +389,12 @@ data_for_clust_with_ids <- .all |> save(data_for_clust_with_ids, file = "cache/data_for_clust_with_ids.rda") +# Final output for targets; if not in targets, suppress return +if (exists("IN_TARGETS") && IN_TARGETS) { + data_for_clust_with_ids +} else { + invisible(data_for_clust_with_ids) +} #' #' -#' +#' diff --git a/downscale/01_cluster_and_select_design_points.R 
b/downscale/01_cluster_and_select_design_points.R index ea618d5..dd8ce49 100644 --- a/downscale/01_cluster_and_select_design_points.R +++ b/downscale/01_cluster_and_select_design_points.R @@ -47,10 +47,6 @@ if('mean_temp' %in% names(data_for_clust_with_ids)){ "this conditional chunk") } -# set coordinate reference system -ca_albers_crs <- 3310 # California Albers EPSG - - #' #' ## Load Site Environmental Data #' @@ -75,19 +71,41 @@ ca_albers_crs <- 3310 # California Albers EPSG #' Load Anchor Sites from UC Davis, UC Riverside, and Ameriflux. #' ## ----anchor-sites-selection--------------------------------------------------- -anchor_sites <- readr::read_csv("data/anchor_sites.csv") -ca_woody <- sf::st_read("data/ca_woody.gpkg") |> - select(-pft) # duplicates pft column in anchor_sites -anchor_sites_pts <- anchor_sites |> - sf::st_as_sf(coords = c("lon", "lat"), crs = 4326) |> - sf::st_join(ca_woody, join = sf::st_within) |> - dplyr::select(id, lat, lon, location, site_name, crops, pft) +# set coordinate reference system, local and in meters for faster joins +ca_albers_crs <- 3310 # California Albers EPSG -# print a nice table of anchor sites -knitr::kable(anchor_sites) +anchor_sites_pts <- readr::read_csv("data/anchor_sites.csv") |> + sf::st_as_sf(coords = c("lon", "lat"), crs = 4326) |> + sf::st_transform(crs = ca_albers_crs) |> + dplyr::mutate(pt_geometry = geometry) |> + rename(anchor_site_pft = pft) + +# ca_woody <- sf::st_read("data/ca_woody.gpkg") + +ca_fields <- sf::st_read("data/ca_fields.gpkg") |> + # must use st_crs(anchor_sites_pts) b/c !identical(ca_albers_crs, st_crs(ca_albers_crs)) + sf::st_transform(crs = st_crs(anchor_sites_pts)) |> + rename(landiq_pft = pft) + +# Get the index of the nearest polygon for each point +nearest_idx <- st_nearest_feature(anchor_sites_pts, ca_fields) +site_field_distances <- diag(st_distance(anchor_sites_pts, ca_fields |> slice(nearest_idx))) +ca_field_ids <- ca_fields |> + dplyr::slice(nearest_idx) |> + dplyr::select(id, lat, lon) + +anchor_sites_ids <- dplyr::bind_cols( + anchor_sites_pts, + ca_field_ids, + distance = site_field_distances +) |> + dplyr::select(id, lat, lon, location, site_name, distance) #,anchor_site_pft, landiq_pft) + +anchor_sites_ids |> + readr::write_csv("data/anchor_sites_ids.csv") # create map of anchor sites -anchor_sites_pts |> +anchor_sites_ids |> sf::st_transform(., crs = ca_albers_crs) |> ggplot() + geom_sf(data = ca_climregions, aes(fill = climregion_name), alpha = 0.25) + @@ -309,7 +327,7 @@ final_design_points <- bind_rows(design_points_ids, final_design_points |> as_tibble() |> select(id, lat, lon) |> - write_csv("data/final_design_points.csv") + write_csv("data/design_points.csv") #' @@ -370,4 +388,6 @@ pft_area <- pft_area |> pft_area |> kableExtra::kable() +cluster_output # final output from clustering and design point selection + diff --git a/downscale/02_design_point_simulations.R b/downscale/02_design_point_simulations.R index 8f82314..1137727 100644 --- a/downscale/02_design_point_simulations.R +++ b/downscale/02_design_point_simulations.R @@ -65,3 +65,5 @@ design_point_wide <- design_point_results |> ensemble_data <- list("2020-01-01" = design_point_wide) saveRDS(ensemble_data, "cache/ensemble_data.rds") + +design_point_wide # final simulated design point output diff --git a/downscale/03_downscale_and_agregate.R b/downscale/03_downscale_and_agregate.R index 833fa16..b14dc51 100644 --- a/downscale/03_downscale_and_agregate.R +++ b/downscale/03_downscale_and_agregate.R @@ -53,6 +53,9 @@ covariates_sf <- 
covariates_points |> sf::st_as_sf(coords = c("lon", "lat"), crs = "EPSG:4326") # Preprocess using renamed objects and updated ensemble data with date + +####################### Start Here ############################ +# Find example inputs: `find "/projectnb/dietzelab/jploshay/" -name "*.rds"` preprocessed <- SDA_downscale_preprocess( ensemble_data = ensemble_data, site_coords = site_coords, @@ -60,7 +63,7 @@ preprocessed <- SDA_downscale_preprocess( carbon_pool = "SOC" ) -####################### Start Here ############################ + ## Next steps debugonce(SDA_downscale) ## Stop at randomForest and see what is expected vs what is provided @@ -82,6 +85,7 @@ downscale_output <- SDA_downscale( metrics <- SDA_downscale_metrics(downscale_output, carbon_pool = "SOC") print(metrics) + #' #' #' ## Aggregate to County Level From 6b64c7a5e3c715ab4b07015adca369cf1cd257f1 Mon Sep 17 00:00:00 2001 From: David LeBauer Date: Thu, 13 Mar 2025 01:29:38 -0400 Subject: [PATCH 16/49] add first version of script to extract sipnet output --- downscale/02_extract_sipnet_output.R | 131 ++++++++++++++++++ ...tions.R => _02_design_point_simulations.R} | 0 2 files changed, 131 insertions(+) create mode 100644 downscale/02_extract_sipnet_output.R rename downscale/{02_design_point_simulations.R => _02_design_point_simulations.R} (100%) diff --git a/downscale/02_extract_sipnet_output.R b/downscale/02_extract_sipnet_output.R new file mode 100644 index 0000000..aa14e34 --- /dev/null +++ b/downscale/02_extract_sipnet_output.R @@ -0,0 +1,131 @@ +library(PEcAn.logger) +library(lubridate) +library(dplyr) +here::i_am('.here') + +# Define base directory for ensemble outputs +basedir <- "/projectnb/dietzelab/ccmmf/ccmmf_phase_1b_98sites_20reps_20250312" +outdir <- file.path(basedir, "out") + +# Variables to extract +variables <- c("AGB", "TotSoilCarb") + +# Read Settings +settings <- PEcAn.settings::read.settings(file.path(basedir, "pecan.CONFIGS.xml")) +ensemble_size <- settings$ensemble$size |> as.numeric() +start_date <- settings$run$settings.1$start.date # TODO make this unique for each site +end_date <- settings$run$settings.1$end.date +end_year <- lubridate::year(end_date) + +#' **Available Variables** +#' +#' See SIPNET parameters.md for more details +#' +#' | Variable | Description | +#' |-------------------------------|------------------------------------------| +#' | GPP | Gross Primary Productivity | +#' | NPP | Net Primary Productivity | +#' | TotalResp | Total Respiration | +#' | AutoResp | Autotrophic Respiration | +#' | HeteroResp | Heterotrophic Respiration | +#' | SoilResp | Soil Respiration | +#' | NEE | Net Ecosystem Exchange | +#' | AbvGrndWood | Above ground woody biomass | +#' | leaf_carbon_content | Leaf Carbon Content | +#' | TotLivBiom | Total living biomass | +#' | TotSoilCarb | Total Soil Carbon | +#' | Qle | Latent heat | +#' | Transp | Total transpiration | +#' | SoilMoist | Average Layer Soil Moisture | +#' | SoilMoistFrac | Average Layer Fraction of Saturation | +#' | SWE | Snow Water Equivalent | +#' | litter_carbon_content | Litter Carbon Content | +#' | litter_mass_content_of_water | Average layer litter moisture | +#' | LAI | Leaf Area Index | +#' | fine_root_carbon_content | Fine Root Carbon Content | +#' | coarse_root_carbon_content | Coarse Root Carbon Content | +#' | GWBI | Gross Woody Biomass Increment | +#' | AGB | Total aboveground biomass | +#' | time_bounds | history time interval endpoints | + +# Preallocate 3-D array for 98 sites, 2 variables, and 20 ensemble members +site_ids 
<- readr::read_csv(here::here("data/design_points.csv")) |> + pull(id) |> + unique() +ens_ids <- PEcAn.utils::left.pad.zeros(1:ensemble_size) + +##-----TESTING SUBSET-----## +# comment out for full run # +#site_ids <- site_ids[1:5] +#ens_ids <- ens_ids[1:5] + +ens_dirs <- expand.grid(ens = ens_ids, site = site_ids, stringsAsFactors = FALSE) |> + mutate(dir = file.path(outdir, paste("ENS", ens, site, sep = "-"))) +# check that all ens dirs exist +existing_dirs <- file.exists(ens_dirs$dir) +if (!all(existing_dirs)) { + missing_dirs <- ens_dirs[!existing_dirs] + PEcAn.logger::logger.warn("Missing expected ensemble directories: ", paste(missing_dirs, collapse = ", ")) +} + +# Loop through ensemble folders and extract output via read.output +library(furrr) +plan(multisession) + +# Use purrr and dplyr to process ensemble directories in parallel +ens_results <- furrr::future_pmap_dfr( + ens_dirs, + function(ens, site, dir) { + out_df <- PEcAn.utils::read.output( + runid = paste(ens, site, sep = "-"), + outdir = dir, + start.year = end_year, # only reading in final year + end.year = end_year, + variables = variables, + dataframe = TRUE, + verbose = FALSE + ) |> + mutate(site = site, ens = ens) + }, + .options = furrr::furrr_options(seed = TRUE) +) |> + group_by(ens, site) |> + filter(posix == max(posix)) |> + ungroup() |> + arrange(ens, site) + + +ens_array <- array(NA, + dim = c(length(site_ids), length(variables), length(ens_ids)), + dimnames = list( + site = site_ids, + variable = variables, + ensemble = ens_ids + ) +) + +i_site <- match(ens_results$site, site_ids) +i_variable <- match(ens_results$variable, variables) +i_ens <- match(ens_results$ens, ens_ids) +ens_array[cbind(i_site, i_variable, i_ens)] <- ens_results$value + +save(ens_array, file = file.path(outdir, "ens_array.RData")) + +## Create EFI std data structure +logfile <- dir(basedir, pattern = "pecan_workflow_runlog") +pattern <- "^pecan_workflow_runlog_([0-9]{14})_([0-9]+-[0-9]+)\\.log$" +matches <- stringr::str_match(logfile, pattern) +forecast_time_string <- matches[2] +forecast_unique_id <- matches[3] + +efi_std <- ens_results |> + left_join(readr::read_csv("data/design_points.csv") |> distinct(), by = c("site" = "id")) |> + mutate( + forecast_iteration_id = forecast_unique_id, + forecast_time = as.POSIXct(forecast_time_string, format = "%Y%m%d%H%M%S"), + obs_flag = 0 + ) |> + rename(time = posix, ensemble = ens, X = lon, Y = lat) |> + select(time, ensemble, X, Y, TotSoilCarb, AGB, obs_flag) + +readr::write_csv(efi_std, file.path(outdir, "efi_std_ens_results.csv")) diff --git a/downscale/02_design_point_simulations.R b/downscale/_02_design_point_simulations.R similarity index 100% rename from downscale/02_design_point_simulations.R rename to downscale/_02_design_point_simulations.R From 7c461ff0157bfa5d75c874c42ee3f758c9290b12 Mon Sep 17 00:00:00 2001 From: David LeBauer Date: Thu, 13 Mar 2025 22:12:27 -0400 Subject: [PATCH 17/49] create EFI standard tables and arrays --- .../01_cluster_and_select_design_points.R | 40 +-- downscale/02_extract_sipnet_output.R | 232 ++++++++++++++---- 2 files changed, 207 insertions(+), 65 deletions(-) diff --git a/downscale/01_cluster_and_select_design_points.R b/downscale/01_cluster_and_select_design_points.R index dd8ce49..040628d 100644 --- a/downscale/01_cluster_and_select_design_points.R +++ b/downscale/01_cluster_and_select_design_points.R @@ -9,7 +9,9 @@ #' #' - Read in a dataset of site environmental data #' - Perform K-means clustering to identify clusters -#' - Select anchor sites for 
each cluster +#' - Select design points for each cluster +#' - create design_points.csv and anchor_sites.csv +#' #' #' ## Setup #' @@ -265,7 +267,8 @@ knitr::kable(cluster_summary |> round(0)) crop_ids <- read_csv("data/crop_ids.csv", col_types = cols( crop_id = col_factor(), - crop = col_character())) + crop = col_character()) + ) climregion_ids <- read_csv("data/climregion_ids.csv", col_types = cols( climregion_id = col_factor(), @@ -283,7 +286,8 @@ plan(sequential) #' #' ## Design Point Selection #' -#' For phase 1b we need to supply design points for SIPNET runs. For development we will use 100 design points from the clustered dataset that are _not_ already anchor sites. +#' For phase 1b we need to supply design points for SIPNET runs. +#' For development we will use 100 design points from the clustered dataset that are _not_ already anchor sites. #' #' For the final high resolution runs we expect to use approximately 10,000 design points. #' For woody croplands, we will start with a number proportional to the total number of sites with woody perennial pfts. @@ -304,10 +308,15 @@ missing_anchor_sites <- woody_anchor_sites|> if(nrow(missing_anchor_sites) > 0){ woody_anchor_sites <- woody_anchor_sites |> drop_na(lat, lon) - # there is an anchor site that doesn't match the ca_fields; + # there is an anchor site that doesn't match the ca_fields; # need to check on this. For now we will just remove it from the dataset. - PEcAn.logger::logger.warn("The following site(s) aren't within DWR crop fields:", - knitr::kable(missing_anchor_sites)) + PEcAn.logger::logger.warn( + "The following site(s) aren't within DWR crop fields:", + knitr::kable(missing_anchor_sites) + ) + PEcAn.logger::logger.info( + "Check the sf::st_nearest_feature join at the beginning of this script" + ) } set.seed(2222222) @@ -355,10 +364,7 @@ ggplot() + geom_sf(data = ca_fields_pts, fill = 'black', color = "grey", alpha = 0.5) - - -#' -#' +#' #' ## Woody Cropland Proportion #' #' Here we calculate percent of California croplands that are woody perennial crops, in order to estimate the number of design points that will be selected in the clustering step @@ -384,10 +390,16 @@ pft_area <- pft_area |> dplyr::mutate(area_pct = round(100 * pft_area / total_area)) |> select(-total_area, -pft_area) |> dplyr::rename("Woody Crops" = woody_indicator, "Area %" = area_pct) - -pft_area |> - kableExtra::kable() -cluster_output # final output from clustering and design point selection +PEcAn.logger::logger.info( + "Total area and proportion of fields that are woody perennial", + "crops in California croplands:", + pft_area |> kableExtra::kable() +) + +PEcAn.logger::logger.info( + "final output from clustering and design point selection:", + cluster_summary |> knitr::kable() +) # diff --git a/downscale/02_extract_sipnet_output.R b/downscale/02_extract_sipnet_output.R index aa14e34..4e8dfdf 100644 --- a/downscale/02_extract_sipnet_output.R +++ b/downscale/02_extract_sipnet_output.R @@ -1,24 +1,46 @@ library(PEcAn.logger) library(lubridate) library(dplyr) -here::i_am('.here') +library(ncdf4) +library(furrr) +library(stringr) + +no_cores <- parallel::detectCores(logical = FALSE) +plan(multicore, workers = no_cores - 1) # Define base directory for ensemble outputs basedir <- "/projectnb/dietzelab/ccmmf/ccmmf_phase_1b_98sites_20reps_20250312" outdir <- file.path(basedir, "out") -# Variables to extract -variables <- c("AGB", "TotSoilCarb") +# Get Run metadata from log filename +# ??? is there a more reliable way to do this? 
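# One possibly more reliable alternative (sketch only, assuming the workflow id is
# recorded in the settings file, e.g. as settings$workflow$id; this needs checking):
#   settings <- PEcAn.settings::read.settings(file.path(basedir, "pecan.CONFIGS.xml"))
#   forecast_iteration_id <- settings$workflow$id
# If that element is not available, fall back to parsing the run log filename below.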
+logfile <- dir(basedir, pattern = "pecan_workflow_runlog") +pattern <- "^pecan_workflow_runlog_([0-9]{14})_([0-9]+-[0-9]+)\\.log$" +matches <- stringr::str_match(logfile, pattern) +forecast_time_string <- matches[2] +forecast_iteration_id <- matches[3] +forecast_time <- as.POSIXct(forecast_time_string, format = "%Y%m%d%H%M%S") +obs_flag <- 0 -# Read Settings +# Read settings file and extract run information settings <- PEcAn.settings::read.settings(file.path(basedir, "pecan.CONFIGS.xml")) -ensemble_size <- settings$ensemble$size |> as.numeric() +ensemble_size <- settings$ensemble$size |> + as.numeric() start_date <- settings$run$settings.1$start.date # TODO make this unique for each site +start_year <- lubridate::year(start_date) end_date <- settings$run$settings.1$end.date end_year <- lubridate::year(end_date) +# Site Information +design_points <- readr::read_csv(here::here("data/design_points.csv"), show_col_types = FALSE) |> + distinct() + +# Variables to extract +variables <- c("AGB", "TotSoilCarb") + #' **Available Variables** -#' +#' This list is from the YYYY.nc.var files, and may change, +#' e.g. if we write out less information in order to save time and storage space #' See SIPNET parameters.md for more details #' #' | Variable | Description | @@ -49,83 +71,191 @@ end_year <- lubridate::year(end_date) #' | time_bounds | history time interval endpoints | # Preallocate 3-D array for 98 sites, 2 variables, and 20 ensemble members -site_ids <- readr::read_csv(here::here("data/design_points.csv")) |> +site_ids <- design_points |> pull(id) |> unique() -ens_ids <- PEcAn.utils::left.pad.zeros(1:ensemble_size) +ens_ids <- 1:ensemble_size ##-----TESTING SUBSET-----## # comment out for full run # -#site_ids <- site_ids[1:5] -#ens_ids <- ens_ids[1:5] +# site_ids <- site_ids[1:5] +# ens_ids <- ens_ids[1:5] +# start_year <- end_year - 1 -ens_dirs <- expand.grid(ens = ens_ids, site = site_ids, stringsAsFactors = FALSE) |> +ens_dirs <- expand.grid(ens = PEcAn.utils::left.pad.zeros(ens_ids), + site = site_ids, + stringsAsFactors = FALSE) |> mutate(dir = file.path(outdir, paste("ENS", ens, site, sep = "-"))) -# check that all ens dirs exist +# Check that all ens dirs exist existing_dirs <- file.exists(ens_dirs$dir) if (!all(existing_dirs)) { missing_dirs <- ens_dirs[!existing_dirs] PEcAn.logger::logger.warn("Missing expected ensemble directories: ", paste(missing_dirs, collapse = ", ")) } -# Loop through ensemble folders and extract output via read.output -library(furrr) -plan(multisession) - -# Use purrr and dplyr to process ensemble directories in parallel +# extract output via read.output ens_results <- furrr::future_pmap_dfr( ens_dirs, function(ens, site, dir) { out_df <- PEcAn.utils::read.output( runid = paste(ens, site, sep = "-"), outdir = dir, - start.year = end_year, # only reading in final year + start.year = start_year, end.year = end_year, variables = variables, dataframe = TRUE, verbose = FALSE ) |> - mutate(site = site, ens = ens) + dplyr::mutate(site = site, ensemble = as.numeric(ens)) |> + dplyr::rename(time = posix) }, .options = furrr::furrr_options(seed = TRUE) ) |> - group_by(ens, site) |> - filter(posix == max(posix)) |> + group_by(ensemble, site, year) |> + filter(year <= end_year) |> + filter(time == max(time)) |> # only take last value ungroup() |> - arrange(ens, site) + arrange(ensemble, site, year) |> + tidyr::pivot_longer(cols = all_of(variables), names_to = "variable", values_to = "prediction") +# --- Create 4-D array --- +# Add a time dimension (even if of length 1) so 
that dimensions are: [time, site, ensemble, variable] +unique_times <- sort(unique(ens_results$time)) +if(length(unique_times) != length(start_year:end_year)){ + # this check may fail if we are using > one time point per year, + # i.e. if the code above including group_by(.., year) is changed + PEcAn.logger::logger.warn( + "there should only be one unique time per year", + "unless we are doing a time series with multiple time points per year" + ) +} -ens_array <- array(NA, - dim = c(length(site_ids), length(variables), length(ens_ids)), - dimnames = list( - site = site_ids, - variable = variables, - ensemble = ens_ids +# Create a list to hold one 3-D array per variable +ens_arrays <- list() +for (var in variables) { + # Preallocate 3-D array for time, site, ensemble for each variable + arr <- array(NA, + dim = c(length(unique_times), length(site_ids), length(ens_ids)), + dimnames = list( + datetime = as.character(unique_times), + site = site_ids, + ensemble = as.character(ens_ids) + ) ) + + # Get rows corresponding to the current variable + subset_idx <- which(ens_results$variable == var) + if (length(subset_idx) > 0) { + i_time <- match(ens_results$time[subset_idx], unique_times) + i_site <- match(ens_results$site[subset_idx], site_ids) + i_ens <- match(ens_results$ensemble[subset_idx], ens_ids) + arr[cbind(i_time, i_site, i_ens)] <- ens_results$prediction[subset_idx] + } + + ens_arrays[[var]] <- arr +} + +save(ens_arrays, file = file.path(outdir, "efi_ens_arrays.RData")) + +efi_long <- ens_results |> + rename(datetime = time) |> + select(datetime, site, ensemble, variable, prediction) + +readr::write_csv(efi_long, file.path(outdir, "efi_ens_long.csv")) + + +####--- Generate EFI Standard v1.0 NetCDF files +library(ncdf4) +# Assume these objects already exist (created above): +# unique_times: vector of unique datetime strings +# design_points: data frame with columns lat, lon, and id (site_ids) +# ens_ids: vector of ensemble member numbers (numeric) +# ens_arrays: list with elements "AGB" and "TotSoilCarb" that are arrays +# with dimensions: datetime, site, ensemble + +# Get dimension names / site IDs +time_char <- unique_times + +lat <- design_points |> + filter(id %in% site_ids) |> # only required when testing w/ subset + dplyr::pull(lat) +lon <- design_points |> + filter(id %in% site_ids) |> + dplyr::pull(lon) + +# Convert time to CF-compliant values using PEcAn.utils::datetime2cf +time_units <- "days since 1970-01-01 00:00:00" +cf_time <- PEcAn.utils::datetime2cf(time_char, unit = time_units) + +# TODO: could accept start year as an argument to the to_ncdim function if variable = 'time'? Or set default? +# Otherwise this returns an invalid dimension +# time_dim <- PEcAn.utils::to_ncdim("time", cf_time) +time_dim <- ncdf4::ncdim_def( + name = "datetime", + longname = "time", + units = time_units, + vals = cf_time, + calendar = "standard", + unlim = TRUE ) +# For ensemble, we use the available ens_ids; for site, we use the indices of site_ids. 
+ensemble_dim <- ncdim_def("ensemble", "", vals = ens_ids, longname = "ensemble member", unlim = FALSE) +site_dim <- ncdim_def("site", "", vals = seq_along(site_ids), longname = "Site ID", unlim = FALSE) -i_site <- match(ens_results$site, site_ids) -i_variable <- match(ens_results$variable, variables) -i_ens <- match(ens_results$ens, ens_ids) -ens_array[cbind(i_site, i_variable, i_ens)] <- ens_results$value +# Use dims in reversed order so that the unlimited (time) dimension ends up as the record dimension: +dims <- list(time_dim, site_dim, ensemble_dim) -save(ens_array, file = file.path(outdir, "ens_array.RData")) +# Define forecast variables: +agb_ncvar <- ncvar_def( + name = "AGB", + units = "kg C m-2", + dim = dims, + longname = "Total aboveground biomass" +) +soc_ncvar <- ncvar_def( + name = "TotSoilCarb", + units = "kg C m-2", + dim = dims, + longname = "Total Soil Carbon" +) -## Create EFI std data structure -logfile <- dir(basedir, pattern = "pecan_workflow_runlog") -pattern <- "^pecan_workflow_runlog_([0-9]{14})_([0-9]+-[0-9]+)\\.log$" -matches <- stringr::str_match(logfile, pattern) -forecast_time_string <- matches[2] -forecast_unique_id <- matches[3] - -efi_std <- ens_results |> - left_join(readr::read_csv("data/design_points.csv") |> distinct(), by = c("site" = "id")) |> - mutate( - forecast_iteration_id = forecast_unique_id, - forecast_time = as.POSIXct(forecast_time_string, format = "%Y%m%d%H%M%S"), - obs_flag = 0 - ) |> - rename(time = posix, ensemble = ens, X = lon, Y = lat) |> - select(time, ensemble, X, Y, TotSoilCarb, AGB, obs_flag) - -readr::write_csv(efi_std, file.path(outdir, "efi_std_ens_results.csv")) +nc_vars <- list( + time = time_var, + lat = lat_var, + lon = lon_var, + AGB = agb_ncvar, + TotSoilCarb = soc_ncvar +) + +nc_file <- file.path(outdir, "efi_forecast.nc") + +if (file.exists(nc_file)) { + file.remove(nc_file) + +} + +nc_out <- ncdf4::nc_create(nc_file, nc_vars) +# Add attributes to coordinate variables for clarity +# ncdf4::ncatt_put(nc_out, "time", "bounds", "time_bounds", prec = NA) +# ncdf4::ncatt_put(nc_out, "time", "axis", "T", prec = NA) +# ncdf4::ncatt_put(nc_out, "site", "axis", "Y", prec = NA) +# ncdf4::ncatt_put(nc_out, "ensemble", "axis", "E", prec = NA) + +# Write data into the netCDF file. +ncvar_put(nc_out, time_var, cf_time) +ncvar_put(nc_out, lat_var, lat) +ncvar_put(nc_out, lon_var, lon) +ncvar_put(nc_out, agb_ncvar, ens_arrays[["AGB"]]) +ncvar_put(nc_out, soc_ncvar, ens_arrays[["TotSoilCarb"]]) + +# Add global attributes per EFI standards. +ncatt_put(nc_out, 0, "model_name", "SIPNET") +ncatt_put(nc_out, 0, "model_version", "v1.3") +ncatt_put(nc_out, 0, "iteration_id", "forecast_iteration_id") +ncatt_put(nc_out, 0, "forecast_time", forecast_time) +ncatt_put(nc_out, 0, "obs_flag", 0) +ncatt_put(nc_out, 0, "creation_date", Sys.time()) +# Close the netCDF file. 
+nc_close(nc_out) + +PEcAn.logger::logger.info("EFI-compliant netCDF file 'efi_forecast.nc' created.") From 22c36258b67dea6b9e98831f4b588d73ec0f09c9 Mon Sep 17 00:00:00 2001 From: David LeBauer Date: Fri, 14 Mar 2025 15:40:23 -0400 Subject: [PATCH 18/49] correct netcdf format --- downscale/02_extract_sipnet_output.R | 37 ++++++++++++++++++++-------- 1 file changed, 27 insertions(+), 10 deletions(-) diff --git a/downscale/02_extract_sipnet_output.R b/downscale/02_extract_sipnet_output.R index 4e8dfdf..ee75f34 100644 --- a/downscale/02_extract_sipnet_output.R +++ b/downscale/02_extract_sipnet_output.R @@ -19,7 +19,7 @@ pattern <- "^pecan_workflow_runlog_([0-9]{14})_([0-9]+-[0-9]+)\\.log$" matches <- stringr::str_match(logfile, pattern) forecast_time_string <- matches[2] forecast_iteration_id <- matches[3] -forecast_time <- as.POSIXct(forecast_time_string, format = "%Y%m%d%H%M%S") +forecast_time <- lubridate::as_datetime(as.POSIXct(forecast_time_string, format = "%Y%m%d%H%M%S")) obs_flag <- 0 # Read settings file and extract run information @@ -191,16 +191,15 @@ cf_time <- PEcAn.utils::datetime2cf(time_char, unit = time_units) # Otherwise this returns an invalid dimension # time_dim <- PEcAn.utils::to_ncdim("time", cf_time) time_dim <- ncdf4::ncdim_def( - name = "datetime", - longname = "time", + name = "ntime", + longname = "Time middle averaging period", units = time_units, vals = cf_time, calendar = "standard", - unlim = TRUE + unlim = FALSE ) -# For ensemble, we use the available ens_ids; for site, we use the indices of site_ids. -ensemble_dim <- ncdim_def("ensemble", "", vals = ens_ids, longname = "ensemble member", unlim = FALSE) site_dim <- ncdim_def("site", "", vals = seq_along(site_ids), longname = "Site ID", unlim = FALSE) +ensemble_dim <- ncdim_def("ensemble", "", vals = ens_ids, longname = "ensemble member", unlim = FALSE) # Use dims in reversed order so that the unlimited (time) dimension ends up as the record dimension: dims <- list(time_dim, site_dim, ensemble_dim) @@ -218,6 +217,25 @@ soc_ncvar <- ncvar_def( dim = dims, longname = "Total Soil Carbon" ) +time_var <- ncvar_def( + name = "time", + units = "days since 1970-01-01 00:00:00", + dim = time_dim, + longname = "Time dimension" +) +lat_var <- ncvar_def( + name = "lat", + units = "degrees_north", + dim = site_dim, + longname = "Latitude" +) + +lon_var <- ncvar_def( + name = "lon", + units = "degrees_east", + dim = site_dim, + longname = "Longitude" +) nc_vars <- list( time = time_var, @@ -231,7 +249,6 @@ nc_file <- file.path(outdir, "efi_forecast.nc") if (file.exists(nc_file)) { file.remove(nc_file) - } nc_out <- ncdf4::nc_create(nc_file, nc_vars) @@ -251,10 +268,10 @@ ncvar_put(nc_out, soc_ncvar, ens_arrays[["TotSoilCarb"]]) # Add global attributes per EFI standards. ncatt_put(nc_out, 0, "model_name", "SIPNET") ncatt_put(nc_out, 0, "model_version", "v1.3") -ncatt_put(nc_out, 0, "iteration_id", "forecast_iteration_id") -ncatt_put(nc_out, 0, "forecast_time", forecast_time) +ncatt_put(nc_out, 0, "iteration_id", forecast_iteration_id) +ncatt_put(nc_out, 0, "forecast_time", format(forecast_time, "%Y-%m-%d %H:%M:%S")) ncatt_put(nc_out, 0, "obs_flag", 0) -ncatt_put(nc_out, 0, "creation_date", Sys.time()) +ncatt_put(nc_out, 0, "creation_date", format(Sys.time(), "%Y-%m-%d")) # Close the netCDF file. 
nc_close(nc_out) From 1e7b09c4de4135cc943ea8364c1d28fbb0d20766 Mon Sep 17 00:00:00 2001 From: David LeBauer Date: Mon, 17 Mar 2025 22:38:39 -0400 Subject: [PATCH 19/49] convert sipnetwopet simulated data as arrays --- downscale/_02_design_point_simulations.R | 39 +++++++++++++++--------- 1 file changed, 25 insertions(+), 14 deletions(-) diff --git a/downscale/_02_design_point_simulations.R b/downscale/_02_design_point_simulations.R index 1137727..ff1ea55 100644 --- a/downscale/_02_design_point_simulations.R +++ b/downscale/_02_design_point_simulations.R @@ -30,7 +30,7 @@ source("downscale/sipnetwopet.R") #' ### Join Design Points with Covariates #' ## ----------------------------------------------------------------------------- -design_points <- read_csv("data/final_design_points.csv") +design_points <- read_csv("data/design_points.csv") |> distinct() covariates <- load("data/data_for_clust_with_ids.rda") |> get() # Remove duplicate entries using 'id' @@ -51,19 +51,30 @@ design_point_results <- design_point_covs |> tidyr::unnest(result) |> dplyr::select(id, ensemble_id, SOC = soc, AGB = agb) -# Transform long to wide format, unwrapping SOC list to numeric value -design_point_wide <- design_point_results |> - tidyr::pivot_wider( - id_cols = id, - names_from = ensemble_id, - values_from = SOC, - names_prefix = "ensemble" - ) |> - dplyr::mutate(across(starts_with("ensemble"), ~ unlist(.))) +# Convert design_point_results into arrays using pivot_wider and as.array +arr_soc_matrix <- design_point_results |> + select(id, ensemble_id, SOC) |> + pivot_wider(names_from = ensemble_id, values_from = SOC) |> + column_to_rownames("id") |> + as.matrix() -# Save ensemble_data as a list with date naming, matching the expected shape -ensemble_data <- list("2020-01-01" = design_point_wide) +arr_soc <- as.array(arr_soc_matrix) +dim(arr_soc) <- c(1, nrow(arr_soc_matrix), ncol(arr_soc_matrix)) +dimnames(arr_soc) <- list(datetime = "2020-01-01", + site = rownames(arr_soc_matrix), + ensemble = colnames(arr_soc_matrix)) -saveRDS(ensemble_data, "cache/ensemble_data.rds") +arr_agb_matrix <- design_point_results |> + select(id, ensemble_id, AGB) |> + pivot_wider(names_from = ensemble_id, values_from = AGB) |> + column_to_rownames("id") |> + as.matrix() -design_point_wide # final simulated design point output +arr_agb <- as.array(arr_agb_matrix) +dim(arr_agb) <- c(1, nrow(arr_agb_matrix), ncol(arr_agb_matrix)) +dimnames(arr_agb) <- list(datetime = "2020-01-01", + site = rownames(arr_agb_matrix), + ensemble = colnames(arr_agb_matrix)) + +ensemble_arrays <- list(SOC = arr_soc, AGB = arr_agb) +saveRDS(ensemble_arrays, "cache/efi_ensemble_arrays.rds") From f4aacfb7258b55b6eff365acf2e08e390564ab64 Mon Sep 17 00:00:00 2001 From: David LeBauer Date: Mon, 17 Mar 2025 22:39:03 -0400 Subject: [PATCH 20/49] save objects as RDS instead of RData --- downscale/00-prepare.R | 17 ++++++++--------- downscale/01_cluster_and_select_design_points.R | 5 +++-- downscale/02_extract_sipnet_output.R | 2 +- 3 files changed, 12 insertions(+), 12 deletions(-) diff --git a/downscale/00-prepare.R b/downscale/00-prepare.R index ca73bdd..a4b7107 100644 --- a/downscale/00-prepare.R +++ b/downscale/00-prepare.R @@ -128,13 +128,12 @@ yolo_county_polygon <- sf::st_read("data/yolo_county_polygon_simplified.geojson" #' The landiq2std function will be added to the PEcAn.data.land package, and has been implemented in a Pull Request https://github.com/PecanProject/pecan/pull/3423. The function is a work in progress. Two key work to be done. 
First `landiq2std` does not currently perform all steps to get from the original LandIQ format to the standard format - some steps related to harmonizing LandIQ across years have been completed manually. Second, the PEcAn 'standard' for such data is under development as we migrate from a Postgres database to a more portable GeoPackage + CSV format. #' ## ----eval=FALSE--------------------------------------------------------------- -# input_file = 'data/i15_Crop_Mapping_2016_SHP/i15_Crop_Mapping_2016.shp' -# output_gpkg = 'data/ca_fields.gpkg' -# output_csv = 'data/ca_field_attributes.csv' -# -# landiq2std(input_file, output_gpkg, output_csv) +input_file = 'data/i15_Crop_Mapping_2016_SHP/i15_Crop_Mapping_2016.shp' +output_gpkg = 'data/ca_fields.gpkg' +output_csv = 'data/ca_field_attributes.csv' + +landiq2std(input_file, output_gpkg, output_csv) -#' #' ##### Subset Woody Perennial Crop Fields #' #' Phase 1 focuses on Woody Perennial Crop fields. @@ -295,7 +294,7 @@ ca_climregions <- caladaptr::ca_aoipreset_geom("climregions") |> sf::st_transform(crs = ca_albers_crs) |> dplyr::rename(climregion_id = id, climregion_name = name) -save(ca_climregions, file = "data/ca_climregions.rda") +saveRDS(ca_climregions, file = "data/ca_climregions.rds") #' ## ----join_climregions--------------------------------------------------------- @@ -304,7 +303,7 @@ ca_woody_pts_clay_ocd_twi_cr <- ca_woody_pts_clay_ocd_twi |> sf::st_join(ca_climregions, join = st_intersects, left = TRUE) # convenience cache. -save(ca_woody_pts_clay_ocd_twi_cr, file = "ca_woody_pts_clay_ocd_twi_cr.rda") +saveRDS(ca_woody_pts_clay_ocd_twi_cr, file = "cache/ca_woody_pts_clay_ocd_twi_cr.rda") #' #' ### GridMet @@ -387,7 +386,7 @@ data_for_clust_with_ids <- .all |> na.omit() |> mutate(across(where(is.numeric), ~ signif(., digits = 3))) -save(data_for_clust_with_ids, file = "cache/data_for_clust_with_ids.rda") +saveRDS(data_for_clust_with_ids, "data/data_for_clust_with_ids.rds") # Final output for targets; if not in targets, suppress return if (exists("IN_TARGETS") && IN_TARGETS) { diff --git a/downscale/01_cluster_and_select_design_points.R b/downscale/01_cluster_and_select_design_points.R index 040628d..2d24e5f 100644 --- a/downscale/01_cluster_and_select_design_points.R +++ b/downscale/01_cluster_and_select_design_points.R @@ -37,9 +37,10 @@ plan(multicore, workers = no_cores - 2) options(future.globals.maxSize = benchmarkme::get_ram() * 0.9) # load climate regions for mapping -load("data/ca_climregions.rda") +ca_climregions <- readRDS("data/ca_climregions.rds") # environmental covariates -load("cache/data_for_clust_with_ids.rda") +data_for_clust_with_ids <- readRDS("data/data_for_clust_with_ids.rds") + if('mean_temp' %in% names(data_for_clust_with_ids)){ data_for_clust_with_ids <- data_for_clust_with_ids |> rename(temp = mean_temp) diff --git a/downscale/02_extract_sipnet_output.R b/downscale/02_extract_sipnet_output.R index ee75f34..4a3100e 100644 --- a/downscale/02_extract_sipnet_output.R +++ b/downscale/02_extract_sipnet_output.R @@ -155,7 +155,7 @@ for (var in variables) { ens_arrays[[var]] <- arr } -save(ens_arrays, file = file.path(outdir, "efi_ens_arrays.RData")) +saveRDS(ens_arrays, file = file.path(outdir, "efi_ens_arrays.rds")) efi_long <- ens_results |> rename(datetime = time) |> From 46a61d58a7b0e43ba4f851b7ba0d427d112be362 Mon Sep 17 00:00:00 2001 From: David LeBauer Date: Mon, 17 Mar 2025 22:42:09 -0400 Subject: [PATCH 21/49] first draft of county aggregated SOC and AGB --- downscale/03_downscale_and_agregate.R | 207 
++++++++++++++++++-------- 1 file changed, 145 insertions(+), 62 deletions(-) diff --git a/downscale/03_downscale_and_agregate.R b/downscale/03_downscale_and_agregate.R index b14dc51..3ff321a 100644 --- a/downscale/03_downscale_and_agregate.R +++ b/downscale/03_downscale_and_agregate.R @@ -22,70 +22,68 @@ library(sf) library(terra) devtools::load_all(here::here("../pecan/modules/assim.sequential/")) # library(PEcAnAssimSequential) - +basedir <- "/projectnb/dietzelab/ccmmf/ccmmf_phase_1b_98sites_20reps_20250312" +outdir <- file.path(basedir, "out") options(readr.show_col_types = FALSE) -#' ## Get Site Level Outputs -#' -#' Next we read in site-level model outputs generated by SIPNETWOPET in 02_anchorsite_simulations.R. -#' -#' TODO: -#' - replace SIPNETWOPET outputs with SIPNET results following regional runs -#' -## ----------------------------------------------------------------------------- +library(furrr) +no_cores <- parallel::detectCores(logical = FALSE) +plan(multicore, workers = no_cores - 1) -ensemble_data <- readRDS("cache/ensemble_data.rds") +#' ## Get Site Level Outputs +ensemble_file <- file.path(outdir, "efi_ens_long.csv.gz") +ensemble_data <- readr::read_csv(ensemble_file) #' ### Random Forest using PEcAn downscale workflow -#' -#' ## ----------------------------------------------------------------------------- -site_coords <- readr::read_csv("data/ca_field_attributes.csv") |> - filter(pft == "woody perennial crop") |> - select(id, lon, lat, year) - -covariates <- load("data/data_for_clust_with_ids.rda") |> - get() # maybe there is a less convoluted way; maybe I just need to rename data_for_clust... - -covariates_points <- site_coords |> - left_join(covariates, by = "id") -covariates_sf <- covariates_points |> - sf::st_as_sf(coords = c("lon", "lat"), crs = "EPSG:4326") - -# Preprocess using renamed objects and updated ensemble data with date +design_points <- read_csv(here::here("data/design_points.csv")) |> + dplyr::distinct() + +covariates <- readRDS(here::here("data/data_for_clust_with_ids.rds")) |> + rename(site = id) |> + select( + site, where(is.numeric), + -ends_with("id") # drop crop_id, climregion_id columns + ) -####################### Start Here ############################ -# Find example inputs: `find "/projectnb/dietzelab/jploshay/" -name "*.rds"` -preprocessed <- SDA_downscale_preprocess( - ensemble_data = ensemble_data, - site_coords = site_coords, - date = "2020-01-01", - carbon_pool = "SOC" -) +d <- function(date, carbon_pool) { + filtered_ens_data <- subset_ensemble( + ensemble_data = ensemble_data, + site_coords = design_points, + date = date, + carbon_pool = carbon_pool + ) + # Downscale the data + downscale_output <- downscale( + ensemble_data = filtered_ens_data, + site_coords = design_points, + covariates = covariates, + model_type = "rf", + seed = 123 + ) + return(downscale_output) +} -## Next steps -debugonce(SDA_downscale) -## Stop at randomForest and see what is expected vs what is provided -## Error in model.frame.default(formula = formula, data = train_data, na.action = function (object, : -## invalid type (list) for variable 'ensemble1' +cpools <- c("TotSoilCarb", "AGB") +library(furrr) +plan(multisession) +downscale_output <- purrr::map( # not using furrr b/c it is used inside downscale + cpools, + ~ d(date = "2018-12-31", carbon_pool = .x) +) |> + purrr::set_names(cpools) -# Downscale the data -downscale_output <- SDA_downscale( - preprocessed = preprocessed, - carbon_pool = "SOC", - covariates = covariates_vect, - model_type = "rf", - seed = 
123 -) +## Save to make it easier to restart +save(downscale_output, file = here::here("cache/downscale_output.rda")) -metrics <- SDA_downscale_metrics(downscale_output, carbon_pool = "SOC") +metrics <- downscale_metrics(downscale_output) +# could compute stats here e.g. mean, CI for ea. metric print(metrics) - #' #' #' ## Aggregate to County Level @@ -94,25 +92,110 @@ print(metrics) library(sf) library(dplyr) -# Load CA county boundaries -# These are provided by Cal-Adapt as 'Areas of Interest' -county_boundaries <- st_read("data/counties.gpkg") -# check if attributes has county name -# Append county name to predicted table -grid_with_counties <- st_join(ca_grid, county_boundaries, join = st_intersects) +# ca_fields <- readr::read_csv(here::here("data/ca_field_attributes.csv")) |> +# dplyr::select(id, lat, lon) |> +# rename(site = id) + +ca_fields_full <- sf::read_sf(here::here("data/ca_fields.gpkg")) + +ca_fields <- ca_fields_full |> + select(site = id, county, area_ha) -# Calculate county-level mean, median, and standard deviation. -county_aggregates <- grid_with_counties |> - st_drop_geometry() |> # drop geometry for faster summarization - group_by(county_name) |> # replace with your actual county identifier +# Convert list to table with predictions and site identifier +get_downscale_preds <- function(downscale_output) { + purrr::map( + downscale_output$predictions, + ~ tibble(site = covariates$site, prediction = .x) + ) |> + bind_rows(.id = "ensemble") |> + left_join(ca_fields, by = "site") +} + +downscale_preds <- purrr::map(downscale_output, get_downscale_preds) |> + dplyr::bind_rows(.id = "carbon_pool") |> + # Convert kg / ha to tonne (Mg) / field level totals + # first convert scale + mutate(c_density = PEcAn.utils::ud_convert(prediction, "kg/m2", "Tg/ha")) |> + mutate(total_c = c_density * area_ha) + +ens_county_preds <- downscale_preds |> + # Now aggregate to get county level totals for each pool x ensemble + group_by(carbon_pool, county, ensemble) |> summarize( - mean_biomass = mean(predicted_biomass, na.rm = TRUE), - median_biomass = median(predicted_biomass, na.rm = TRUE), - sd_biomass = sd(predicted_biomass, na.rm = TRUE) + total_c = sum(total_c) + ) |> + arrange(carbon_pool, county, ensemble) + +county_summaries <- ens_county_preds |> + group_by(carbon_pool, county) |> + summarize( + n = n(), + mean_total_c = mean(total_c), + median_total_c = median(total_c), + sd_total_c = sd(total_c) + ) + +# Lets plot the results! 
+ +county_boundaries <- st_read(here::here("data/counties.gpkg")) |> + filter(state_name == "California") |> + select(name) + +co_preds_to_plot <- county_summaries |> + right_join(county_boundaries, by = c("county" = "name")) |> + arrange(county, carbon_pool) |> + pivot_longer( + cols = c(mean_total_c, median_total_c, sd_total_c), + names_to = "stat", + values_to = "value" + ) + +# now plot map of county-level predictions with total carbon +p <- purrr::map(cpools, function(pool) { + .p <- ggplot( + co_preds_to_plot |> filter(carbon_pool == pool), + aes(geometry = geom, fill = value) + ) + + geom_sf(data = county_boundaries, fill = "lightgrey", color = "black") + + geom_sf() + + scale_fill_viridis_c(option = "plasma") + + theme_minimal() + + labs( + title = paste0(pool, "-C by County"), + fill = "Total Carbon (Tg)" + ) + + facet_grid(~stat) + + ggsave( + plot = .p, + filename = here::here(paste0("county_total_", pool, ".png")), + width = 10, height = 5, + bg = "white" ) +return(.p) +}) + -print(county_aggregates) +# Load CA county boundaries +# # These are provided by Cal-Adapt as 'Areas of Interest' +# + +# # check if attributes has county name +# # Append county name to predicted table +# grid_with_counties <- st_join(ca_grid, county_boundaries, join = st_intersects) + +# # Calculate county-level mean, median, and standard deviation. +# county_aggregates <- grid_with_counties |> +# st_drop_geometry() |> # drop geometry for faster summarization +# group_by(county_name) |> # replace with your actual county identifier +# summarize( +# mean_biomass = mean(predicted_biomass, na.rm = TRUE), +# median_biomass = median(predicted_biomass, na.rm = TRUE), +# sd_biomass = sd(predicted_biomass, na.rm = TRUE) +# ) + +# print(county_aggregates) # For state-level, do the same but don't group_by county From 72c5d7ad2f79466d1956d80af288b78a3accd6b6 Mon Sep 17 00:00:00 2001 From: David LeBauer Date: Tue, 18 Mar 2025 16:13:17 -0400 Subject: [PATCH 22/49] added anchor sites table --- data_raw/anchor_sites.csv | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) create mode 100644 data_raw/anchor_sites.csv diff --git a/data_raw/anchor_sites.csv b/data_raw/anchor_sites.csv new file mode 100644 index 0000000..1caf342 --- /dev/null +++ b/data_raw/anchor_sites.csv @@ -0,0 +1,26 @@ +external_site_id,site_name,lat,lon,crops,pft +US-ASH,San Joaquin Valley Almond High Salinity,36.1697,-120.201,Almond,woody perennial crop +US-ASL,San Joaquin Valley Almond Low Salinity,36.9466,-120.1024,Almond,woody perennial crop +US-ASM,San Joaquin Valley Almond Medium Salinity,36.1777,-120.2026,Almond,woody perennial crop +US-PSH,San Joaquin Valley Pistachio High Salinity,36.2347,-119.9247,Pistachio,woody perennial crop +US-PSL,San Joaquin Valley Pistachio Low Salinity,36.8276,-120.1397,Pistachio,woody perennial crop +US-Bi1,Bouldin Island Alfalfa,38.0992,-121.4993,Alfalfa,hay and haylage crops +US-Bi2,Bouldin Island Corn,38.1091,-121.5351,Corn,row crops +US-Dea,UC Desert REC Alfalfa,32.8136,-115.4423,Alfalfa,hay and haylage crops +US-DS1,Staten Corn 1,38.1335,-121.539,Corn,row crops +US-DS2,Staten Corn 2,38.1375,-121.514,Corn,row crops +US-DS3,Staten Rice 1,38.1235,-121.549,Rice,row crops +US-Lin,Lindcove Orchard,36.3566,-119.0922,Oranges,woody perennial crop +US-RGB,Butte County Rice Farm,39.5782,-121.8579,Rice,row crops +US-RGF,Stanislaus Forage Farm,37.6995,-121.1369,Forage crops,hay and haylage crops +US-RGG,Glenn County Rice Farm,39.5944,-122.0253,Rice,row crops +US-RGO,Glenn County Organic 
Rice,39.6805,-122.0026,Organic rice,row crops +US-Si1,Staten Island Fallow,38.1747,-121.5047,Fallow,NA +US-Si2,Staten Island Flooded,38.1758,-121.5167,Rice,row crops +US-Tw2,Twitchell Corn,38.0969,-121.6365,Corn,row crops +US-Tw3,Twitchell Alfalfa,38.1152,-121.6469,Alfalfa,herbaceous perennial +US-Twt,Twitchell Island,38.1087,-121.6531,NA,NA +russell-ranch,Russell Ranch,38.543,-121.874,"tomato, corn, wheat",row crops +west-side,West Side Research Center,36.342,-120.108,Row crops including cotton,row crops +wolfskill,Wolfskill Experimental Orchards,38.503,-121.977,Fruit and nut breeding,woody perennial crop +ucr-citrus,UC Riverside Citrus Research,33.967,-117.34,Citrus variety collection,woody perennial crop From 0d1801d45e91ff071b1e1220ad2117f1a62f7148 Mon Sep 17 00:00:00 2001 From: David LeBauer Date: Fri, 21 Mar 2025 01:33:05 -0400 Subject: [PATCH 23/49] lots of clean up; too much to document, sorry --- _targets.R | 2 +- downscale/00-prepare.R | 560 +++++++----------- .../01_cluster_and_select_design_points.R | 435 +++++++------- downscale/02_extract_sipnet_output.R | 80 +-- downscale/03_downscale_and_agregate.R | 75 ++- 5 files changed, 521 insertions(+), 631 deletions(-) diff --git a/_targets.R b/_targets.R index 9082ca4..1c19c24 100644 --- a/_targets.R +++ b/_targets.R @@ -22,7 +22,7 @@ list( ), tar_target(simulations, { - source("downscale/02_design_point_simulations.R") + source("downscale/02_extract_sipnet_output.R") design_point_wide # output from 02-design_point_simulations.R }, deps = cluster_sites diff --git a/downscale/00-prepare.R b/downscale/00-prepare.R index a4b7107..cbeaa7c 100644 --- a/downscale/00-prepare.R +++ b/downscale/00-prepare.R @@ -27,200 +27,87 @@ #' ## ----------------------------------------------------------------------------- -options(repos = c( - pecanproject = 'https://pecanproject.r-universe.dev', - ropensci = 'https://ropensci.r-universe.dev', - CRAN = 'https://cloud.r-project.org')) - -# install.packages("PEcAn.all") -## Required until https://github.com/UCANR-IGIS/caladaptr/pull/3 is merged -# remotes::install_github("dlebauer/caladaptr") +# options(repos = c( +# pecanproject = 'https://pecanproject.r-universe.dev', +# ropensci = 'https://ropensci.r-universe.dev', +# CRAN = 'https://cloud.r-project.org')) + library(PEcAn.all) library(tidyverse) - -library(caladaptr) library(sf) library(terra) -## Required until PR 3423is merged https://github.com/PecanProject/pecan/pull/3423 -# check if PR is merged -devtools::load_all(here::here("../pecan/modules/data.land")) -## Check available compute resources -benchmarkme::get_ram() -ncpu <- benchmarkme::get_cpu()$no_of_cores -no_cores <- parallel::detectCores(logical = FALSE) -future_multi <- ifelse(future::supportsMulticore(), - future::plan(future::multicore), - future::plan(future::multisession)) -future::plan(future_multi, - workers = ncpu - 2) +# remotes::install_github("UCANR-IGIS/caladaptr") +library(caladaptr) +## Check available compute resources and set up parallel processing +no_cores <- parallel::detectCores(logical = FALSE) +future::plan(multicore, workers = no_cores - 2) +data_dir <- "/projectnb2/dietzelab/ccmmf/data" +raw_data_dir <- "/projectnb2/dietzelab/ccmmf/data_raw" +ca_albers_crs <- 3310 -#' -#' ## Organize Input Data -#' -#' ### Domain Polygons -#' -#' Here we are generating domain polygons that will be used for subsetting. -#' These are converted to convex hulls and simplified for computational efficiency. 
-#' The Yolo County domain is a smaller domain that can be used for testing and debugging. -#' -#' These include: -#' -#' - caladapt_domain_convex_hull: a convex hull for the Cal-Adapt domain -#' - ca_convex_hull: a convex hull around CA -#' - ca_state_polygon_simplified: a simplified convex hull for CA -#' - ca_state_polygon: a convex hull for CA -#' -#' - ca_convex_hull_reduced: a simplified convex hull for CA -#' - yolo_bbox: a smaller domain limited to Yolo County -#' -## ----eval = FALSE------------------------------------------------------------- -# # remotes::install_github("ucanr-igis/caladaptr") -# ## Cal-Adapt Domain -# caladapt_domain <- caladaptr::ca_aoipreset_geom("counties") |> -# sf::st_transform(4326) |> -# sf::st_union() |> -# sf::st_convex_hull() -# st_write(caladapt_domain, "data/caladapt_domain_convex_hull.geojson") -# -# ## California State -# ca_counties_polygons <- ca_aoipreset_geom("counties") |> -# dplyr::filter(state_name == "California") |> -# dplyr::select(state_name, county_name = name, geom) |> -# sf::st_transform(4326) -# -# ca_state_polygon <- ca_counties_polygons |> -# group_by(state_name) |> -# mutate(geom = sf::st_union(geom)) -# -# ca_state_polygon_simplified <- sf::st_simplify(ca_state_polygon, dTolerance = 5000) -# file.remove("data/ca_state_polygon_simplified.geojson") -# sf::st_write(ca_state_polygon_simplified, "data/ca_state_polygon_simplified.geojson") -# -# ## Yolo County -# yolo_county_polygon <- ca_counties_polygons |> -# filter(county_name=='Yolo') -# -# yolo_county_polygon_simplified <- sf::st_simplify(yolo_county_polygon, dTolerance = 5000) -# sf::st_write(yolo_county_polygon_simplified, "data/yolo_county_polygon_simplified.geojson") -# yolo_county_convex_hull <- sf::st_convex_hull(yolo_county_polygon_simplified) -# # check if it is sufficiently simple to avoid unnecessary computational expensse -# # st_coordinates(yolo_county_convex_hull) -# -# ca_state_polygon <- sf::st_read("data/ca_convex_hull_reduced.geojson") -# - -#' -## ----------------------------------------------------------------------------- -ca_state_polygon <- sf::st_read("data/ca_convex_hull_reduced.geojson") -yolo_county_polygon <- sf::st_read("data/yolo_county_polygon_simplified.geojson") - -#' #' ### LandIQ Woody Polygons #' -#' The first step is to convert LandIQ to a open, standard format. +#' The first step is to convert LandIQ to a open and standard (TBD) format. #' -#' We will use a GeoPackage file to store geospatial information and an associated CSV file with attributes. +#' We will use a GeoPackage file to store geospatial information and +#' associated CSV files to store attributes associated with the LandIQ fields. +#' +#' The `site_id` field is the unique identifier for each field. #' #' The landiq2std function will be added to the PEcAn.data.land package, and has been implemented in a Pull Request https://github.com/PecanProject/pecan/pull/3423. The function is a work in progress. Two key work to be done. First `landiq2std` does not currently perform all steps to get from the original LandIQ format to the standard format - some steps related to harmonizing LandIQ across years have been completed manually. Second, the PEcAn 'standard' for such data is under development as we migrate from a Postgres database to a more portable GeoPackage + CSV format. 
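#' As a rough illustration of this convention (a sketch only; the exact schema
#' is still being settled, and the file names are the ones used elsewhere in
#' this workflow, readable once the conversion step below has been run), the
#' geometry and attribute halves are meant to be rejoined on `site_id`:
# geometries keyed by site_id live in the GeoPackage;
# tabular attributes (crop, pft, area_ha, ...) live in the CSV
fields_geom  <- sf::st_read(file.path(data_dir, "ca_fields.gpkg"))
fields_attrs <- readr::read_csv(file.path(data_dir, "ca_field_attributes.csv"))
# columns present in both files (e.g. lat/lon) may need dropping before the join
fields <- fields_geom |>
  dplyr::left_join(fields_attrs, by = "site_id")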
#' -## ----eval=FALSE--------------------------------------------------------------- -input_file = 'data/i15_Crop_Mapping_2016_SHP/i15_Crop_Mapping_2016.shp' -output_gpkg = 'data/ca_fields.gpkg' -output_csv = 'data/ca_field_attributes.csv' - -landiq2std(input_file, output_gpkg, output_csv) +## Convert SHP to Geotiff` +## Required until PR 3423 is merged https://github.com/PecanProject/pecan/pull/3423 +# check if PR is merged +# devtools::install_github("dlebauer/pecan", +# ref = "shp2gpkg", +# subdir = "modules/data.land") + +devtools::load_all("../pecan/modules/data.land/") +input_file = file.path(raw_data_dir, 'i15_Crop_Mapping_2016_SHP/i15_Crop_Mapping_2016.shp') +ca_fields_gpkg <- file.path(data_dir, 'ca_fields.gpkg') +ca_attributes_csv = file.path(data_dir, 'ca_field_attributes.csv') +if(!file.exists(ca_fields_gpkg) & !file.exists(ca_attributes_csv)) { + landiq2std(input_file, ca_fields_gpkg, ca_attributes_csv) +} +ca_fields <- sf::st_read(ca_fields_gpkg) |> + sf::st_transform(crs = ca_albers_crs) + #' ##### Subset Woody Perennial Crop Fields -#' +#' #' Phase 1 focuses on Woody Perennial Crop fields. -#' +#' #' Next, we will subset the LandIQ data to only include woody perennial crop fields. #' At the same time we will calculate the total percent of California Croplands that are woody perennial crop. -#' -## ----------------------------------------------------------------------------- -ca_fields <- sf::st_read("data/ca_fields.gpkg") -ca_attributes <- readr::read_csv("data/ca_field_attributes.csv") - -ca <- ca_fields |> - dplyr::select(-lat, -lon) |> - dplyr::left_join(ca_attributes, by = "id") - -ca_woody <- ca |> - dplyr::filter(pft == "woody perennial crop") -sf::st_write(ca_woody, - "data/ca_woody.gpkg", delete_layer = TRUE) - -#' -#' Now, calculate percent of California croplands that are woody perennial crops, in order to estimate the -#' number of design points that will be selected in the clustering step. -#' -#' +#' ## ----------------------------------------------------------------------------- -system.time( -pft_area <- ca |> - dplyr::select(id, pft, area_ha) |> - dtplyr::lazy_dt() |> - dplyr::mutate(woody_indicator = ifelse(pft == "woody perennial crop", 1L, 0L)) |> - dplyr::group_by(woody_indicator) |> - dplyr::summarize(pft_area = sum(area_ha)) -) +ca_fields |> + filter(pft = = "woody perennial crop") |> + sf::st_transform(crs = ca_albers_crs) |> + dplyr::select(site_id, crop, pft, geom) |> + sf::st_write(file.path(data_dir, 'ca_woody.gpkg'), + delete_dsn = TRUE) +ca_attributes <- readr::read_csv(ca_attributes_csv) - -#' -#' ### Subset California Woody Crop Fields for development & testting -#' -#' Now, create a subset of the California Woody Crop Fields for development & testting -#' -## ----eval=FALSE--------------------------------------------------------------- -# set.seed(25) -# ca_woody_subset <- ca_woody |> -# dplyr::sample_n(200) -# -# sf::st_write(ca_woody_subset, -# "data/ca_woody_subset.gpkg", delete_layer = TRUE) - -#' +#' #' ### Convert Polygons to Points. -#' +#' #' For Phase 1, we will use points to query raster data. #' In later phases we will evaluate the performance of polygons and how querying environmental data using polygons will affect the performance of clustering and downscaling algorithms. 
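#' For reference, a polygon-based query would look roughly like the sketch
#' below. It uses the exactextractr package (not currently a dependency of this
#' workflow) to take an area-weighted mean of raster cells over each field
#' rather than sampling the centroid; `covariate_rast` is a placeholder for any
#' of the rasters used later in this script.
# Hypothetical polygon-based extraction (area-weighted mean per field);
# covariate_rast: a terra SpatRaster, ca_fields: the sf polygon layer read above
library(exactextractr)
field_means <- exactextractr::exact_extract(
  covariate_rast,
  ca_fields |> sf::st_transform(sf::st_crs(covariate_rast)),
  fun = "mean"
)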
-#' -## ----polygons-to-points------------------------------------------------------- -woody_gpkg <- "data/ca_woody.gpkg" # use **ca_woody_subset.gpkg** for development and testing +#' -# load fields, convert polygons to points, and subset -ca_woody_pts <- sf::st_read(woody_gpkg) |> +ca_fields_pts <- ca_fields |> + dplyr::select(-lat, -lon) |> + left_join(ca_attributes, by = "site_id") |> sf::st_centroid() |> # and keep only the columns we need - dplyr::select(id, crop, pft, geom) - - -#' -#' ## Anchor Sites -#' -## ----anchor-sites------------------------------------------------------------- -# Anchor sites from UC Davis, UC Riverside, and Ameriflux. -anchor_sites <- readr::read_csv("data/anchor_sites.csv") - -anchor_sites_pts <- anchor_sites |> - sf::st_as_sf(coords = c("lon", "lat"), crs = 4326) - -# Join with ca_fields: keep only the rows associated with anchor sites -# takes ~ 1 min on BU cluster w/ "Intel(R) Xeon(R) CPU E5-2670 0 @ 2.60GHz" -anchor_sites_with_ids <- anchor_sites_pts |> - # spatial join find ca_fields that contain anchor site points - sf::st_join(ca_fields, join = sf::st_within) - -sf::st_write(anchor_sites_with_ids |> - dplyr::select(id, lat, lon, location, site_name, crops, pft), - dsn = "data/anchor_sites_ids.csv", - delete_layer = TRUE) - + dplyr::select(site_id, crop, pft, geom) #' #' ## Environmental Covariates @@ -229,128 +116,105 @@ sf::st_write(anchor_sites_with_ids |> #' #' #### Load Prepared Soilgrids GeoTIFF #' -#' Using already prepared SoilGrids layers +#' Using already prepared SoilGrids layers. +#' TODO: move a copy of these files to data_dir #' ## ----load-soilgrids----------------------------------------------------------- soilgrids_north_america_clay_tif <- '/projectnb/dietzelab/dongchen/anchorSites/NA_runs/soil_nc/soilgrids_250m/clay/clay_0-5cm_mean/clay/clay_0-5cm_mean.tif' soilgrids_north_america_ocd_tif <- '/projectnb/dietzelab/dongchen/anchorSites/NA_runs/soil_nc/soilgrids_250m/ocd/ocd_0-5cm_mean/ocd/ocd_0-5cm_mean.tif' ## if we want to clip to CA -## use terra to read in that file and clip to california -# soilgrids_california <- terra::crop(soilgrids_north_america, yolo_bbox) +## use terra to read in that file and then extract values for each location -# read in the file -# read two layers soilgrids_north_america_clay_rast <- terra::rast(soilgrids_north_america_clay_tif) soilgrids_north_america_ocd_rast <- terra::rast(soilgrids_north_america_ocd_tif) - #' -#' #### Extract clay from SoilGrids +#' #### Extract clay and carbon stock from SoilGrids #' ## ----sg-clay-ocd-------------------------------------------------------------- -clay <- (terra::extract(soilgrids_north_america_clay_rast, terra::vect(ca_woody_pts)) |> - dplyr::select(-ID)) |> - dplyr::pull()/ 10 +clay <- terra::extract( + soilgrids_north_america_clay_rast, + terra::vect(ca_fields_pts |> + sf::st_transform(crs = sf::st_crs(soilgrids_north_america_clay_rast)))) |> + dplyr::select(-ID) |> + dplyr::pull() / 10 -ocd <- terra::extract(soilgrids_north_america_ocd_rast, terra::vect(ca_woody_pts)) |> +ocd <- terra::extract( + soilgrids_north_america_ocd_rast, + terra::vect(ca_fields_pts |> + sf::st_transform(crs = sf::st_crs(soilgrids_north_america_ocd_rast)))) |> dplyr::select(-ID) |> dplyr::pull() -ca_woody_pts_clay_ocd <- cbind(ca_woody_pts, +ca_fields_pts_clay_ocd <- cbind(ca_fields_pts, clay = clay, ocd = ocd) -required_cols <- c("id", "crop", "pft", "clay", "ocd", "geom") -assertthat::assert_that( - all(required_cols %in% colnames(ca_woody_pts_clay_ocd)), - msg = 
"ca_woody_pts_clay_ocd is missing expected columns") - - #' #' ### Topographic Wetness Index #' ## ----twi---------------------------------------------------------------------- twi_tiff <- '/projectnb/dietzelab/dongchen/anchorSites/downscale/TWI/TWI_resample.tiff' -twi_rast <- terra::rast(twi_tiff) +twi_rast <- terra::rast(twi_tiff) -twi <- terra::extract(twi_rast, vect(ca_woody_pts)) |> - select(-ID) |> +twi <- terra::extract( + twi_rast, + terra::vect(ca_fields_pts |> + sf::st_transform(crs = sf::st_crs(twi_rast)))) |> + dplyr::select(-ID) |> dplyr::pull() -ca_woody_pts_clay_ocd_twi <- cbind(ca_woody_pts_clay_ocd, twi = twi) - +ca_fields_pts_clay_ocd_twi <- cbind(ca_fields_pts_clay_ocd, twi = twi) #' -#' ### Cal-Adapt Climate Regions -#' -## ----caladapt_climregions----------------------------------------------------- - -ca_albers_crs <- 3310 # use California Albers project (EPSG:3310) for speed, - # and so units are in meters - # required that crs(x) == crs(y) for st_join - -ca_climregions <- caladaptr::ca_aoipreset_geom("climregions") |> - sf::st_transform(crs = ca_albers_crs) |> - dplyr::rename(climregion_id = id, - climregion_name = name) -saveRDS(ca_climregions, file = "data/ca_climregions.rds") - -#' -## ----join_climregions--------------------------------------------------------- -ca_woody_pts_clay_ocd_twi_cr <- ca_woody_pts_clay_ocd_twi |> - sf::st_transform(., crs = ca_albers_crs) |> - sf::st_join(ca_climregions, join = st_intersects, left = TRUE) - -# convenience cache. -saveRDS(ca_woody_pts_clay_ocd_twi_cr, file = "cache/ca_woody_pts_clay_ocd_twi_cr.rda") - -#' -#' ### GridMet +#' ### ERA5 Met Data #' ## ----------------------------------------------------------------------------- -gridmet_dir <- "/projectnb/dietzelab/dongchen/anchorSites/NA_runs/GridMET/" +era5met_dir <- "/projectnb/dietzelab/dongchen/anchorSites/NA_runs/GridMET/" # List all ERA5_met_*.tiff files for years 2012-2021 raster_files <- list.files( - path = gridmet_dir, + path = era5met_dir, pattern = "^ERA5_met_\\d{4}\\.tiff$", full.names = TRUE ) # Read all rasters into a list of SpatRaster objects -rasters_list <- map( +rasters_list <- purrr::map( raster_files, - ~ rast(.x)) + ~ terra::rast(.x)) -years <- map_chr(rasters_list, ~ { +years <- purrr::map_chr(rasters_list, ~ { source_path <- terra::sources(.x)[1] - str_extract(source_path, "\\d{4}") + stringr::str_extract(source_path, "\\d{4}") }) |> as.integer() names(rasters_list) <- years extract_clim <- function(raster, points_sf) { - terra::extract(raster, points_sf) |> + terra::extract( + raster, + points_sf |> + sf::st_transform(crs = sf::st_crs(raster))) |> tibble::as_tibble() |> select(-ID) |> - mutate(id = points_sf$id) |> - select(id, temp, prec, srad, vapr) + mutate(site_id = points_sf$site_id) |> + select(site_id, temp, prec, srad, vapr) } .tmp <- rasters_list |> furrr::future_map_dfr( - ~ extract_clim(.x, ca_woody_pts), + ~ extract_clim(.x, ca_fields_pts), .id = "year", .options = furrr::furrr_options(seed = 123)) - clim_summaries <- .tmp |> dplyr::mutate( precip = PEcAn.utils::ud_convert(prec, "second-1", "year-1") ) |> - dplyr::group_by(id) |> + dplyr::group_by(site_id) |> dplyr::summarise( temp = mean(temp), precip = mean(precip), @@ -358,129 +222,153 @@ clim_summaries <- .tmp |> vapr = mean(vapr) ) -#' -#' ## Prepare Dataset for Clustering -#' -#' First, we will turn crop names into IDs to support hierarchical clustering -#' -## ----------------------------------------------------------------------------- -crop_ids <- ca_woody_pts |> - 
distinct(crop) |> - mutate(crop_id = as.integer(as.factor(crop))) |> - write_csv("data/crop_ids.csv") - #' ## ----join_and_subset---------------------------------------------------------- .all <- clim_summaries |> - dplyr::left_join(ca_woody_pts_clay_ocd_twi_cr, by = "id") |> - dplyr::left_join(crop_ids, by = "crop") + dplyr::left_join(ca_fields_pts_clay_ocd_twi, by = "site_id") + +assertthat::assert_that( + nrow(.all) == nrow(clim_summaries) && + nrow(.all) == nrow(ca_fields_pts_clay_ocd_twi), + msg = "join was not 1:1 as expected" +) + +#' Append CA Climate Region +#' +## Add Climregions +# load climate regions for mapping +#' ### Cal-Adapt Climate Regions +#' +#' Climate Region will be used as a factor +#' in the hierarchical clustering step. +## ----caladapt_climregions----------------------------------------------------- -assertthat::assert_that(nrow(.all) == nrow(clim_summaries) && nrow(.all) == nrow(ca_woody_pts_clay_ocd_twi_cr), - msg = "join was not 1:1 as expected") +ca_field_climregions <- ca_fields |> + sf::st_join( + caladaptr::ca_aoipreset_geom("climregions") |> + sf::st_transform(crs = ca_albers_crs), + join = sf::st_within + ) |> + dplyr::select( + site_id, + climregion_id = id, + climregion_name = name + ) -glimpse(.all) -skimr::skim(.all) +# This returns a point geometry. +# To return the **polygon** geometry from ca_fields, +# drop geometry from .all instead of from ca_field_climregions +.all2 <- .all |> + dplyr::left_join( + ca_field_climregions |> st_drop_geometry(), + by = "site_id" + ) -data_for_clust_with_ids <- .all |> - #dplyr::select(-c(climregion_name)) |> +site_covariates <- .all2 |> na.omit() |> - mutate(across(where(is.numeric), ~ signif(., digits = 3))) + mutate(across(where(is.numeric), ~ signif(., digits = 3))) + +PEcAn.logger::logger.info( + round(100 * (1 - nrow(site_covariates) / nrow(ca_fields)), 0), "% of LandIQ polygons (sites) have at least one missing environmental covariate" +) + +# takes a long time +# knitr::kable(skimr::skim(site_covariates)) -saveRDS(data_for_clust_with_ids, "data/data_for_clust_with_ids.rds") +readr::write_csv(site_covariates, file.path(data_dir, "site_covariates.csv")) # Final output for targets; if not in targets, suppress return if (exists("IN_TARGETS") && IN_TARGETS) { - data_for_clust_with_ids + site_covariates } else { - invisible(data_for_clust_with_ids) + invisible(site_covariates) } -#' -#' #' +#' ## Anchor Sites +#' +## ----anchor-sites------------------------------------------------------------- +# Anchor sites from UC Davis, UC Riverside, and Ameriflux. +anchor_sites_pts <- readr::read_csv("data_raw/anchor_sites.csv") |> + sf::st_as_sf(coords = c("lon", "lat"), crs = 4326) |> + sf::st_transform(crs = ca_albers_crs) + +# Join with ca_fields: keep only the rows associated with anchor sites +# spatial join find ca_fields that contain anchor site points +# takes ~ 1 min on BU cluster w/ "Intel(R) Xeon(R) CPU E5-2670 0 @ 2.60GHz" + +# First subset ca_fields to only include those with covariates +# (approx. 
) + +ca_fields_with_covariates <- ca_fields |> + dplyr::right_join(site_covariates |> select(site_id), by = "site_id") + +anchor_sites_with_ids <- anchor_sites_pts |> + sf::st_join(ca_fields_with_covariates, + join = sf::st_within + ) + +# Handle unmatched anchor sites +unmatched_anchor_sites <- anchor_sites_with_ids |> + dplyr::filter(is.na(site_id)) +matched_anchor_sites <- anchor_sites_with_ids |> + dplyr::filter(!is.na(site_id)) + +if (nrow(unmatched_anchor_sites) > 0) { + # Find nearest indices + nearest_indices <- sf::st_nearest_feature(unmatched_anchor_sites, ca_fields) + + # Get nearest ca_fields + nearest_ca_fields <- ca_fields |> dplyr::slice(nearest_indices) + + # Assign site_id and calculate distances + unmatched_anchor_sites <- unmatched_anchor_sites |> + dplyr::mutate( + site_id = nearest_ca_fields$site_id, + lat = nearest_ca_fields$lat, + lon = nearest_ca_fields$lon, + distance_m = sf::st_distance(geometry, nearest_ca_fields, by_element = TRUE) + ) + threshold <- units::set_units(250, "m") + if (any(unmatched_anchor_sites$distance_m > threshold)) { + PEcAn.logger::logger.warn( + "The following anchor sites are more than 250 m away from the nearest landiq field:", + paste(unmatched_anchor_sites |> filter(distance_m > threshold) |> pull(site_name), collapse = ", "), + "Please check the distance_m column in the unmatched_anchor_sites data.", + "Consider dropping these sites or expanding the threshold." + ) + } + + # Combine matched and unmatched anchor sites + anchor_sites_with_ids <- dplyr::bind_rows( + matched_anchor_sites, + unmatched_anchor_sites |> select(-distance_m) + ) +} + +# Check for missing site_id, lat, or lon +if (any(is.na(anchor_sites_with_ids |> select(site_id, lat, lon)))) { + PEcAn.logger::logger.warn( + "Some anchor sites **still** have missing site_id, lat, or lon!" + ) +} + +# Save processed anchor sites +anchor_sites_with_ids |> + dplyr::select(site_id, external_site_id, site_name, lat, lon, crops, pft) |> + readr::write_csv(file.path(data_dir, "anchor_sites_ids.csv")) + +if(anchor_sites_with_ids |> + left_join(site_covariates, by = "site_id") |> + dplyr::select( + site_id, lat, lon, + clay, ocd, twi, temp, precip + ) |> + filter(if_any( + everything(), + ~ is.na(.x) + )) |> nrow() > 0) { + PEcAn.logger::logger.warn( + "Some anchor sites have missing environmental covariates!" + ) +} diff --git a/downscale/01_cluster_and_select_design_points.R b/downscale/01_cluster_and_select_design_points.R index 2d24e5f..5b7ffdd 100644 --- a/downscale/01_cluster_and_select_design_points.R +++ b/downscale/01_cluster_and_select_design_points.R @@ -26,38 +26,28 @@ library(terra) # parallel computing library(cluster) library(factoextra) -library(pathviewr) +library(pathviewr) #??? 
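# pathviewr provides find_curve_elbow(), used below to pick the number of
# clusters from the elbow of the within-cluster sum-of-squares curve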
library(furrr) library(doParallel) library(dplyr) +library(caladaptr) # to plot climate regions + # Set up parallel processing with a safe number of cores no_cores <- parallel::detectCores(logical = FALSE) plan(multicore, workers = no_cores - 2) options(future.globals.maxSize = benchmarkme::get_ram() * 0.9) +ca_albers_crs <- 3310 # use California Albers project (EPSG:3310) for speed, -# load climate regions for mapping -ca_climregions <- readRDS("data/ca_climregions.rds") -# environmental covariates -data_for_clust_with_ids <- readRDS("data/data_for_clust_with_ids.rds") - -if('mean_temp' %in% names(data_for_clust_with_ids)){ - data_for_clust_with_ids <- data_for_clust_with_ids |> - rename(temp = mean_temp) - PEcAn.logger::logger.warn("you should", - "change mean_temp --> temp in data_for_clust_with_ids", - "when it is created in 00-prepare.qmd and then delete", - "this conditional chunk") -} - -#' -#' ## Load Site Environmental Data -#' +data_dir <- "/projectnb/dietzelab/ccmmf/data" +#' +#' ## Load Site Environmental Data Covariates +#' #' Environmental data was pre-processed in the previous workflow 00-prepare.qmd. -#' +#' #' Below is a sumary of the covariates dataset -#' -#' - id: Unique identifier for each polygon +#' +#' - site_id: Unique identifier for each polygon #' - temp: Mean Annual Temperature from ERA5 #' - precip: Mean Annual Precipitation from ERA5 #' - srad: Solar Radiation @@ -65,51 +55,26 @@ if('mean_temp' %in% names(data_for_clust_with_ids)){ #' - clay: Clay content from SoilGrids #' - ocd: Organic Carbon content from SoilGrids #' - twi: Topographic Wetness Index -#' - crop_id: identifier for crop type, see table in crop_ids.csv -#' - climregion_id: Climate Regions as defined by CalAdapt identifier for climate region, see table in climregion_ids.csv -#' -#' + +site_covariates_csv <- file.path(data_dir, "site_covariates.csv") +site_covariates <- readr::read_csv(site_covariates_csv) + #' ## Anchor Site Selection -#' +#' #' Load Anchor Sites from UC Davis, UC Riverside, and Ameriflux. 
-#' +#' ## ----anchor-sites-selection--------------------------------------------------- -# set coordinate reference system, local and in meters for faster joins -ca_albers_crs <- 3310 # California Albers EPSG +anchor_sites_with_ids <- readr::read_csv(file.path(data_dir, "anchor_sites_ids.csv")) -anchor_sites_pts <- readr::read_csv("data/anchor_sites.csv") |> +anchor_sites_pts <- anchor_sites_with_ids |> sf::st_as_sf(coords = c("lon", "lat"), crs = 4326) |> - sf::st_transform(crs = ca_albers_crs) |> - dplyr::mutate(pt_geometry = geometry) |> - rename(anchor_site_pft = pft) - -# ca_woody <- sf::st_read("data/ca_woody.gpkg") - -ca_fields <- sf::st_read("data/ca_fields.gpkg") |> - # must use st_crs(anchor_sites_pts) b/c !identical(ca_albers_crs, st_crs(ca_albers_crs)) - sf::st_transform(crs = st_crs(anchor_sites_pts)) |> - rename(landiq_pft = pft) - -# Get the index of the nearest polygon for each point -nearest_idx <- st_nearest_feature(anchor_sites_pts, ca_fields) -site_field_distances <- diag(st_distance(anchor_sites_pts, ca_fields |> slice(nearest_idx))) -ca_field_ids <- ca_fields |> - dplyr::slice(nearest_idx) |> - dplyr::select(id, lat, lon) - -anchor_sites_ids <- dplyr::bind_cols( - anchor_sites_pts, - ca_field_ids, - distance = site_field_distances -) |> - dplyr::select(id, lat, lon, location, site_name, distance) #,anchor_site_pft, landiq_pft) - -anchor_sites_ids |> - readr::write_csv("data/anchor_sites_ids.csv") + sf::st_transform(crs = ca_albers_crs) + # create map of anchor sites -anchor_sites_ids |> - sf::st_transform(., crs = ca_albers_crs) |> +ca_climregions <- caladaptr::ca_aoipreset_geom("climregions") |> + rename(climregion_name = name, climregion_id = id) +p <- anchor_sites_pts |> ggplot() + geom_sf(data = ca_climregions, aes(fill = climregion_name), alpha = 0.25) + labs(color = "Climate Region") + @@ -117,40 +82,45 @@ anchor_sites_ids |> scale_color_brewer(palette = "Dark2") + labs(color = "PFT") + theme_minimal() +ggsave(p, filename = "downscale/figures/anchor_sites.png", dpi = 300, bg = "white") -woody_anchor_sites <- anchor_sites_pts |> - dplyr::filter(pft == "woody perennial crop") anchorsites_for_clust <- - data_for_clust_with_ids |> - dplyr::filter(id %in% woody_anchor_sites$id) - -message("Anchor sites included in final selection:") -knitr::kable(woody_anchor_sites |> dplyr::left_join(anchorsites_for_clust, by = 'id')) - -#' + anchor_sites_with_ids |> + select(-pft) |> # for consistency, only keep pfts from site_covariates + left_join(site_covariates, by = 'site_id') + +#' #' ### Subset LandIQ fields for clustering -#' +#' #' The following code does: #' - Read in a dataset of site environmental data #' - Removes anchor sites from the dataset that will be used for clustering -#' - Subsample the dataset - 80GB RAM too small to cluster 100k rows -#' - Bind anchor sites back to the dataset -#' +#' - Subsample the dataset - 136GB RAM is insufficient to cluster 100k rows +#' - Bind anchor sites back to the dataset +#' ## ----subset-for-clustering---------------------------------------------------- -set.seed(42) # Set seed for random number generator for reproducibility -# subsample for testing (full dataset exceeds available Resources) -sample_size <- 20000 - -data_for_clust <- data_for_clust_with_ids |> - # remove anchor sites - dplyr::filter(!id %in% anchorsites_for_clust$id) |> - sample_n(sample_size - nrow(anchorsites_for_clust)) |> - # row bind anchorsites_for_clust - bind_rows(anchorsites_for_clust) |> - dplyr::mutate(crop = factor(crop), - climregion_id = 
factor(climregion_id)) +set.seed(42) # Set seed for random number generator for reproducibility +# 10k works +# 2k sufficient for testing +sample_size <- 10000 + +data_for_clust <- site_covariates |> + # remove anchor sites + dplyr::filter(!site_id %in% anchorsites_for_clust$site_id) |> + # subset to woody perennial crops + # dplyr::filter(pft == "woody perennial crop") |> + # dplyr::mutate(pft = ifelse(pft == "woody perennial crop", "woody perennial crop", "other")) |> + sample_n(sample_size - nrow(anchorsites_for_clust)) |> + # now add anchor sites back + bind_rows(anchorsites_for_clust) |> + dplyr::mutate( + crop = factor(crop), + climregion_name = factor(climregion_name) + ) |> + select(-lat, -lon) assertthat::assert_that(nrow(data_for_clust) == sample_size) -assertthat::assert_that('temp'%in% colnames(data_for_clust)) + +PEcAn.logger::logger.info("Summary of data for clustering before scaling:") skimr::skim(data_for_clust) #' @@ -163,54 +133,80 @@ skimr::skim(data_for_clust) #' ## ----k-means-clustering-function---------------------------------------------- -perform_clustering <- function(data) { +perform_clustering <- function(data, k_range = 2:20) { # Select numeric variables for clustering - clust_data <- data |> select(where(is.numeric)) - + clust_data <- data |> + select(where(is.numeric), -ends_with("id")) + + PEcAn.logger::logger.info( + "Columns used for clustering: ", + paste(names(clust_data), collapse = ", ") + ) # Standardize data clust_data_scaled <- scale(clust_data) - + gc() # free up memory + PEcAn.logger::logger.info("Summary of scaled data used for clustering:") + print(skimr::skim(clust_data_scaled)) + # Determine optimal number of clusters using elbow method - k_range <- 3:12 - tot.withinss <- future_map_dbl(k_range, function(k) { - model <- hkmeans(clust_data_scaled, k) - model$tot.withinss - }, .options = furrr_options(seed = TRUE)) - - # Find elbow point - elbow_df <- data.frame(k = k_range, tot.withinss = tot.withinss) - optimal_k <- find_curve_elbow(elbow_df) - message("Optimal number of clusters determined: ", optimal_k) - - # Plot elbow method results - elbow_plot <- ggplot(elbow_df, aes(x = k, y = tot.withinss)) + - geom_line() + - geom_point() + - labs(title = "Elbow Method for Optimal k", x = "Number of Clusters", y = "Total Within-Cluster Sum of Squares") - print(elbow_plot) - - # Compute silhouette scores to validate clustering quality - silhouette_scores <- future_map_dbl(k_range, function(k) { - model <- hkmeans(clust_data_scaled, k) - mean(silhouette(model$cluster, dist(clust_data_scaled))[, 3]) - }, .options = furrr_options(seed = TRUE)) - - silhouette_df <- data.frame(k = k_range, silhouette = silhouette_scores) - - message("Silhouette scores computed. 
Higher values indicate better-defined clusters.") - print(silhouette_df) - - silhouette_plot <- ggplot(silhouette_df, aes(x = k, y = silhouette)) + - geom_line(color = "red") + - geom_point(color = "red") + - labs(title = "Silhouette Scores for Optimal k", x = "Number of Clusters", y = "Silhouette Score") - print(silhouette_plot) + metrics_list <- furrr::future_map( + k_range, + function(k) { + model <- hkmeans(clust_data_scaled, k) + total_withinss <- model$tot.withinss + sil_score <- mean(silhouette(model$cluster, dist(clust_data_scaled))[, 3]) + # dunn_index <- + # calinski_harabasz <- + list(model = model, total_withinss = total_withinss, sil_score = sil_score) + }, + .options = furrr_options(seed = TRUE) + ) + # extract metrics + metrics_df <- data.frame( + # see also https://github.com/PecanProject/pecan/blob/b5322a0fc62760b4981b2565aabafc07b848a699/modules/assim.sequential/inst/sda_backup/bmorrison/site_selection/pick_sda_sites.R#L221 + k = k_range, + tot.withinss = map_dbl(metrics_list, "total_withinss"), + sil_score = map_dbl(metrics_list, "sil_score") +# dunn_index = map_dbl(metrics_list, "dunn_index") +# calinski_harabasz = map_dbl(metrics_list, "calinski_harabasz") + ) + + elbow_k <- find_curve_elbow( + metrics_df[, c("k", "tot.withinss")], + export_type = "k" # default uses row number instead of k + )["k"] + +## TODO check other metrics (b/c sil and elbow disagree) +# other metrics +# sil_k <- metrics_df$k[which.max(metrics_df$sil_score)] +# dunn_k <- metrics_df$k[which.max(metrics_df$dunn_index)] +# calinski_harabasz_k <- metrics_df$k[which.max(metrics_df$calinski_harabasz)] + + txtplot::txtplot( + x = metrics_df$k, y = metrics_df$tot.withinss, + xlab = "k (number of clusters)", + ylab = "SS(Within)" + ) + PEcAn.logger::logger.info( + "Optimal number of clusters according to Elbow Method: ", elbow_k, + "(where the k vs ss(within) curve starts to flatten.)" + ) + + PEcAn.logger::logger.info("Silhouette scores computed. 
Higher values indicate better-defined clusters.") + txtplot::txtplot( + x = metrics_df$k, y = metrics_df$sil_score, + xlab = "Number of Clusters (k)", ylab = "Score" + ) # Perform hierarchical k-means clustering with optimal k - final_hkmeans <- hkmeans(clust_data_scaled, optimal_k) - data$cluster <- final_hkmeans$cluster + final_hkmeans <- hkmeans(clust_data_scaled, elbow_k) + clust_data <- cbind( + site_id = data$site_id, + clust_data, + cluster = final_hkmeans$cluster + ) - return(data) + return(clust_data) } #' @@ -218,43 +214,51 @@ perform_clustering <- function(data) { #' ## ----clustering, eval=FALSE--------------------------------------------------- # -# data_clustered <- perform_clustering(data_for_clust) -# save(data_clustered, file = "cache/data_clustered.rda") +sites_clustered <- perform_clustering(data_for_clust, k = 5:15) -#' +#' #' ### Check Clustering -#' +#' ## ----check-clustering--------------------------------------------------------- -load("cache/data_clustered.rda") # Summarize clusters -cluster_summary <- data_clustered |> - group_by(cluster) |> - summarise(across(where(is.numeric), mean, na.rm = TRUE)) -if('mean_temp' %in% names(cluster_summary)){ - cluster_summary <- cluster_summary |> - rename(temp = mean_temp) - PEcAn.logger::logger.warn("you should", - "change mean_temp --> temp in cluster_summary", - "when it is created upstream and then delete this", - "conditional chunk") -} -# use ggplot to plot all pairwise numeric variables +cluster_summary <- sites_clustered |> + group_by(cluster) |> + summarise(across(where(is.numeric), \(x) mean(x, na.rm = TRUE))) + +knitr::kable(cluster_summary, digits = 0) +# ANOVA based variable importance +anova_results <- sites_clustered |> + select(where(is.numeric)) |> + mutate(cluster = as.factor(cluster)) |> + aov(cluster ~ ., data = .) 
|> + broom::tidy() + +# Plot all pairwise numeric variables library(GGally) -data_clustered |> +ggpairs_plot <- sites_clustered |> + select(-site_id, -crop, -climregion_id) |> + # need small # pfts for ggpairs + mutate(pft = ifelse(pft == "woody perennial crop", "woody perennial crop", "other")) |> sample_n(1000) |> - ggpairs(columns=c(1,2,4,5,6)+1, - mapping = aes(color = as.factor(cluster), alpha = 0.8))+ + ggpairs( + columns = c(1, 2, 4, 5, 6) + 1, + mapping = aes(color = as.factor(cluster), alpha = 0.8) + ) + theme_minimal() +ggsave(ggpairs_plot, + filename = "downscale/figures/cluster_pairs.png", + dpi = 300, width = 10, height = 10, units = "in" +) -ggplot(data = cluster_summary, aes(x = cluster)) + +cluster_plot <- ggplot(data = cluster_summary, aes(x = cluster)) + geom_line(aes(y = temp, color = "temp")) + geom_line(aes(y = precip, color = "precip")) + geom_line(aes(y = clay, color = "clay")) + geom_line(aes(y = ocd, color = "ocd")) + geom_line(aes(y = twi, color = "twi")) + labs(x = "Cluster", y = "Value", color = "Variable") - +ggsave(cluster_plot, filename = "downscale/figures/cluster_summary.png", dpi = 300) knitr::kable(cluster_summary |> round(0)) @@ -277,13 +281,15 @@ climregion_ids <- read_csv("data/climregion_ids.csv", )) factor_stratification <- list( - crop_id = table(data_clustered$cluster, data_clustered$crop), - climregion_id = table(data_clustered$cluster, data_clustered$climregion_name)) + crop_id = table(sites_clustered$cluster, sites_clustered$crop), + climregion_id = table(sites_clustered$cluster, sites_clustered$climregion_name)) lapply(factor_stratification, knitr::kable) # Shut down parallel backend plan(sequential) + + #' #' ## Design Point Selection #' @@ -292,51 +298,60 @@ plan(sequential) #' #' For the final high resolution runs we expect to use approximately 10,000 design points. #' For woody croplands, we will start with a number proportional to the total number of sites with woody perennial pfts. +#' #' + +#' +#' ### How Many Design Points? +#' +#' Calculating Woody Cropland Proportion +#' +#' Here we calculate percent of California croplands that are woody perennial crops, +#' in order to estimate the number of design points that will be selected in the clustering step +## ----woody-proportion--------------------------------------------------------- +ca_attributes <- read_csv(file.path(data_dir, "ca_field_attributes.csv")) +ca_fields <- sf::st_read(file.path(data_dir, "ca_fields.gpkg")) +pft_area <- ca_fields |> + left_join(ca_attributes, by = "site_id") |> + dplyr::select(site_id, pft, area_ha) |> + dtplyr::lazy_dt() |> + dplyr::mutate(woody_indicator = ifelse(pft == "woody perennial crop", 1L, 0L)) |> + dplyr::group_by(woody_indicator) |> + dplyr::summarize(pft_area = sum(area_ha)) |> + # calculate percent of total area + dplyr::mutate(pft_area_pct = pft_area / sum(pft_area) * 100) + +knitr::kable(pft_area, digits = 0) +# answer: 17% of California croplands were woody perennial crops in the +# 2016 LandIQ dataset +# So ... if we want to ultimately have 2000 design points, we should have ~ 400 +# design points for woody perennial crops + + ## ----design-point-selection--------------------------------------------------- # From the clustered data, remove anchor sites to avoid duplicates in design point selection. 
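# The draw below is a simple random sample of non-anchor sites. A
# cluster-stratified variant (a sketch only, not used in this run) would
# instead allocate design points across clusters in proportion to cluster
# size, so that every cluster is represented; counts are approximate due
# to rounding:
# n_design_points <- 100
# design_points_strat <- sites_clustered |>
#   dplyr::filter(!site_id %in% anchorsites_for_clust$site_id) |>
#   dplyr::group_by(cluster) |>
#   dplyr::slice_sample(prop = n_design_points / nrow(sites_clustered)) |>
#   dplyr::ungroup() |>
#   dplyr::select(site_id)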
if(!exists("ca_fields")) { - ca_fields <- sf::st_read("data/ca_fields.gpkg") + ca_fields <- sf::st_read(file.path(data_dir, "ca_fields.gpkg")) } -missing_anchor_sites <- woody_anchor_sites|> - as_tibble()|> - left_join(ca_fields, by = 'id') |> - filter(is.na(id)) |> - select(location, site_name, geometry) - -if(nrow(missing_anchor_sites) > 0){ - woody_anchor_sites <- woody_anchor_sites |> - drop_na(lat, lon) - # there is an anchor site that doesn't match the ca_fields; - # need to check on this. For now we will just remove it from the dataset. - PEcAn.logger::logger.warn( - "The following site(s) aren't within DWR crop fields:", - knitr::kable(missing_anchor_sites) - ) - PEcAn.logger::logger.info( - "Check the sf::st_nearest_feature join at the beginning of this script" - ) -} - set.seed(2222222) -design_points_ids <- data_clustered |> - filter(!id %in% woody_anchor_sites$id) |> - select(id) |> - sample_n(100 - nrow(woody_anchor_sites)) |> - select(id) +design_points_ids <- sites_clustered |> + filter(!site_id %in% anchorsites_for_clust$site_id) |> + select(site_id) |> + sample_n(100 - nrow(anchorsites_for_clust)) |> + select(site_id) -anchor_site_ids <- woody_anchor_sites |> - select(id) +anchor_site_ids <- anchorsites_for_clust |> + select(site_id) -final_design_points <- bind_rows(design_points_ids, +design_points <- bind_rows(design_points_ids, anchor_site_ids) |> - left_join(ca_fields, by = "id") + left_join(ca_fields, by = "site_id") -final_design_points |> +design_points |> as_tibble() |> - select(id, lat, lon) |> + select(site_id, lat, lon) |> write_csv("data/design_points.csv") @@ -348,59 +363,21 @@ final_design_points |> ## ----design-point-map--------------------------------------------------------- # plot map of california and climregions -final_design_points_clust <- final_design_points |> - left_join(data_clustered, by = "id") |> - select(id, lat, lon, cluster) |> +design_points_clust <- design_points |> + left_join(sites_clustered, by = "site_id") |> + select(site_id, lat, lon, cluster) |> drop_na(lat, lon) |> mutate(cluster = as.factor(cluster)) |> st_as_sf(coords = c("lon", "lat"), crs = 4326) ca_fields_pts <- ca_fields |> st_as_sf(coords = c("lon", "lat"), crs = 4326) -ggplot() + - geom_sf(data = ca_climregions, aes(fill = climregion_name), alpha = 0.5) + + +design_pt_plot <- ggplot() + + geom_sf(data = ca_climregions, aes(fill = climregion_name), alpha = 0.75) + labs(color = "Climregion") + theme_minimal() + - geom_sf(data = final_design_points_clust, aes(shape = cluster)) + - geom_sf(data = ca_fields_pts, fill = 'black', color = "grey", alpha = 0.5) - - -#' -#' ## Woody Cropland Proportion -#' -#' Here we calculate percent of California croplands that are woody perennial crops, in order to estimate the number of design points that will be selected in the clustering step -#' -## ----woody-proportion--------------------------------------------------------- -field_attributes <- read_csv("data/ca_field_attributes.csv") -ca <- ca_fields |> - dplyr::select(-lat, -lon) |> - dplyr::left_join(field_attributes, by = "id") - -set.seed(5050) -pft_area <- ca |> - dplyr::sample_n(2000) |> - dplyr::select(id, pft, area_ha) |> - dtplyr::lazy_dt() |> - dplyr::mutate(woody_indicator = ifelse(pft == "woody perennial crop", 1L, 0L)) |> - dplyr::group_by(woody_indicator) |> - dplyr::summarize(pft_area = sum(area_ha)) - -# now calculate sum of pft_area and the proportion of woody perennial crops -pft_area <- pft_area |> - dplyr::mutate(total_area = sum(pft_area)) |> - dplyr::mutate(area_pct 
= round(100 * pft_area / total_area)) |> - select(-total_area, -pft_area) |> - dplyr::rename("Woody Crops" = woody_indicator, "Area %" = area_pct) - -PEcAn.logger::logger.info( - "Total area and proportion of fields that are woody perennial", - "crops in California croplands:", - pft_area |> kableExtra::kable() -) - -PEcAn.logger::logger.info( - "final output from clustering and design point selection:", - cluster_summary |> knitr::kable() -) # - + geom_sf(data = ca_fields, fill = "black", color = "lightgrey", alpha = 0.25) + + geom_sf(data = design_points_clust, aes(shape = cluster)) +ggsave(design_pt_plot, filename = "downscale/figures/design_points.png", dpi = 300, bg = "white") \ No newline at end of file diff --git a/downscale/02_extract_sipnet_output.R b/downscale/02_extract_sipnet_output.R index 4a3100e..66cba1d 100644 --- a/downscale/02_extract_sipnet_output.R +++ b/downscale/02_extract_sipnet_output.R @@ -9,21 +9,11 @@ no_cores <- parallel::detectCores(logical = FALSE) plan(multicore, workers = no_cores - 1) # Define base directory for ensemble outputs -basedir <- "/projectnb/dietzelab/ccmmf/ccmmf_phase_1b_98sites_20reps_20250312" -outdir <- file.path(basedir, "out") - -# Get Run metadata from log filename -# ??? is there a more reliable way to do this? -logfile <- dir(basedir, pattern = "pecan_workflow_runlog") -pattern <- "^pecan_workflow_runlog_([0-9]{14})_([0-9]+-[0-9]+)\\.log$" -matches <- stringr::str_match(logfile, pattern) -forecast_time_string <- matches[2] -forecast_iteration_id <- matches[3] -forecast_time <- lubridate::as_datetime(as.POSIXct(forecast_time_string, format = "%Y%m%d%H%M%S")) -obs_flag <- 0 +basedir <- "/projectnb/dietzelab/ccmmf/ccmmf_phase_1b_20250319064759_14859" # Read settings file and extract run information -settings <- PEcAn.settings::read.settings(file.path(basedir, "pecan.CONFIGS.xml")) +settings <- PEcAn.settings::read.settings(file.path(basedir, "settings.xml")) +outdir <- file.path(basedir, settings$modeloutdir) ensemble_size <- settings$ensemble$size |> as.numeric() start_date <- settings$run$settings.1$start.date # TODO make this unique for each site @@ -32,9 +22,14 @@ end_date <- settings$run$settings.1$end.date end_year <- lubridate::year(end_date) # Site Information -design_points <- readr::read_csv(here::here("data/design_points.csv"), show_col_types = FALSE) |> - distinct() +# design points for 1b +# data/design_points.csv +design_pt_csv <- "https://raw.githubusercontent.com/ccmmf/workflows/46a61d58a7b0e43ba4f851b7ba0d427d112be362/data/design_points.csv" +design_points <- readr::read_csv(design_pt_csv, show_col_types = FALSE) |> + rename(site_id = id) |> # fixed in more recent version of 01 script + dplyr::distinct() +## end scratch # Variables to extract variables <- c("AGB", "TotSoilCarb") @@ -72,7 +67,7 @@ variables <- c("AGB", "TotSoilCarb") # Preallocate 3-D array for 98 sites, 2 variables, and 20 ensemble members site_ids <- design_points |> - pull(id) |> + pull(site_id) |> unique() ens_ids <- 1:ensemble_size @@ -83,9 +78,9 @@ ens_ids <- 1:ensemble_size # start_year <- end_year - 1 ens_dirs <- expand.grid(ens = PEcAn.utils::left.pad.zeros(ens_ids), - site = site_ids, + site_id = site_ids, stringsAsFactors = FALSE) |> - mutate(dir = file.path(outdir, paste("ENS", ens, site, sep = "-"))) + mutate(dir = file.path(outdir, paste("ENS", ens, site_id, sep = "-"))) # Check that all ens dirs exist existing_dirs <- file.exists(ens_dirs$dir) if (!all(existing_dirs)) { @@ -96,9 +91,9 @@ if (!all(existing_dirs)) { # extract output via 
read.output ens_results <- furrr::future_pmap_dfr( ens_dirs, - function(ens, site, dir) { + function(ens, site_id, dir) { out_df <- PEcAn.utils::read.output( - runid = paste(ens, site, sep = "-"), + runid = paste(ens, site_id, sep = "-"), outdir = dir, start.year = start_year, end.year = end_year, @@ -106,19 +101,25 @@ ens_results <- furrr::future_pmap_dfr( dataframe = TRUE, verbose = FALSE ) |> - dplyr::mutate(site = site, ensemble = as.numeric(ens)) |> + dplyr::mutate(site_id = site_id, ensemble = as.numeric(ens)) |> dplyr::rename(time = posix) }, .options = furrr::furrr_options(seed = TRUE) ) |> - group_by(ensemble, site, year) |> + group_by(ensemble, site_id, year) |> filter(year <= end_year) |> filter(time == max(time)) |> # only take last value ungroup() |> - arrange(ensemble, site, year) |> + arrange(ensemble, site_id, year) |> tidyr::pivot_longer(cols = all_of(variables), names_to = "variable", values_to = "prediction") -# --- Create 4-D array --- +## Create Ensemble Output For Downscaling +## Below, three different output formats are created: +## 1. 4-D array (time, site, ensemble, variable) +## 2. long format data frame (time, site, ensemble, variable) +## 3. NetCDF file (time, site, ensemble, variable) + +# --- 1. Create 4-D array --- # Add a time dimension (even if of length 1) so that dimensions are: [time, site, ensemble, variable] unique_times <- sort(unique(ens_results$time)) if(length(unique_times) != length(start_year:end_year)){ @@ -138,7 +139,7 @@ for (var in variables) { dim = c(length(unique_times), length(site_ids), length(ens_ids)), dimnames = list( datetime = as.character(unique_times), - site = site_ids, + site_id = site_ids, ensemble = as.character(ens_ids) ) ) @@ -147,7 +148,7 @@ for (var in variables) { subset_idx <- which(ens_results$variable == var) if (length(subset_idx) > 0) { i_time <- match(ens_results$time[subset_idx], unique_times) - i_site <- match(ens_results$site[subset_idx], site_ids) + i_site <- match(ens_results$site_id[subset_idx], site_ids) i_ens <- match(ens_results$ensemble[subset_idx], ens_ids) arr[cbind(i_time, i_site, i_ens)] <- ens_results$prediction[subset_idx] } @@ -156,15 +157,15 @@ for (var in variables) { } saveRDS(ens_arrays, file = file.path(outdir, "efi_ens_arrays.rds")) - + +# --- 2. Create EFI Standard v1.0 long format data frame --- efi_long <- ens_results |> rename(datetime = time) |> - select(datetime, site, ensemble, variable, prediction) + select(datetime, site_id, ensemble, variable, prediction) readr::write_csv(efi_long, file.path(outdir, "efi_ens_long.csv")) - -####--- Generate EFI Standard v1.0 NetCDF files +####--- 3. Create EFI Standard v1.0 NetCDF files library(ncdf4) # Assume these objects already exist (created above): # unique_times: vector of unique datetime strings @@ -177,10 +178,10 @@ library(ncdf4) time_char <- unique_times lat <- design_points |> - filter(id %in% site_ids) |> # only required when testing w/ subset + filter(site_id %in% site_ids) |> # only required when testing w/ subset dplyr::pull(lat) lon <- design_points |> - filter(id %in% site_ids) |> + filter(site_id %in% site_ids) |> dplyr::pull(lon) # Convert time to CF-compliant values using PEcAn.utils::datetime2cf @@ -265,7 +266,20 @@ ncvar_put(nc_out, lon_var, lon) ncvar_put(nc_out, agb_ncvar, ens_arrays[["AGB"]]) ncvar_put(nc_out, soc_ncvar, ens_arrays[["TotSoilCarb"]]) -# Add global attributes per EFI standards. +## Add global attributes per EFI standards. + +# Get Run metadata from log filename +# ??? is there a more reliable way to do this? 
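# One possible alternative, sketched here under the assumption that the
# workflow id element is populated in settings.xml, is to take these
# identifiers from the settings object (and a file timestamp) instead of
# parsing the log filename; the log-based approach below is the current method.
# forecast_iteration_id <- settings$workflow$id
# forecast_time <- lubridate::as_datetime(file.mtime(file.path(basedir, "settings.xml")))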
+logfile <- dir(basedir, pattern = "log$") +pattern <- "([0-9]{14})_([0-9]+)\\.log$" +forecast_time_string <- stringr::str_match(logfile, "[0-9]{12}") +if (is.na(forecast_time_string)) { + PEcAn.logger::logger.error("Unable to extract forecast time from log filename.") +} +forecast_iteration_id <- forecast_time_string # or ? +forecast_time <- lubridate::as_datetime(forecast_time_string, format = "%Y%m%d%H%M%S") +obs_flag <- 0 + ncatt_put(nc_out, 0, "model_name", "SIPNET") ncatt_put(nc_out, 0, "model_version", "v1.3") ncatt_put(nc_out, 0, "iteration_id", forecast_iteration_id) diff --git a/downscale/03_downscale_and_agregate.R b/downscale/03_downscale_and_agregate.R index 3ff321a..20efb94 100644 --- a/downscale/03_downscale_and_agregate.R +++ b/downscale/03_downscale_and_agregate.R @@ -16,14 +16,23 @@ #' - Aggregate County-level biomass and SOC inventories #' ## ----setup-------------------------------------------------------------------- -# remotes::install_github("dlebauer/pecan@ensemble_downscaling", subdir = "modules/assim.sequential", ref = "da96331") library(tidyverse) library(sf) library(terra) -devtools::load_all(here::here("../pecan/modules/assim.sequential/")) -# library(PEcAnAssimSequential) -basedir <- "/projectnb/dietzelab/ccmmf/ccmmf_phase_1b_98sites_20reps_20250312" -outdir <- file.path(basedir, "out") +library(furrr) + +no_cores <- parallel::detectCores(logical = FALSE) +plan(multicore, workers = no_cores - 1) + +# while developing PEcAn: +# devtools::load_all(here::here("../pecan/modules/assim.sequential/")) +# remotes::install_git("dlebauer/pecan@ensemble_downscaling", subdir = "modules/assim.sequential") +remotes::install_git("../pecan@ensemble_downscaling", subdir = "modules/assim.sequential", upgrade = FALSE) +library(PEcAnAssimSequential) +datadir <- "/projectnb/dietzelab/ccmmf/data" +basedir <- "/projectnb/dietzelab/ccmmf/ccmmf_phase_1b_20250319064759_14859" +settings <- PEcAn.settings::read.settings(file.path(basedir, "settings.xml")) +outdir <- file.path(basedir, settings$modeloutdir) options(readr.show_col_types = FALSE) library(furrr) @@ -31,21 +40,24 @@ no_cores <- parallel::detectCores(logical = FALSE) plan(multicore, workers = no_cores - 1) #' ## Get Site Level Outputs -ensemble_file <- file.path(outdir, "efi_ens_long.csv.gz") +ensemble_file <- file.path(outdir, "efi_ens_long.csv") ensemble_data <- readr::read_csv(ensemble_file) #' ### Random Forest using PEcAn downscale workflow ## ----------------------------------------------------------------------------- -design_points <- read_csv(here::here("data/design_points.csv")) |> +design_pt_csv <- "https://raw.githubusercontent.com/ccmmf/workflows/46a61d58a7b0e43ba4f851b7ba0d427d112be362/data/design_points.csv" +design_points <- read_csv(design_pt_csv) |> #read_csv(here::here("data/design_points.csv")) |> dplyr::distinct() -covariates <- readRDS(here::here("data/data_for_clust_with_ids.rds")) |> - rename(site = id) |> +covariates <- read_csv(here::here("data/site_covariates.csv")) |> select( - site, where(is.numeric), - -ends_with("id") # drop crop_id, climregion_id columns + site_id, where(is.numeric), + -climregion_id ) +## TODO +# separate fitting and predicting functions +# calculate variable importance (randomForest::importance) d <- function(date, carbon_pool) { filtered_ens_data <- subset_ensemble( ensemble_data = ensemble_data, @@ -55,21 +67,18 @@ d <- function(date, carbon_pool) { ) # Downscale the data - downscale_output <- downscale( + downscale_output <- ensemble_downscale( ensemble_data = 
filtered_ens_data, site_coords = design_points, covariates = covariates, - model_type = "rf", seed = 123 ) return(downscale_output) } cpools <- c("TotSoilCarb", "AGB") -library(furrr) -plan(multisession) -downscale_output <- purrr::map( # not using furrr b/c it is used inside downscale +downscale_output_list <- purrr::map( # not using furrr b/c it is used inside downscale cpools, ~ d(date = "2018-12-31", carbon_pool = .x) ) |> @@ -77,11 +86,9 @@ downscale_output <- purrr::map( # not using furrr b/c it is used inside downscal ## Save to make it easier to restart -save(downscale_output, file = here::here("cache/downscale_output.rda")) - +# saveRDS(downscale_output, file = here::here("cache/downscale_output.rds")) -metrics <- downscale_metrics(downscale_output) -# could compute stats here e.g. mean, CI for ea. metric +metrics <- lapply(downscale_output_list, downscale_metrics) print(metrics) #' @@ -89,27 +96,24 @@ print(metrics) #' ## Aggregate to County Level #' ## ----------------------------------------------------------------------------- -library(sf) -library(dplyr) - # ca_fields <- readr::read_csv(here::here("data/ca_field_attributes.csv")) |> # dplyr::select(id, lat, lon) |> # rename(site = id) -ca_fields_full <- sf::read_sf(here::here("data/ca_fields.gpkg")) +ca_fields_full <- sf::read_sf(file.path(datadir, "ca_fields.gpkg")) ca_fields <- ca_fields_full |> - select(site = id, county, area_ha) +dplyr::select(site_id, county, area_ha) # Convert list to table with predictions and site identifier get_downscale_preds <- function(downscale_output) { purrr::map( downscale_output$predictions, - ~ tibble(site = covariates$site, prediction = .x) + ~ tibble(site_id = covariates$site_id, prediction = .x) ) |> bind_rows(.id = "ensemble") |> - left_join(ca_fields, by = "site") + left_join(ca_fields, by = "site_id") } downscale_preds <- purrr::map(downscale_output, get_downscale_preds) |> @@ -123,7 +127,12 @@ ens_county_preds <- downscale_preds |> # Now aggregate to get county level totals for each pool x ensemble group_by(carbon_pool, county, ensemble) |> summarize( - total_c = sum(total_c) + total_c = sum(total_c), + total_ha = sum(area_ha) + ) |> + ungroup() |> + mutate( + c_density = PEcAn.utils::ud_convert(total_c / total_ha, "Tg/ha", "kg/m2") ) |> arrange(carbon_pool, county, ensemble) @@ -132,8 +141,10 @@ county_summaries <- ens_county_preds |> summarize( n = n(), mean_total_c = mean(total_c), - median_total_c = median(total_c), - sd_total_c = sd(total_c) + #median_total_c = median(total_c), + sd_total_c = sd(total_c), + mean_c_density = mean(c_density), + sd_c_density = sd(c_density) ) # Lets plot the results! 
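# Because the ensemble is carried through aggregation, percentile intervals
# can be reported alongside the mean/sd summaries above. Sketch, assuming
# ens_county_preds as built above (one row per carbon_pool x county x member):
county_intervals <- ens_county_preds |>
  dplyr::group_by(carbon_pool, county) |>
  dplyr::summarize(
    total_c_lcl = quantile(total_c, 0.025, na.rm = TRUE),
    total_c_ucl = quantile(total_c, 0.975, na.rm = TRUE),
    .groups = "drop"
  )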
@@ -146,7 +157,7 @@ co_preds_to_plot <- county_summaries |> right_join(county_boundaries, by = c("county" = "name")) |> arrange(county, carbon_pool) |> pivot_longer( - cols = c(mean_total_c, median_total_c, sd_total_c), + cols = c(mean_total_c, sd_total_c, mean_c_density, sd_c_density), names_to = "stat", values_to = "value" ) @@ -169,7 +180,7 @@ p <- purrr::map(cpools, function(pool) { ggsave( plot = .p, - filename = here::here(paste0("county_total_", pool, ".png")), + filename = here::here("downscale/figures",paste0("county_total_", pool, ".png")), width = 10, height = 5, bg = "white" ) From d468ea4d55e62f59381ca858ec0fe645b03f9968 Mon Sep 17 00:00:00 2001 From: David LeBauer Date: Fri, 21 Mar 2025 18:14:13 -0400 Subject: [PATCH 24/49] added first draft of documentation --- downscale/00-prepare.R | 31 +-- downscale/03_downscale_and_agregate.R | 117 ++++++---- .../04_downscaling_documentation_results.qmd | 200 ++++++++++++++++++ 3 files changed, 298 insertions(+), 50 deletions(-) create mode 100644 downscale/04_downscaling_documentation_results.qmd diff --git a/downscale/00-prepare.R b/downscale/00-prepare.R index cbeaa7c..f5d1537 100644 --- a/downscale/00-prepare.R +++ b/downscale/00-prepare.R @@ -77,7 +77,8 @@ if(!file.exists(ca_fields_gpkg) & !file.exists(ca_attributes_csv)) { ca_fields <- sf::st_read(ca_fields_gpkg) |> sf::st_transform(crs = ca_albers_crs) - +ca_attributes <- readr::read_csv(ca_attributes_csv) + #' ##### Subset Woody Perennial Crop Fields #' #' Phase 1 focuses on Woody Perennial Crop fields. @@ -87,14 +88,13 @@ ca_fields <- sf::st_read(ca_fields_gpkg) |> #' ## ----------------------------------------------------------------------------- ca_fields |> - filter(pft = = "woody perennial crop") |> + left_join(ca_attributes |> select(site_id, pft), by = c("site_id")) |> + filter(pft == "woody perennial crop") |> sf::st_transform(crs = ca_albers_crs) |> - dplyr::select(site_id, crop, pft, geom) |> + dplyr::select(site_id, geom) |> sf::st_write(file.path(data_dir, 'ca_woody.gpkg'), delete_dsn = TRUE) -ca_attributes <- readr::read_csv(ca_attributes_csv) - #' #' ### Convert Polygons to Points. #' @@ -289,7 +289,8 @@ if (exists("IN_TARGETS") && IN_TARGETS) { #' ## ----anchor-sites------------------------------------------------------------- # Anchor sites from UC Davis, UC Riverside, and Ameriflux. -anchor_sites_pts <- readr::read_csv("data_raw/anchor_sites.csv") |> +anchor_sites <- readr::read_csv("data_raw/anchor_sites.csv") +anchor_sites_pts <- anchor_sites |> sf::st_as_sf(coords = c("lon", "lat"), crs = 4326) |> sf::st_transform(crs = ca_albers_crs) @@ -353,12 +354,8 @@ if (any(is.na(anchor_sites_with_ids |> select(site_id, lat, lon)))) { ) } -# Save processed anchor sites -anchor_sites_with_ids |> - dplyr::select(site_id, external_site_id, site_name, lat, lon, crops, pft) |> - readr::write_csv(file.path(data_dir, "anchor_sites_ids.csv")) - -if(anchor_sites_with_ids |> +# Check that all anchor sites have covariates +if (anchor_sites_with_ids |> left_join(site_covariates, by = "site_id") |> dplyr::select( site_id, lat, lon, @@ -367,8 +364,16 @@ if(anchor_sites_with_ids |> filter(if_any( everything(), ~ is.na(.x) - )) |> nrow() > 0) { + )) |> nrow() > 0) { PEcAn.logger::logger.warn( "Some anchor sites have missing environmental covariates!" 
) } + + +# Save processed anchor sites +anchor_sites_with_ids |> + sf::st_drop_geometry() |> + dplyr::select(site_id, lat, lon, external_site_id, site_name, crops, pft) |> + readr::write_csv(file.path(data_dir, "anchor_sites_ids.csv")) + diff --git a/downscale/03_downscale_and_agregate.R b/downscale/03_downscale_and_agregate.R index 20efb94..0e653ce 100644 --- a/downscale/03_downscale_and_agregate.R +++ b/downscale/03_downscale_and_agregate.R @@ -35,9 +35,6 @@ settings <- PEcAn.settings::read.settings(file.path(basedir, "settings.xml")) outdir <- file.path(basedir, settings$modeloutdir) options(readr.show_col_types = FALSE) -library(furrr) -no_cores <- parallel::detectCores(logical = FALSE) -plan(multicore, workers = no_cores - 1) #' ## Get Site Level Outputs ensemble_file <- file.path(outdir, "efi_ens_long.csv") @@ -49,16 +46,14 @@ design_pt_csv <- "https://raw.githubusercontent.com/ccmmf/workflows/46a61d58a7b0 design_points <- read_csv(design_pt_csv) |> #read_csv(here::here("data/design_points.csv")) |> dplyr::distinct() -covariates <- read_csv(here::here("data/site_covariates.csv")) |> +covariates_csv <- file.path(datadir, "site_covariates.csv") +covariates <- read_csv(covariates_csv) |> select( site_id, where(is.numeric), -climregion_id ) -## TODO -# separate fitting and predicting functions -# calculate variable importance (randomForest::importance) -d <- function(date, carbon_pool) { +downscale_carbon_pool <- function(date, carbon_pool) { filtered_ens_data <- subset_ensemble( ensemble_data = ensemble_data, site_coords = design_points, @@ -80,10 +75,11 @@ cpools <- c("TotSoilCarb", "AGB") downscale_output_list <- purrr::map( # not using furrr b/c it is used inside downscale cpools, - ~ d(date = "2018-12-31", carbon_pool = .x) + ~ downscale_carbon_pool(date = "2018-12-31", carbon_pool = .x) ) |> purrr::set_names(cpools) +## Check variable importance ## Save to make it easier to restart # saveRDS(downscale_output, file = here::here("cache/downscale_output.rds")) @@ -91,6 +87,21 @@ downscale_output_list <- purrr::map( # not using furrr b/c it is used inside dow metrics <- lapply(downscale_output_list, downscale_metrics) print(metrics) +median_metrics <- purrr::map(metrics, function(m) { + m |> + select(-ensemble) |> + summarise(#do equivalent of colmeans but for medians) + across( + everything(), + list(median = ~ median(.x)), + .names = "{col}" + ) + ) +}) + +bind_rows(median_metrics, .id = "carbon_pool") |> + knitr::kable() + #' #' #' ## Aggregate to County Level @@ -104,19 +115,19 @@ print(metrics) ca_fields_full <- sf::read_sf(file.path(datadir, "ca_fields.gpkg")) ca_fields <- ca_fields_full |> -dplyr::select(site_id, county, area_ha) + dplyr::select(site_id, county, area_ha) # Convert list to table with predictions and site identifier -get_downscale_preds <- function(downscale_output) { +get_downscale_preds <- function(downscale_output_list) { purrr::map( - downscale_output$predictions, + downscale_output_list$predictions, ~ tibble(site_id = covariates$site_id, prediction = .x) ) |> bind_rows(.id = "ensemble") |> left_join(ca_fields, by = "site_id") } -downscale_preds <- purrr::map(downscale_output, get_downscale_preds) |> +downscale_preds <- purrr::map(downscale_output_list, get_downscale_preds) |> dplyr::bind_rows(.id = "carbon_pool") |> # Convert kg / ha to tonne (Mg) / field level totals # first convert scale @@ -146,6 +157,11 @@ county_summaries <- ens_county_preds |> mean_c_density = mean(c_density), sd_c_density = sd(c_density) ) + +readr::write_csv( + county_summaries, + 
file.path(outdir, "county_summaries.csv") +) # Lets plot the results! @@ -187,27 +203,54 @@ p <- purrr::map(cpools, function(pool) { return(.p) }) - -# Load CA county boundaries -# # These are provided by Cal-Adapt as 'Areas of Interest' -# - -# # check if attributes has county name -# # Append county name to predicted table -# grid_with_counties <- st_join(ca_grid, county_boundaries, join = st_intersects) - -# # Calculate county-level mean, median, and standard deviation. -# county_aggregates <- grid_with_counties |> -# st_drop_geometry() |> # drop geometry for faster summarization -# group_by(county_name) |> # replace with your actual county identifier -# summarize( -# mean_biomass = mean(predicted_biomass, na.rm = TRUE), -# median_biomass = median(predicted_biomass, na.rm = TRUE), -# sd_biomass = sd(predicted_biomass, na.rm = TRUE) -# ) - -# print(county_aggregates) - -# For state-level, do the same but don't group_by county - -#' ```` +## Variable Importance + +# importance_summary <- map_dfr(cpools, function(cp) { +# # Extract the importance for each ensemble model in the carbon pool +# importances <- map(1:20, function(i) { +# model <- downscale_output_list[[cp]][["model"]][[i]] +# randomForest::importance(model)[, "%IncMSE"] +# }) + +# # Turn the list of importance vectors into a data frame +# importance_df <- map_dfr(importances, ~ tibble(importance = .x), .id = "ensemble") |> +# group_by(ensemble) |> +# mutate(predictor = names(importances[[1]])) |> +# ungroup() + +# # Now summarize median and IQR for each predictor across ensembles +# summary_df <- importance_df |> +# group_by(predictor) |> +# summarize( +# median_importance = median(importance, na.rm = TRUE), +# sd_importance = sd(importance, na.rm = TRUE) +# ) |> +# mutate(carbon_pool = cp) + +# summary_df +# }) + +# library(ggplot2) +# library(dplyr) + +# # Create the popsicle (lollipop) plot +# p <- ggplot(importance_summary, aes(x = reorder(predictor, median_importance), y = median_importance)) + +# geom_errorbar(aes(ymin = median_importance - sd_importance, ymax = median_importance + sd_importance), +# width = 0.2, color = "gray50" +# ) + +# geom_point(size = 4, color = "steelblue") + +# coord_flip() + +# facet_wrap(~carbon_pool, scales = "free_y") + +# labs( +# title = "Popsicle Plot of Variable Importance", +# x = "Predictor", +# y = "Median %IncMSE ( SD)" +# ) + +# theme_minimal() + +# ggsave(p, filename = here::here("downscale/figures", "importance_summary.png"), +# width = 10, height = 5, +# bg = "white" +# ) + +# print(p) diff --git a/downscale/04_downscaling_documentation_results.qmd b/downscale/04_downscaling_documentation_results.qmd new file mode 100644 index 0000000..0dbbe39 --- /dev/null +++ b/downscale/04_downscaling_documentation_results.qmd @@ -0,0 +1,200 @@ +--- +title: "Downscaling Workflow Documentation" +author: "David LeBauer" +date: "`r Sys.Date()`" +format: html +execute: + echo: false +--- + +# Downscaling Workflow Overview + +The downscaling workflow is used to predict carbon pools (Soil Organic Carbon and Aboveground Biomass) for cropland fields in California and then aggregate these predictions to the county scale. + +It uses an ensemble based approach to uncertainty propagation and analysis. + +This multi-step approach facilitates uncertainty quantification by maintaining ensemble structure to propagate errors through the prediction and aggregation processes. 
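A toy illustration of this principle (an editor's sketch with made-up numbers, not part of the workflow code): field-level predictions are first aggregated within each ensemble member, and summary statistics are computed only across the resulting ensemble totals, so uncertainty is propagated through the aggregation rather than averaged away.

```r
# Minimal sketch: aggregate within each ensemble member first, then summarize.
library(dplyr)
toy <- expand.grid(ensemble = 1:4, site_id = c("a", "b", "c"))
toy$total_c <- runif(nrow(toy), 10, 20)

toy |>
  group_by(ensemble) |>
  summarise(ens_total = sum(total_c), .groups = "drop") |> # one total per member
  summarise(mean_total = mean(ens_total), sd_total = sd(ens_total))
```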
+ +**Definitions** + - **Design Points**: Fields chosen via stratified random sampling - using k-means clustering on environmental data layers - across all of California crop fields. + - **Crop Fields**: These are all of the croplands in the LandIQ dataset. + - **Anchor Sites:** These are sites that will be used as ground truth for calibration and validation. These include UC research stations and Ameriflux sites with high quality data. + + +## Steps + +1. **Data Preparation**: Prepares data for clustering and downscaling process. +2. **Design Point Selection**: Uses k-means clustering to select a representative set of fields that represent environmental space, and add these to the anchor sites. +3. **SIPNET Model Runs**: Prepares inputs and runs SIPNET simulations for the design points. +4. **Extract SIPNET Output**: Extracts ensemble SIPNET outputs for the design points and converts them into NetCDF, array, and long formats. +5. **Downscale and Aggregate SIPNET Output**: Builds a Random Forest model for each ensemble member to predict SOC and AGB, downscales the predictions to the full set of fields, and aggregates predictions to county-level estimates to produce maps and summary statistics. + +### Data Preparation + +```sh +Rscript downscale/00_prepare.R +``` + +This script prepares data for clustering and downscaling process. + +It reads in the LandIQ crop map, anchor sites, and environmental covariates, and creates a table of environmental covariates for each field in the LandIQ crop map. It also links a table of anchor sites to their corresponding LandIQ fields so that these can be used in downstreamm analyses. + +- Converts LandIQ-derived shapefiles to a geopackage with geospatial information and a CSV with other attributes from the LandIQ dataset. +- Extracts environmental covariates (clay, organic carbon, topographic wetness, temperature, precipitation, solar radiation, vapor pressure) for each field in the LandIQ dataset. +- Groups fields into Cal-Adapt climate regions. +- Assigns anchor sites to fields. + + +**Inputs:** +- **LandIQ Crop Map** + - `data_raw/i15_Crop_Mapping_2016_SHP/i15_Crop_Mapping_2016.shp`. This is a manually curated and harmonized version of the 2016 LandIQ crop map for California. +- **Soilgrids** + - `clay_0-5cm_mean.tif` and `ocd_0-5cm_mean.tif` are rasters that have been downloaded and prepared for efficient data extraction. +- **Topographic Wetness Index (TWI)** + - `TWI/TWI_resample.tiff` +- **ERA5 Climatological Means** + - Available for years 2012-2024 in `GridMET/` folder in files named `ERA5_met_.tiff`. Currently only using means of each variable across all years. Variables include: + - temperature + - precipitation + - solar radiation + - vapor pressure deficit +- **Anchor Sites:** + - `data_raw/anchor_sites.csv`: anchor sites (sites with validation data) + - external_site_id: Ameriflux site ID or other unique ID, not to be confused with `site_id`. + - lat, lon, crops, pft. + + +Future design point selection factors will include management practices (crop, cropping system (cycle, mixtures), irrigation, tillage, C&N ammendments). + +**Outputs:** + -`ca_fields.gpkg` contains spatial information from LandIQ including: `site_id`, `lat`, `lon`, `area` (ha), `county`, and `geom`. Lat and Lon are centroids of the geom field. + - `ca_field_attributes.csv` contains site_id, lat, lon, year, crop, pft, source, and notes. The crop and pft associated with each field may differ from those in anchor sites, because they come from different sources. 
+ - `site_covariates.csv` is a table of environmental covariates for each field in the LandIQ crop map. + - site_id, temp, precip, srad, vapr, crop, pft, clay, ocd, twi. + - `anchor_sites_ids.csv`. + - site_id, lat, lon, external_site_id, site_name, crops, pft. + - (note that lat, lon are the centroid of the field, not the original values in `data_raw/anchor_sites.csv`). + +### Design Point Selection + +```sh +Rscript downscale/01_cluster_and_select_design_points.R +``` + +Use k-means clustering to select a representative set of 75 fields that represent environmental space, and add these to the 23 distinct[^1^] anchor sites. + +[^1^]: There are 25 anchor sites but two have duplicate lat / lon coordinates. This will be addressed in future iterations analysis. + +These are the sites where the SIPNET crop and biogeochemistry model will be run. Currently we are running SIPNET for 98 total sites, which includes 75 design points and 23 anchor sites. + +**Steps** +- Subsample LandIQ fields and include anchor sites for clustering. +- Select cluster number based on the Elbow Method. + - Plot within-cluster sum of squares (WCSS) against the number of clusters and identify the point where adding more clusters yields diminishing returns. + - Also evaluates silhouette scores; and future iterations may use multiple methods to select cluster number. +- Cluster fields using k-means clustering to cluster the fields based on environmental covariates. +- Select design points from clusters for SIPNET simulation. + +**Inputs:** +- `data/site_covariates.csv` +- `data/anchor_sites_ids.csv` + +**Output:** +- `data/design_points.csv`. + +### SIPNET Model Runs + +These are produced by the Modeling Workflow +[TODO Describe; how do we integrate these] + +**Steps:** +- Prepare inputs and run SIPNET simulations for the design points. + +**Inputs** +- `design_points.csv` +- Initial Conditions (described in modeling workflow) + +**Outputs:** +- `out/ENS--/YYYY.nc` + - These are NetCDF files that contain the SIPNET outputs for each site in a standardized format. + - Currently, is 1:20, and 98 values of identify the design points. For final runs, these numbers may be closer to 100 ensemble members and 10k design points. + +### Extract SIPNET Output + +```sh +Rscript downscale/02_extract_sipnet_output.R +``` + +This step extracts ensemble SIPNET outputs for the design points and converts them into NetCDF, array, and long formats. + + +Extracts output variables (AGB, TotSoilCarb) from SIPNET simulations: +- Aggregates site-level ensemble outputs into long and 4D array formats. +- Saves CSV and NetCDF files for downstream use in formats that follow EFI standards for forecasts (Dietze et. al. 2023). + +**Inputs:** +- `out/ENS--/YYYY.nc` + +**Outputs:** +- `out/efi_ens_long.csv`: long format. +- `out/efi_forecast.nc`: arrays in NetCDF format + + +### Downscale and Aggregate SIPNET Output + +```sh +Rscript downscale/03_downscale_and_aggregate.R +``` + +**Steps:** +- Build a Random Forest model for each ensemble member to predict SOC and AGB. It uses design point ensemble outputs, with one model per ensemble. +- Downscale the predictions to the full set of fields. +- Aggregate predictions to county-level estimates to produce maps and summary statistics. + +Uses Random Forest to predict SOC and biomass for all woody crop fields: +- Trains models on design point ensembles. +- Predicts for all fields and aggregates to county-level. +- Outputs maps and statistics of carbon density and totals. 
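The per-ensemble modeling idea can be sketched as follows (an editor's illustration only; the workflow itself calls the PEcAn downscaling functions, and the response variable and object names here are placeholders):

```r
# One random forest per ensemble member; each model predicts for every field,
# so the ensemble spread carries through to the downscaled predictions.
library(randomForest)
library(purrr)

fit_and_predict <- function(train_df, newdata) {
  rf <- randomForest(carbon ~ temp + precip + clay + ocd + twi, data = train_df)
  predict(rf, newdata = newdata)
}

# design_runs: one row per design point x ensemble member (placeholder object)
# preds <- design_runs |>
#   split(design_runs$ensemble) |>
#   map(fit_and_predict, newdata = site_covariates)
```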
+ +**Inputs:** +- `out/efi_ens_long.csv`: long format SIPNET outputs. + +**Outputs:** +- County-level statistics for each variable as tables and maps. +- `out/county_total_AGB.png`: county-level AGB predictions. +- `out/county_total_TotSoilCarb.png`: county-level SOC predictions. +- `out/county_summaries.csv`: summary statistics for each county. + + + +## Results and Analysis + +### County Carbon Maps + +![County Carbon Map for TotSoilCarb](downscale/figures/county_total_TotSoilCarb.png) +![County Carbon Map for AGB](downscale/figures/county_total_AGB.png) + +The maps above display the spatial variation and uncertainty (mean and standard deviation) of the predicted carbon pools at the county level. + +## Additional EDA + +(Include further exploratory data analysis as desired.) + +## Discussion and Future Work + +This workflow lays the foundation for ensemble-based downscaling of ecosystem carbon stocks. Future enhancements may include: +- Incorporating alternative modeling approaches (e.g., CNNs) for improved prediction accuracy. +- Expanding the analysis to include additional carbon pools and environmental variables. +- Refining the aggregation methods to better capture spatial covariance and error propagation. +- Integrating more real-time data sources for dynamic model updates. + +By maintaining an ensemble framework and detailed uncertainty quantification, this workflow aims to support robust carbon accounting in complex landscapes. + +# Conclusion + +(Conclude with final remarks, potential applications, and next steps.) + + +# References + +Dietze, Michael C., R. Quinn Thomas, Jody Peters, Carl Boettiger, Gerbrand Koren, Alexey N. Shiklomanov, and Jaime Ashander. 2023. “A Community Convention for Ecological Forecasting: Output Files and Metadata Version 1.0.” Ecosphere 14 (11): e4686. https://doi.org/10.1002/ecs2.4686. From 19eec456d02f0c318e2aa9cbe1bf83be91d86789 Mon Sep 17 00:00:00 2001 From: David LeBauer Date: Fri, 21 Mar 2025 23:08:09 -0400 Subject: [PATCH 25/49] updated draft of downscaling workflow --- downscale/00-prepare.R | 7 +- .../01_cluster_and_select_design_points.R | 114 ++++++----- downscale/03_downscale_and_agregate.R | 134 +++++++------ .../04_downscaling_documentation_results.qmd | 181 +++++++++++++----- 4 files changed, 279 insertions(+), 157 deletions(-) diff --git a/downscale/00-prepare.R b/downscale/00-prepare.R index f5d1537..74357f6 100644 --- a/downscale/00-prepare.R +++ b/downscale/00-prepare.R @@ -64,8 +64,11 @@ ca_albers_crs <- 3310 ## Required until PR 3423 is merged https://github.com/PecanProject/pecan/pull/3423 # check if PR is merged # devtools::install_github("dlebauer/pecan", -# ref = "shp2gpkg", -# subdir = "modules/data.land") +devtools::install_git("../pecan", + ref = "shp2gpkg", + subdir = "modules/data.land", + upgrade = FALSE +) devtools::load_all("../pecan/modules/data.land/") input_file = file.path(raw_data_dir, 'i15_Crop_Mapping_2016_SHP/i15_Crop_Mapping_2016.shp') diff --git a/downscale/01_cluster_and_select_design_points.R b/downscale/01_cluster_and_select_design_points.R index 5b7ffdd..a17fee8 100644 --- a/downscale/01_cluster_and_select_design_points.R +++ b/downscale/01_cluster_and_select_design_points.R @@ -58,7 +58,7 @@ data_dir <- "/projectnb/dietzelab/ccmmf/data" site_covariates_csv <- file.path(data_dir, "site_covariates.csv") site_covariates <- readr::read_csv(site_covariates_csv) - +file.exists(site_covariates_csv) #' ## Anchor Site Selection #' #' Load Anchor Sites from UC Davis, UC Riverside, and Ameriflux. 
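# --- Editor's note (illustration only, not part of this patch) ----------------
# The docs select the number of clusters with the elbow method: compute total
# within-cluster sum of squares (WCSS) over a range of k and look for the bend
# where extra clusters give diminishing returns. `covariate_matrix` stands in
# for a scaled numeric matrix of the clustering covariates.
wcss <- sapply(2:15, function(k) {
  kmeans(covariate_matrix, centers = k, nstart = 10)$tot.withinss
})
plot(2:15, wcss, type = "b", xlab = "Number of clusters (k)", ylab = "Total WCSS")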
@@ -227,19 +227,11 @@ cluster_summary <- sites_clustered |> knitr::kable(cluster_summary, digits = 0) -# ANOVA based variable importance -anova_results <- sites_clustered |> - select(where(is.numeric)) |> - mutate(cluster = as.factor(cluster)) |> - aov(cluster ~ ., data = .) |> - broom::tidy() - # Plot all pairwise numeric variables library(GGally) ggpairs_plot <- sites_clustered |> - select(-site_id, -crop, -climregion_id) |> + select(-site_id) |> # need small # pfts for ggpairs - mutate(pft = ifelse(pft == "woody perennial crop", "woody perennial crop", "other")) |> sample_n(1000) |> ggpairs( columns = c(1, 2, 4, 5, 6) + 1, @@ -251,16 +243,28 @@ ggsave(ggpairs_plot, dpi = 300, width = 10, height = 10, units = "in" ) -cluster_plot <- ggplot(data = cluster_summary, aes(x = cluster)) + - geom_line(aes(y = temp, color = "temp")) + - geom_line(aes(y = precip, color = "precip")) + - geom_line(aes(y = clay, color = "clay")) + - geom_line(aes(y = ocd, color = "ocd")) + - geom_line(aes(y = twi, color = "twi")) + - labs(x = "Cluster", y = "Value", color = "Variable") -ggsave(cluster_plot, filename = "downscale/figures/cluster_summary.png", dpi = 300) -knitr::kable(cluster_summary |> round(0)) +# scale and reshape to long for plotting + +# Normalize the cluster summary data +scaled_cluster_summary <- cluster_summary |> + mutate(across(-cluster, scale)) |> + pivot_longer( + cols = -cluster, + names_to = "variable", + values_to = "value" + ) + +cluster_plot <- ggplot( + scaled_cluster_summary, + aes(x = factor(variable), y = value) +) + + geom_bar(stat = "identity", position = "dodge") + + facet_wrap(~cluster) + + coord_flip() + + labs(x = "Variable", y = "Normalized Value") + + theme_minimal() +ggsave(cluster_plot, filename = "downscale/figures/cluster_plot.png", dpi = 300) #' #' #### Stratification by Crops and Climate Regions @@ -269,26 +273,41 @@ knitr::kable(cluster_summary |> round(0)) # Check stratification of clusters by categorical factors # cols should be character, factor -crop_ids <- read_csv("data/crop_ids.csv", +crop_ids <- read_csv(file.path(data_dir, "crop_ids.csv"), col_types = cols( crop_id = col_factor(), crop = col_character()) ) -climregion_ids <- read_csv("data/climregion_ids.csv", +climregion_ids <- read_csv(file.path(data_dir, "climregion_ids.csv"), col_types = cols( climregion_id = col_factor(), climregion_name = col_character() )) -factor_stratification <- list( - crop_id = table(sites_clustered$cluster, sites_clustered$crop), - climregion_id = table(sites_clustered$cluster, sites_clustered$climregion_name)) -lapply(factor_stratification, knitr::kable) -# Shut down parallel backend -plan(sequential) +## ----stratification----------------------------------------------------------- +# The goal here is to check the stratification of the clusters by crop and climregion +# to ensure that the clusters are not dominated by a single crop or climregion +# BUT probably best to normalize by the total number of fields in each cluster +# if this is to be useful +# is this useful? 
+ +# ca_attributes <- read_csv(file.path(data_dir, "ca_field_attributes.csv")) +# site_covariates <- read_csv(file.path(data_dir, "site_covariates.csv")) + +# sites_clustered <- sites_clustered |> +# left_join(ca_attributes, by = "site_id") |> +# left_join(site_covariates, by = "site_id") +# factor_stratification <- list( +# crop_id = table(sites_clustered$cluster, sites_clustered$crop), +# climregion_id = table(sites_clustered$cluster, sites_clustered$climregion_name)) +# normalize <- function(x) { +# round(100 * x / rowSums(x), 1) +# } +# normalized_stratification <- lapply(factor_stratification, normalize) +# lapply(normalized_stratification, knitr::kable) #' #' ## Design Point Selection @@ -308,20 +327,21 @@ plan(sequential) #' #' Here we calculate percent of California croplands that are woody perennial crops, #' in order to estimate the number of design points that will be selected in the clustering step +#' This is commented out because it takes a while and the number won't change ## ----woody-proportion--------------------------------------------------------- ca_attributes <- read_csv(file.path(data_dir, "ca_field_attributes.csv")) ca_fields <- sf::st_read(file.path(data_dir, "ca_fields.gpkg")) -pft_area <- ca_fields |> - left_join(ca_attributes, by = "site_id") |> - dplyr::select(site_id, pft, area_ha) |> - dtplyr::lazy_dt() |> - dplyr::mutate(woody_indicator = ifelse(pft == "woody perennial crop", 1L, 0L)) |> - dplyr::group_by(woody_indicator) |> - dplyr::summarize(pft_area = sum(area_ha)) |> - # calculate percent of total area - dplyr::mutate(pft_area_pct = pft_area / sum(pft_area) * 100) - -knitr::kable(pft_area, digits = 0) +# pft_area <- ca_fields |> +# left_join(ca_attributes, by = "site_id") |> +# dplyr::select(site_id, pft, area_ha) |> +# dtplyr::lazy_dt() |> +# dplyr::mutate(woody_indicator = ifelse(pft == "woody perennial crop", 1L, 0L)) |> +# dplyr::group_by(woody_indicator) |> +# dplyr::summarize(pft_area = sum(area_ha)) |> +# # calculate percent of total area +# dplyr::mutate(pft_area_pct = pft_area / sum(pft_area) * 100) + +# knitr::kable(pft_area, digits = 0) # answer: 17% of California croplands were woody perennial crops in the # 2016 LandIQ dataset # So ... if we want to ultimately have 2000 design points, we should have ~ 400 @@ -331,10 +351,6 @@ knitr::kable(pft_area, digits = 0) ## ----design-point-selection--------------------------------------------------- # From the clustered data, remove anchor sites to avoid duplicates in design point selection. 
-if(!exists("ca_fields")) { - ca_fields <- sf::st_read(file.path(data_dir, "ca_fields.gpkg")) -} - set.seed(2222222) design_points_ids <- sites_clustered |> filter(!site_id %in% anchorsites_for_clust$site_id) |> @@ -352,8 +368,7 @@ design_points <- bind_rows(design_points_ids, design_points |> as_tibble() |> select(site_id, lat, lon) |> - write_csv("data/design_points.csv") - + write_csv(file.path(data_dir, "design_points.csv")) #' #' ### Design Point Map @@ -374,10 +389,11 @@ ca_fields_pts <- ca_fields |> st_as_sf(coords = c("lon", "lat"), crs = 4326) design_pt_plot <- ggplot() + - geom_sf(data = ca_climregions, aes(fill = climregion_name), alpha = 0.75) + - labs(color = "Climregion") + + geom_sf(data = ca_climregions, fill = 'white') + theme_minimal() + - geom_sf(data = ca_fields, fill = "black", color = "lightgrey", alpha = 0.25) + - geom_sf(data = design_points_clust, aes(shape = cluster)) + geom_sf(data = ca_fields, fill = "lightgrey", color = "lightgrey", alpha = 0.25) + + geom_sf(data = design_points_clust) + + geom_text(data = design_points_clust, aes(label = cluster, geometry = geometry), + size = 2, stat = "sf_coordinates") -ggsave(design_pt_plot, filename = "downscale/figures/design_points.png", dpi = 300, bg = "white") \ No newline at end of file +ggsave(design_pt_plot, filename = "downscale/figures/design_points.png", dpi = 300, bg = "white") diff --git a/downscale/03_downscale_and_agregate.R b/downscale/03_downscale_and_agregate.R index 0e653ce..91b7d73 100644 --- a/downscale/03_downscale_and_agregate.R +++ b/downscale/03_downscale_and_agregate.R @@ -176,81 +176,95 @@ co_preds_to_plot <- county_summaries |> cols = c(mean_total_c, sd_total_c, mean_c_density, sd_c_density), names_to = "stat", values_to = "value" - ) + ) |> + mutate(units = case_when( + str_detect(stat, "total_c") ~ "Carbon Stock (Tg)", + str_detect(stat, "c_density") ~ "Carbon Density (kg/m2)" + ), stat = case_when( + str_detect(stat, "mean") ~ "Mean", + str_detect(stat, "sd") ~ "SD" + )) # now plot map of county-level predictions with total carbon -p <- purrr::map(cpools, function(pool) { +units <- unique(co_preds_to_plot$units) +p <- purrr::map2(cpools, units, function(pool, unit) { .p <- ggplot( - co_preds_to_plot |> filter(carbon_pool == pool), + co_preds_to_plot |> filter(carbon_pool == pool & units == unit), aes(geometry = geom, fill = value) ) + geom_sf(data = county_boundaries, fill = "lightgrey", color = "black") + geom_sf() + scale_fill_viridis_c(option = "plasma") + theme_minimal() + + facet_grid(carbon_pool ~ stat) + labs( - title = paste0(pool, "-C by County"), - fill = "Total Carbon (Tg)" - ) + - facet_grid(~stat) + title = paste("County-Level Predictions for", pool, unit), + fill = "Value" + ) + + unit <- ifelse(unit == "Carbon Stock (Tg)", "stock", + ifelse(unit == "Carbon Density (kg/m2)", "density", NA) + ) + plotfile <- here::here("downscale/figures", paste0("county_", pool, "_carbon_", unit, ".png")) + print(plotfile) ggsave( plot = .p, - filename = here::here("downscale/figures",paste0("county_total_", pool, ".png")), + filename = plotfile, width = 10, height = 5, bg = "white" ) -return(.p) + return(.p) +}) + +# Variable Importance + +importance_summary <- map_dfr(cpools, function(cp) { + # Extract the importance for each ensemble model in the carbon pool + importances <- map(1:20, function(i) { + model <- downscale_output_list[[cp]][["model"]][[i]] + randomForest::importance(model)[, "%IncMSE"] + }) + + # Turn the list of importance vectors into a data frame + importance_df <- 
map_dfr(importances, ~ tibble(importance = .x), .id = "ensemble") |> + group_by(ensemble) |> + mutate(predictor = names(importances[[1]])) |> + ungroup() + + # Now summarize median and IQR for each predictor across ensembles + summary_df <- importance_df |> + group_by(predictor) |> + summarize( + median_importance = median(importance, na.rm = TRUE), + lcl_importance = quantile(importance, 0.25, na.rm = TRUE), + ucl_importance = quantile(importance, 0.75, na.rm = TRUE) + ) |> + mutate(carbon_pool = cp) + + summary_df }) -## Variable Importance - -# importance_summary <- map_dfr(cpools, function(cp) { -# # Extract the importance for each ensemble model in the carbon pool -# importances <- map(1:20, function(i) { -# model <- downscale_output_list[[cp]][["model"]][[i]] -# randomForest::importance(model)[, "%IncMSE"] -# }) - -# # Turn the list of importance vectors into a data frame -# importance_df <- map_dfr(importances, ~ tibble(importance = .x), .id = "ensemble") |> -# group_by(ensemble) |> -# mutate(predictor = names(importances[[1]])) |> -# ungroup() - -# # Now summarize median and IQR for each predictor across ensembles -# summary_df <- importance_df |> -# group_by(predictor) |> -# summarize( -# median_importance = median(importance, na.rm = TRUE), -# sd_importance = sd(importance, na.rm = TRUE) -# ) |> -# mutate(carbon_pool = cp) - -# summary_df -# }) - -# library(ggplot2) -# library(dplyr) - -# # Create the popsicle (lollipop) plot -# p <- ggplot(importance_summary, aes(x = reorder(predictor, median_importance), y = median_importance)) + -# geom_errorbar(aes(ymin = median_importance - sd_importance, ymax = median_importance + sd_importance), -# width = 0.2, color = "gray50" -# ) + -# geom_point(size = 4, color = "steelblue") + -# coord_flip() + -# facet_wrap(~carbon_pool, scales = "free_y") + -# labs( -# title = "Popsicle Plot of Variable Importance", -# x = "Predictor", -# y = "Median %IncMSE ( SD)" -# ) + -# theme_minimal() - -# ggsave(p, filename = here::here("downscale/figures", "importance_summary.png"), -# width = 10, height = 5, -# bg = "white" -# ) - -# print(p) +library(ggplot2) +library(dplyr) + +# Create the popsicle (lollipop) plot +p <- ggplot(importance_summary, aes(x = reorder(predictor, median_importance), y = median_importance)) + + geom_errorbar(aes(ymin = lcl_importance, ymax = ucl_importance), + width = 0.2, color = "gray50") + + geom_point(size = 4, color = "steelblue") + + coord_flip() + + facet_wrap(~carbon_pool, scales = "free_y") + + labs( + title = "Variable Importance", + x = "Predictor", + y = "Median Increase MSE (SD)" + ) + + theme_minimal() + +ggsave(p, filename = here::here("downscale/figures", "importance_summary.png"), + width = 10, height = 5, + bg = "white" +) + +print(p) diff --git a/downscale/04_downscaling_documentation_results.qmd b/downscale/04_downscaling_documentation_results.qmd index 0dbbe39..11399cd 100644 --- a/downscale/04_downscaling_documentation_results.qmd +++ b/downscale/04_downscaling_documentation_results.qmd @@ -16,20 +16,21 @@ It uses an ensemble based approach to uncertainty propagation and analysis. This multi-step approach facilitates uncertainty quantification by maintaining ensemble structure to propagate errors through the prediction and aggregation processes. **Definitions** + - **Design Points**: Fields chosen via stratified random sampling - using k-means clustering on environmental data layers - across all of California crop fields. - **Crop Fields**: These are all of the croplands in the LandIQ dataset. 
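During data preparation each anchor site is linked to a LandIQ field so that it shares the field's `site_id`. One way to express that spatial join with `sf` (an editor's sketch assuming a nearest-feature join is acceptable; the prepare script may implement this differently):

```r
library(sf)
# anchor_sites_pts: anchor-site points; ca_fields: LandIQ field polygons
anchor_sites_with_fields <- sf::st_join(
  anchor_sites_pts, ca_fields,
  join = sf::st_nearest_feature
)
```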
- **Anchor Sites:** These are sites that will be used as ground truth for calibration and validation. These include UC research stations and Ameriflux sites with high quality data. -## Steps +**Steps** 1. **Data Preparation**: Prepares data for clustering and downscaling process. 2. **Design Point Selection**: Uses k-means clustering to select a representative set of fields that represent environmental space, and add these to the anchor sites. -3. **SIPNET Model Runs**: Prepares inputs and runs SIPNET simulations for the design points. +3. **SIPNET Model Runs**: A separate workflow prepares inputs and runs SIPNET simulations for the design points. 4. **Extract SIPNET Output**: Extracts ensemble SIPNET outputs for the design points and converts them into NetCDF, array, and long formats. 5. **Downscale and Aggregate SIPNET Output**: Builds a Random Forest model for each ensemble member to predict SOC and AGB, downscales the predictions to the full set of fields, and aggregates predictions to county-level estimates to produce maps and summary statistics. -### Data Preparation +## Data Preparation ```sh Rscript downscale/00_prepare.R @@ -46,6 +47,7 @@ It reads in the LandIQ crop map, anchor sites, and environmental covariates, and **Inputs:** + - **LandIQ Crop Map** - `data_raw/i15_Crop_Mapping_2016_SHP/i15_Crop_Mapping_2016.shp`. This is a manually curated and harmonized version of the 2016 LandIQ crop map for California. - **Soilgrids** @@ -59,23 +61,27 @@ It reads in the LandIQ crop map, anchor sites, and environmental covariates, and - solar radiation - vapor pressure deficit - **Anchor Sites:** - - `data_raw/anchor_sites.csv`: anchor sites (sites with validation data) - - external_site_id: Ameriflux site ID or other unique ID, not to be confused with `site_id`. - - lat, lon, crops, pft. - + - `data_raw/anchor_sites.csv`: anchor sites (sites with validation data) + - external_site_id: Ameriflux site ID or other unique ID, not to be confused with `site_id`. + - lat, lon, crops, pft. Future design point selection factors will include management practices (crop, cropping system (cycle, mixtures), irrigation, tillage, C&N ammendments). **Outputs:** - -`ca_fields.gpkg` contains spatial information from LandIQ including: `site_id`, `lat`, `lon`, `area` (ha), `county`, and `geom`. Lat and Lon are centroids of the geom field. - - `ca_field_attributes.csv` contains site_id, lat, lon, year, crop, pft, source, and notes. The crop and pft associated with each field may differ from those in anchor sites, because they come from different sources. - - `site_covariates.csv` is a table of environmental covariates for each field in the LandIQ crop map. - - site_id, temp, precip, srad, vapr, crop, pft, clay, ocd, twi. - - `anchor_sites_ids.csv`. - - site_id, lat, lon, external_site_id, site_name, crops, pft. - - (note that lat, lon are the centroid of the field, not the original values in `data_raw/anchor_sites.csv`). -### Design Point Selection +- `ca_fields.gpkg` contains spatial information from LandIQ including: `site_id`, `lat`, `lon`, `area` (ha), `county`, and `geom`. Lat and Lon are centroids of the geom field. +- `ca_field_attributes.csv` contains site_id, lat, lon, year, crop, pft, source, and notes. The crop and pft associated with each field may differ from those in anchor sites, because they come from different sources. +- `site_covariates.csv` is a table of environmental covariates for each field in the LandIQ crop map. + - site_id, temp, precip, srad, vapr, crop, pft, clay, ocd, twi. 
+- `anchor_sites_ids.csv`. + - site_id, lat, lon, external_site_id, site_name, crops, pft. + - (note that lat, lon are the centroid of the field, not the original values in `data_raw/anchor_sites.csv`). + +Below is a map of the Anchor Sites and Climate Regions of California. + +![Anchor Sites](figures/anchor_sites.png) + +## Design Point Selection ```sh Rscript downscale/01_cluster_and_select_design_points.R @@ -88,6 +94,7 @@ Use k-means clustering to select a representative set of 75 fields that represen These are the sites where the SIPNET crop and biogeochemistry model will be run. Currently we are running SIPNET for 98 total sites, which includes 75 design points and 23 anchor sites. **Steps** + - Subsample LandIQ fields and include anchor sites for clustering. - Select cluster number based on the Elbow Method. - Plot within-cluster sum of squares (WCSS) against the number of clusters and identify the point where adding more clusters yields diminishing returns. @@ -96,30 +103,53 @@ These are the sites where the SIPNET crop and biogeochemistry model will be run. - Select design points from clusters for SIPNET simulation. **Inputs:** + - `data/site_covariates.csv` - `data/anchor_sites_ids.csv` **Output:** + - `data/design_points.csv`. -### SIPNET Model Runs + +**Results:** + +**A map of design points.** Showing their geographic distribution across California and relative to croplands. Grey areas are the LandIQ fields, and the boundaries are CalAdapt Climate Zones. + +![Clustered Design Points](figures/design_points.png) + + +The next two plots show the show the environmental characteristics of clusters - what makes them different, to help assess the clustering process. + +**Second is a pairs plot of the environmental covariates.** This plot shows the relationships between the covariates used for clustering, and colors indicate cluster membership. + +![Clustered Pairs Plot](figures/cluster_pairs.png) + +**Third is a summary of the normalized mean values of environmental covariates by cluster.** This plot illustrates the environmental characteristics of the clusters. + +![Cluster Summary](figures/cluster_plot.png) + +## SIPNET Model Runs These are produced by the Modeling Workflow -[TODO Describe; how do we integrate these] +[Link to Modeling Workflow] **Steps:** + - Prepare inputs and run SIPNET simulations for the design points. **Inputs** + - `design_points.csv` - Initial Conditions (described in modeling workflow) **Outputs:** + - `out/ENS--/YYYY.nc` - These are NetCDF files that contain the SIPNET outputs for each site in a standardized format. - Currently, is 1:20, and 98 values of identify the design points. For final runs, these numbers may be closer to 100 ensemble members and 10k design points. -### Extract SIPNET Output +## Extract SIPNET Output ```sh Rscript downscale/02_extract_sipnet_output.R @@ -127,15 +157,18 @@ Rscript downscale/02_extract_sipnet_output.R This step extracts ensemble SIPNET outputs for the design points and converts them into NetCDF, array, and long formats. +**Steps:** -Extracts output variables (AGB, TotSoilCarb) from SIPNET simulations: -- Aggregates site-level ensemble outputs into long and 4D array formats. -- Saves CSV and NetCDF files for downstream use in formats that follow EFI standards for forecasts (Dietze et. al. 2023). +- Extract output variables (AGB, TotSoilCarb) from SIPNET simulations +- Aggregate site-level ensemble outputs into long and 4D array formats. 
+- Save CSV and NetCDF files for downstream use in formats that follow EFI standards for forecasts (Dietze et. al. 2023). **Inputs:** + - `out/ENS--/YYYY.nc` -**Outputs:** +**Outputs:** + - `out/efi_ens_long.csv`: long format. - `out/efi_forecast.nc`: arrays in NetCDF format @@ -146,55 +179,111 @@ Extracts output variables (AGB, TotSoilCarb) from SIPNET simulations: Rscript downscale/03_downscale_and_aggregate.R ``` +Builds a Random Forest model for each ensemble member of each output to predict SIPNET SOC and AGB. + **Steps:** -- Build a Random Forest model for each ensemble member to predict SOC and AGB. It uses design point ensemble outputs, with one model per ensemble. -- Downscale the predictions to the full set of fields. -- Aggregate predictions to county-level estimates to produce maps and summary statistics. -Uses Random Forest to predict SOC and biomass for all woody crop fields: -- Trains models on design point ensembles. -- Predicts for all fields and aggregates to county-level. -- Outputs maps and statistics of carbon density and totals. +- Train models on SIPNET ensemble runs at design point in order to predict for all California fields. +- Use environmental covariates extracted in the data preparation step to downscale the predictions to the full set of fields, including all woody crop fields. +- Aggregate predictions to county-level estimates to produce maps and summary statistics. +- Output maps and statistics of carbon density and totals. **Inputs:** -- `out/efi_ens_long.csv`: long format SIPNET outputs. + +- `out/efi_ens_long.csv`: long format SIPNET outputs for ensemble runs at design points. +- `data/site_covariates.csv`: environmental covariates for each field in the LandIQ crop map. **Outputs:** + - County-level statistics for each variable as tables and maps. - `out/county_total_AGB.png`: county-level AGB predictions. - `out/county_total_TotSoilCarb.png`: county-level SOC predictions. - `out/county_summaries.csv`: summary statistics for each county. +## Results +**County Carbon Stocks and Densities** +### County-Level Carbon Stock and Density Maps -## Results and Analysis +The following maps illustrate the spatial variation and uncertainty (mean and standard deviation) of the predicted carbon pools at the county level. Each map is accompanied by a descriptive title to ensure clarity. -### County Carbon Maps +#### County Carbon Stock for TotSoilCarb -![County Carbon Map for TotSoilCarb](downscale/figures/county_total_TotSoilCarb.png) -![County Carbon Map for AGB](downscale/figures/county_total_AGB.png) +![](figures/county_TotSoilCarb_carbon_stock.png) -The maps above display the spatial variation and uncertainty (mean and standard deviation) of the predicted carbon pools at the county level. +#### County Carbon Density for TotSoilCarb -## Additional EDA +![](figures/county_TotSoilCarb_carbon_density.png) -(Include further exploratory data analysis as desired.) +#### County Carbon Stock for AGB -## Discussion and Future Work +![](figures/county_AGB_carbon_stock.png) -This workflow lays the foundation for ensemble-based downscaling of ecosystem carbon stocks. Future enhancements may include: -- Incorporating alternative modeling approaches (e.g., CNNs) for improved prediction accuracy. -- Expanding the analysis to include additional carbon pools and environmental variables. -- Refining the aggregation methods to better capture spatial covariance and error propagation. -- Integrating more real-time data sources for dynamic model updates. 
+#### County Carbon Density for AGB -By maintaining an ensemble framework and detailed uncertainty quantification, this workflow aims to support robust carbon accounting in complex landscapes. +![](figures/county_AGB_carbon_density.png) -# Conclusion +### Searchable Table -(Conclude with final remarks, potential applications, and next steps.) +The table below provides a searchable summary of the county-level carbon stocks and densities. +```{r} +outdir <- "/projectnb/dietzelab/ccmmf/ccmmf_phase_1b_20250319064759_14859/output/out" +# Load county summaries data +county_summaries <- read.csv(file.path(outdir, "county_summaries.csv")) + +# Combine mean and SD into a single column for carbon density +county_summaries <- county_summaries |> + dplyr::mutate( + `Mean Total C (Tg/ county)` = paste0( + signif(mean_total_c, 2), + " (", signif(sd_total_c, 2), ")" + ), + `Mean C Density (kg/ha)` = paste0( + signif(mean_c_density, 2), + " (", signif(sd_c_density, 2), ")" + ) + ) |> + dplyr::rename( + `Carbon Pool` = carbon_pool, + `County` = county + ) |> + dplyr::select(`Carbon Pool`, `County`, `Mean Total C (Tg/ county)`, `Mean C Density (kg/ha)`) + +DT::datatable( + county_summaries, + options = list( + pageLength = 10, + searchHighlight = TRUE + ), + rownames = FALSE +) +``` # References +**EFI Standards** + Dietze, Michael C., R. Quinn Thomas, Jody Peters, Carl Boettiger, Gerbrand Koren, Alexey N. Shiklomanov, and Jaime Ashander. 2023. “A Community Convention for Ecological Forecasting: Output Files and Metadata Version 1.0.” Ecosphere 14 (11): e4686. https://doi.org/10.1002/ecs2.4686. + +**LandIQ Crop Map** + +Land IQ, LLC. California Crop Mapping (2014). California Department of Water Resources, 2017. https://gis.water.ca.gov/app/CADWRLandUseViewer/. + +**SoilGrids250m** + +Hengl, T. et al. 2017. “SoilGrids250m: Global Gridded Soil Information Based on Machine Learning.” PLoS ONE 12(2): e0169748. https://doi.org/10.1371/journal.pone.0169748 + +**ERA5 Climate Data** + +Hersbach, H. et al. 2020. “The ERA5 Global Reanalysis.” Quarterly Journal of the Royal Meteorological Society 146: 1999–2049. https://doi.org/10.1002/qj.3803 + +**SIPNET** + +Braswell, Bobby H., William J. Sacks, Ernst Linder, and David S. Schimel. 2005. “Estimating Diurnal to Annual Ecosystem Parameters by Synthesis of a Carbon Flux Model with Eddy Covariance Net Ecosystem Exchange Observations.” Global Change Biology 11 (2): 335–55. https://doi.org/10.1111/j.1365-2486.2005.00897.x. + +Sacks, William J., David S. Schimel, Russell K. Monson, and Bobby H. Braswell. 2006. “Model‐data Synthesis of Diurnal and Seasonal CO2 Fluxes at Niwot Ridge, Colorado.” Global Change Biology 12 (2): 240–59. https://doi.org/10.1111/j.1365-2486.2005.01059.x. + +**Random Forest** + +Liaw, Andy, and Matthew Wiener. 2002. “Classification and Regression by randomForest.” R News 2 (3): 18–22. https://CRAN.R-project.org/doc/Rnews/. 
\ No newline at end of file From 273f8619880b1dbee19da558fccd1f12057ba47e Mon Sep 17 00:00:00 2001 From: David LeBauer Date: Fri, 21 Mar 2025 23:26:40 -0400 Subject: [PATCH 26/49] update qmd docs to generate self-contained html --- .../04_downscaling_documentation_results.qmd | 26 ++++++++++++++----- 1 file changed, 20 insertions(+), 6 deletions(-) diff --git a/downscale/04_downscaling_documentation_results.qmd b/downscale/04_downscaling_documentation_results.qmd index 11399cd..ae703d6 100644 --- a/downscale/04_downscaling_documentation_results.qmd +++ b/downscale/04_downscaling_documentation_results.qmd @@ -2,12 +2,21 @@ title: "Downscaling Workflow Documentation" author: "David LeBauer" date: "`r Sys.Date()`" -format: html +format: + html: + self-contained: true + embed-resources: true + df-print: paged + toc: true execute: echo: false --- -# Downscaling Workflow Overview + + +# Overview The downscaling workflow is used to predict carbon pools (Soil Organic Carbon and Aboveground Biomass) for cropland fields in California and then aggregate these predictions to the county scale. @@ -17,9 +26,9 @@ This multi-step approach facilitates uncertainty quantification by maintaining e **Definitions** - - **Design Points**: Fields chosen via stratified random sampling - using k-means clustering on environmental data layers - across all of California crop fields. - - **Crop Fields**: These are all of the croplands in the LandIQ dataset. - - **Anchor Sites:** These are sites that will be used as ground truth for calibration and validation. These include UC research stations and Ameriflux sites with high quality data. +- **Design Points**: Fields chosen via stratified random sampling - using k-means clustering on environmental data layers - across all of California crop fields. +- **Crop Fields**: These are all of the croplands in the LandIQ dataset. +- **Anchor Sites:** These are sites that will be used as ground truth for calibration and validation. These include UC research stations and Ameriflux sites with high quality data. **Steps** @@ -250,13 +259,18 @@ county_summaries <- county_summaries |> ) |> dplyr::select(`Carbon Pool`, `County`, `Mean Total C (Tg/ county)`, `Mean C Density (kg/ha)`) +# Create Table +htmlwidgets::setWidgetIdSeed(123) # required to embed table self-contained in html +options(htmlwidgets.TEMP_DIR = "htmlwidgets") + DT::datatable( county_summaries, options = list( pageLength = 10, searchHighlight = TRUE ), - rownames = FALSE + rownames = FALSE, + escape = FALSE ) ``` From a5259d1d80cf7e7c98c64a56b2f5f423c40db1df Mon Sep 17 00:00:00 2001 From: David LeBauer Date: Fri, 21 Mar 2025 23:34:22 -0400 Subject: [PATCH 27/49] Revert README changes; remove _02_design_point_simulations, sipnetwopet.R, and _targets.R will move _targets.R to new branch --- README.md | 28 --------- _targets.R | 42 ------------- downscale/_02_design_point_simulations.R | 80 ------------------------ downscale/sipnetwopet.R | 60 ------------------ 4 files changed, 210 deletions(-) delete mode 100644 _targets.R delete mode 100644 downscale/_02_design_point_simulations.R delete mode 100644 downscale/sipnetwopet.R diff --git a/README.md b/README.md index bd382b7..aff3412 100644 --- a/README.md +++ b/README.md @@ -1,31 +1,3 @@ # Workflow files for CCMMF deliverables More to come here presently. Meanwhile see the READMEs for individual workflow folders. - -This workflow is divided into three steps: - -- prparation, -- execution, -- analysis - -## Data - -Data is stored in the `data/` folder. 
- -It includes the following files: - -Workflow Inputs -├── ca_woody.gpkg -├── anchor_sites.csv - -data/ -├── ca_climregions.rda -├── ca_convex_hull.geojson -├── ca_convex_hull_reduced.geojson -├── ca_field_attributes.csv -├── ca_fields.gpkg -├── caladapt_domain_convex_hull.geojson -├── ca_woody.gpkg -└── yolo_county_polygon_simplified.geojson -cache/ -├── data_for_clust_with_ids.rda diff --git a/_targets.R b/_targets.R deleted file mode 100644 index 1c19c24..0000000 --- a/_targets.R +++ /dev/null @@ -1,42 +0,0 @@ -library(targets) -library(tarchetypes) - -tar_option_set( - packages = c( - "tidyverse", "dplyr", "sf", "terra", - "randomForest", "keras3", "PEcAn.all", "caladaptr" - ) -) - -list( - tar_target(prepare_data, { - source("downscale/00-prepare.R") - data_for_clust_with_ids # output from 00-prepare.R - }), - tar_target(cluster_sites, - { - source("downscale/01_cluster_and_select_design_points.R") - cluster_output # output from 01_cluster_and_select_design_points.R - }, - deps = prepare_data - ), - tar_target(simulations, - { - source("downscale/02_extract_sipnet_output.R") - design_point_wide # output from 02-design_point_simulations.R - }, - deps = cluster_sites - ), - tar_target(downscale, - { - source("downscale/03_downscale_and_agregate.R") - ensemble_data # output from 03_downscale_and_agregate.R - }, - deps = simulations - ), - tar_quarto( - analysis_report, - path = "04-analysis.qmd", - deps = list(simulations, downscale) - ) -) diff --git a/downscale/_02_design_point_simulations.R b/downscale/_02_design_point_simulations.R deleted file mode 100644 index ff1ea55..0000000 --- a/downscale/_02_design_point_simulations.R +++ /dev/null @@ -1,80 +0,0 @@ -#' --- -#' title: "Design Point Selection" -#' author: "David LeBauer" -#' --- -#' -#' # Overview -#' -#' In the future, this workflow will: -#' -#' - Use SIPNET to simulate SOC and biomass for each design point. -#' - Generate a dataframe with site_id, lat, lon, soil carbon, biomass -#' - (Maybe) use SIPNETWOPET to evaluate downscaling model skill? -#' -#' Curently, we will use a surrogate model, SIPNETWOPET, to simulate SOC and biomass for each design point. -#' -#' ## SIPNETWOPET [surrogate model] -#' -#' -#' -#' ## SIPNETWOPET Simulation of Design Points -#' -#' We introduce a new model, SIPNETWOPET, the "Simpler Photosynthesis and EvapoTranspiration model, WithOut Photosynthesis and EvapoTranspiration". 
-#' -## ----------------------------------------------------------------------------- -library(tidyverse) -source("downscale/sipnetwopet.R") - -#' -#' -#' ### Join Design Points with Covariates -#' -## ----------------------------------------------------------------------------- -design_points <- read_csv("data/design_points.csv") |> distinct() -covariates <- load("data/data_for_clust_with_ids.rda") |> get() - -# Remove duplicate entries using 'id' -covariates_df <- sf::st_drop_geometry(covariates) - - -design_point_covs <- design_points |> - left_join(covariates_df, by = "id") - -#' -#' ### Run SIPNETWOPET -#' -## ----------------------------------------------------------------------------- -set.seed(8675.309) -design_point_results <- design_point_covs |> - dplyr::rowwise() |> - dplyr::mutate(result = list(sipnetwopet(temp, precip, clay, ocd, twi))) |> - tidyr::unnest(result) |> - dplyr::select(id, ensemble_id, SOC = soc, AGB = agb) - -# Convert design_point_results into arrays using pivot_wider and as.array -arr_soc_matrix <- design_point_results |> - select(id, ensemble_id, SOC) |> - pivot_wider(names_from = ensemble_id, values_from = SOC) |> - column_to_rownames("id") |> - as.matrix() - -arr_soc <- as.array(arr_soc_matrix) -dim(arr_soc) <- c(1, nrow(arr_soc_matrix), ncol(arr_soc_matrix)) -dimnames(arr_soc) <- list(datetime = "2020-01-01", - site = rownames(arr_soc_matrix), - ensemble = colnames(arr_soc_matrix)) - -arr_agb_matrix <- design_point_results |> - select(id, ensemble_id, AGB) |> - pivot_wider(names_from = ensemble_id, values_from = AGB) |> - column_to_rownames("id") |> - as.matrix() - -arr_agb <- as.array(arr_agb_matrix) -dim(arr_agb) <- c(1, nrow(arr_agb_matrix), ncol(arr_agb_matrix)) -dimnames(arr_agb) <- list(datetime = "2020-01-01", - site = rownames(arr_agb_matrix), - ensemble = colnames(arr_agb_matrix)) - -ensemble_arrays <- list(SOC = arr_soc, AGB = arr_agb) -saveRDS(ensemble_arrays, "cache/efi_ensemble_arrays.rds") diff --git a/downscale/sipnetwopet.R b/downscale/sipnetwopet.R deleted file mode 100644 index 64cbc9a..0000000 --- a/downscale/sipnetwopet.R +++ /dev/null @@ -1,60 +0,0 @@ -#' SIPNETWOPET, the "Simpler Photosynthesis and EvapoTranspiration model, -#' WithOut Photosynthesis and EvapoTranspiration" -#' -#' This function simulates soil organic carbon (SOC) and aboveground -#' biomass (AGB) using the SIPNETWOPET model. It is a surrogate for -#' SIPNET, a process-based model that simulates the carbon and water. -#' It can generate ensemble predictions for SOC and AGB and has its own -#' internal stochastic model. SIPNETWOPET promises rough -#' relationships between environmental variables and SOC and AGB. -#' -#' @param temp Mean annual temperature (degrees Celsius) -#' @param precip Mean annual precipitation (mm) -#' @param clay Clay content (%) -#' @param ocd Organic carbon density (g/cm^3) -#' @param twi Topographic wetness index -#' @param ensemble_size Number of ensemble predictions to generate (default 10) -#' -#' @return A tibble with columns \code{ensemble_id}, \code{soc}, and \code{agb} -#' for each ensemble member. Unspecified units. 
-#' -#' @author David LeBauer -#' -#' @examples -#' @examples -#' sipnetwopet(temp = 22, precip = 4500, clay = 25, ocd = 0.3, twi = 9, ensemble_size = 5) -#' -sipnetwopet <- function( - temp, precip, clay, ocd, twi, ensemble_size = 10) { - ensemble_results <- list() - for (i in seq_len(ensemble_size)) { - # Manually scale inputs using predefined dataset statistics - scaled_temp <- (temp - 20) / 2 - scaled_precip <- (precip - 5000) / 2000 - scaled_clay <- (clay - 20) / 6 - scaled_ocd <- (ocd - 300) / 60 - scaled_twi <- (twi - 10) / 2 - - # Add stochastic variation - scaled_temp <- scaled_temp * rnorm(1, 1, 0.1) - scaled_precip <- scaled_precip * rnorm(1, 1, 0.1) - scaled_clay <- scaled_clay * rnorm(1, 1, 0.1) - scaled_ocd <- scaled_ocd * rnorm(1, 1, 0.1) - scaled_twi <- scaled_twi * rnorm(1, 1, 0.1) - - # Simulate SOC with environmental effects and bounds - .soc <- 80 + 15 * scaled_precip + 12 * scaled_temp + 50 * scaled_ocd + 15 * scaled_clay + 8 * scaled_twi + - rnorm(1, 0, 10) - soc <- max(90 * (.soc / (100 + abs(.soc))), rlnorm(1, meanlog = log(50), sdlog = 0.3)) - - # Simulate AGB with environmental effects and bounds - .agb <- 120 + 25 * scaled_temp + 35 * scaled_precip + 10 * scaled_clay - - 8 * scaled_twi + rnorm(1, 0, 15) - agb <- max(450 * (.agb / (500 + abs(.agb))), rlnorm(1, meanlog = log(20), sdlog = 0.4)) - - ensemble_results[[i]] <- tibble::tibble(soc = soc, agb = agb) - } - - ensemble_data <- dplyr::bind_rows(ensemble_results, .id = "ensemble_id") - return(ensemble_data) -} From 65cd54b11ce9b49c71e3c77cc70e5ee9296575d6 Mon Sep 17 00:00:00 2001 From: David LeBauer Date: Sat, 22 Mar 2025 00:09:57 -0400 Subject: [PATCH 28/49] changed from local install_git to remote install_github --- downscale/00-prepare.R | 4 ++-- downscale/03_downscale_and_agregate.R | 17 +++++++++++++---- 2 files changed, 15 insertions(+), 6 deletions(-) diff --git a/downscale/00-prepare.R b/downscale/00-prepare.R index 74357f6..2134397 100644 --- a/downscale/00-prepare.R +++ b/downscale/00-prepare.R @@ -63,8 +63,8 @@ ca_albers_crs <- 3310 ## Convert SHP to Geotiff` ## Required until PR 3423 is merged https://github.com/PecanProject/pecan/pull/3423 # check if PR is merged -# devtools::install_github("dlebauer/pecan", -devtools::install_git("../pecan", +devtools::install_github("dlebauer/pecan", +#devtools::install_git("../pecan", ref = "shp2gpkg", subdir = "modules/data.land", upgrade = FALSE diff --git a/downscale/03_downscale_and_agregate.R b/downscale/03_downscale_and_agregate.R index 91b7d73..15cfa0d 100644 --- a/downscale/03_downscale_and_agregate.R +++ b/downscale/03_downscale_and_agregate.R @@ -26,8 +26,10 @@ plan(multicore, workers = no_cores - 1) # while developing PEcAn: # devtools::load_all(here::here("../pecan/modules/assim.sequential/")) -# remotes::install_git("dlebauer/pecan@ensemble_downscaling", subdir = "modules/assim.sequential") -remotes::install_git("../pecan@ensemble_downscaling", subdir = "modules/assim.sequential", upgrade = FALSE) +#remotes::install_git("../pecan@ensemble_downscaling", +remotes::install_github("dlebauer/pecan@ensemble_downscaling", + subdir = "modules/assim.sequential", upgrade = FALSE +) library(PEcAnAssimSequential) datadir <- "/projectnb/dietzelab/ccmmf/data" basedir <- "/projectnb/dietzelab/ccmmf/ccmmf_phase_1b_20250319064759_14859" @@ -186,8 +188,15 @@ co_preds_to_plot <- county_summaries |> )) # now plot map of county-level predictions with total carbon -units <- unique(co_preds_to_plot$units) -p <- purrr::map2(cpools, units, function(pool, unit) { +units <- 
rep(unique(co_preds_to_plot$units), each = length(cpools)) +pool_x_units <- co_preds_to_plot |> + select(carbon_pool, units) |> + distinct() |> + # remove na + filter(!is.na(carbon_pool)) |> # why is one field in SF county NA? + arrange(carbon_pool, units) + +p <- purrr::map2(pool_x_units$carbon_pool, pool_x_units$units, function(pool, unit) { .p <- ggplot( co_preds_to_plot |> filter(carbon_pool == pool & units == unit), aes(geometry = geom, fill = value) From 72256b44f31694498e4e348c53243a88e0ccd45c Mon Sep 17 00:00:00 2001 From: David LeBauer Date: Fri, 4 Apr 2025 23:56:38 -0700 Subject: [PATCH 29/49] fix wording --- downscale/04_downscaling_documentation_results.qmd | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/downscale/04_downscaling_documentation_results.qmd b/downscale/04_downscaling_documentation_results.qmd index ae703d6..4da481f 100644 --- a/downscale/04_downscaling_documentation_results.qmd +++ b/downscale/04_downscaling_documentation_results.qmd @@ -47,7 +47,7 @@ Rscript downscale/00_prepare.R This script prepares data for clustering and downscaling process. -It reads in the LandIQ crop map, anchor sites, and environmental covariates, and creates a table of environmental covariates for each field in the LandIQ crop map. It also links a table of anchor sites to their corresponding LandIQ fields so that these can be used in downstreamm analyses. +It reads in the LandIQ crop map, anchor sites, and environmental covariates, and creates a table of environmental covariates for each field in the LandIQ crop map. It also links a table of anchor sites to their corresponding LandIQ fields so that these can be used in downstream analyses. - Converts LandIQ-derived shapefiles to a geopackage with geospatial information and a CSV with other attributes from the LandIQ dataset. - Extracts environmental covariates (clay, organic carbon, topographic wetness, temperature, precipitation, solar radiation, vapor pressure) for each field in the LandIQ dataset. @@ -98,7 +98,7 @@ Rscript downscale/01_cluster_and_select_design_points.R Use k-means clustering to select a representative set of 75 fields that represent environmental space, and add these to the 23 distinct[^1^] anchor sites. -[^1^]: There are 25 anchor sites but two have duplicate lat / lon coordinates. This will be addressed in future iterations analysis. +[^1^]: There are 25 anchor sites but two have duplicate lat / lon coordinates. This will be addressed in future iterations of the analysis. These are the sites where the SIPNET crop and biogeochemistry model will be run. Currently we are running SIPNET for 98 total sites, which includes 75 design points and 23 anchor sites. @@ -156,7 +156,7 @@ These are produced by the Modeling Workflow - `out/ENS--/YYYY.nc` - These are NetCDF files that contain the SIPNET outputs for each site in a standardized format. - - Currently, is 1:20, and 98 values of identify the design points. For final runs, these numbers may be closer to 100 ensemble members and 10k design points. + - Currently, is a value from 1 to 20 ensemble members, and there are 98 values of that identify the design points. For final runs, these numbers may be closer to 100 ensemble members and 10k design points. 
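One of these per-ensemble, per-site files can be inspected directly with `ncdf4` (an editor's sketch; the directory name below is a made-up example of the `ENS-` naming pattern, and `TotSoilCarb` is one of the standard output variables used later in this workflow):

```r
library(ncdf4)
nc <- nc_open("out/ENS-00001-1000000001/2018.nc") # hypothetical ensemble/site ids
names(nc$var)                      # variables stored in the file
soc <- ncvar_get(nc, "TotSoilCarb")
nc_close(nc)
```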
## Extract SIPNET Output @@ -214,7 +214,7 @@ Builds a Random Forest model for each ensemble member of each output to predict **County Carbon Stocks and Densities** ### County-Level Carbon Stock and Density Maps -The following maps illustrate the spatial variation and uncertainty (mean and standard deviation) of the predicted carbon pools at the county level. Each map is accompanied by a descriptive title to ensure clarity. +The following maps illustrate the spatial variation and uncertainty (mean and standard deviation) of the predicted carbon pools at the county level. #### County Carbon Stock for TotSoilCarb From 99ac120cd3a9d8e06f7ed7693019174349707965 Mon Sep 17 00:00:00 2001 From: David LeBauer Date: Mon, 7 Apr 2025 15:19:22 -0700 Subject: [PATCH 30/49] Update downscale/04_downscaling_documentation_results.qmd Co-authored-by: Chris Black --- downscale/04_downscaling_documentation_results.qmd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/downscale/04_downscaling_documentation_results.qmd b/downscale/04_downscaling_documentation_results.qmd index 4da481f..3d746e1 100644 --- a/downscale/04_downscaling_documentation_results.qmd +++ b/downscale/04_downscaling_documentation_results.qmd @@ -282,7 +282,7 @@ Dietze, Michael C., R. Quinn Thomas, Jody Peters, Carl Boettiger, Gerbrand Koren **LandIQ Crop Map** -Land IQ, LLC. California Crop Mapping (2014). California Department of Water Resources, 2017. https://gis.water.ca.gov/app/CADWRLandUseViewer/. +Land IQ, LLC. California Crop Mapping (2014). California Department of Water Resources, 2017. https://data.cnra.ca.gov/dataset/statewide-crop-mapping. **SoilGrids250m** From 5dcb76da7c84c561f29f80d53cfb83c5713cd15d Mon Sep 17 00:00:00 2001 From: David LeBauer Date: Tue, 8 Apr 2025 12:22:41 -0700 Subject: [PATCH 31/49] Add partial dependence plots --- downscale/03_downscale_and_agregate.R | 87 ++++++++++++++++--- .../04_downscaling_documentation_results.qmd | 14 +++ 2 files changed, 88 insertions(+), 13 deletions(-) diff --git a/downscale/03_downscale_and_agregate.R b/downscale/03_downscale_and_agregate.R index 15cfa0d..5eff9aa 100644 --- a/downscale/03_downscale_and_agregate.R +++ b/downscale/03_downscale_and_agregate.R @@ -10,7 +10,7 @@ #' This workflow will: #' #' - Use environmental covariates to predict SIPNET estimated SOC for each woody crop field in the LandIQ dataset -#' - Uses Random Forest [maybe change to CNN later] trained on site-scale model runs. +#' - Uses Random Forest [may change to CNN later] trained on site-scale model runs. #' - Build a model for each ensemble member #' - Write out a table with predicted biomass and SOC to maintain ensemble structure, ensuring correct error propagation and spatial covariance. 
#' - Aggregate County-level biomass and SOC inventories @@ -20,6 +20,7 @@ library(tidyverse) library(sf) library(terra) library(furrr) +library(patchwork) # for combining plots no_cores <- parallel::detectCores(logical = FALSE) plan(multicore, workers = no_cores - 1) @@ -226,8 +227,9 @@ p <- purrr::map2(pool_x_units$carbon_pool, pool_x_units$units, function(pool, un return(.p) }) -# Variable Importance +# Variable Importance and Partial Dependence Plots +# First, calculate variable importance summary as before importance_summary <- map_dfr(cpools, function(cp) { # Extract the importance for each ensemble model in the carbon pool importances <- map(1:20, function(i) { @@ -254,13 +256,9 @@ importance_summary <- map_dfr(cpools, function(cp) { summary_df }) -library(ggplot2) -library(dplyr) - -# Create the popsicle (lollipop) plot -p <- ggplot(importance_summary, aes(x = reorder(predictor, median_importance), y = median_importance)) + - geom_errorbar(aes(ymin = lcl_importance, ymax = ucl_importance), - width = 0.2, color = "gray50") + +# Create importance plot +p_importance <- ggplot(importance_summary, aes(x = reorder(predictor, median_importance), y = median_importance)) + + geom_errorbar(aes(ymin = lcl_importance, ymax = ucl_importance), width = 0.2, color = "gray50") + geom_point(size = 4, color = "steelblue") + coord_flip() + facet_wrap(~carbon_pool, scales = "free_y") + @@ -271,9 +269,72 @@ p <- ggplot(importance_summary, aes(x = reorder(predictor, median_importance), y ) + theme_minimal() -ggsave(p, filename = here::here("downscale/figures", "importance_summary.png"), - width = 10, height = 5, - bg = "white" +# Save importance plot +ggsave(p_importance, filename = here::here("downscale/figures", "importance_summary.png"), + width = 10, height = 5, bg = "white" ) -print(p) +# Now create and save combined importance + partial plots for each carbon pool +for (cp in cpools) { + # Find top 2 predictors for this carbon pool + top_predictors <- importance_summary |> + filter(carbon_pool == cp) |> + arrange(desc(median_importance)) |> + slice_head(n = 2) |> + pull(predictor) + + # Set up a 3-panel plot + png(filename = here::here("downscale/figures", paste0(cp, "_importance_partial_plots.png")), + width = 14, height = 6, units = "in", res = 300, bg = "white") + + par(mfrow = c(1, 3)) + + # Panel 1: Show only this carbon pool's importance plot + # Extract just this carbon pool's data + cp_importance <- importance_summary |> filter(carbon_pool == cp) + + # Create importance plot for just this carbon pool + par(mar = c(5, 10, 4, 2)) # Adjust margins for first panel + with(cp_importance, + dotchart(median_importance, + labels = reorder(predictor, median_importance), + xlab = "Median Increase MSE (SD)", + main = paste("Variable Importance -", cp), + pch = 19, col = "steelblue", cex = 1.2)) + + # Add error bars + with(cp_importance, + segments(lcl_importance, + seq_along(predictor), + ucl_importance, + seq_along(predictor), + col = "gray50")) + + # Panels 2 & 3: Create partial plots for top 2 predictors + model <- downscale_output_list[[cp]][["model"]][[1]] + + # First top predictor partial plot + par(mar = c(5, 5, 4, 2)) # Reset margins for other panels + randomForest::partialPlot(model, + pred.data = covariates, + x.var = top_predictors[1], + main = paste("Partial Dependence Plot for", top_predictors[1]), + xlab = top_predictors[1], + ylab = paste("Predicted", cp), + col = "steelblue", + lwd = 2) + + # Second top predictor partial plot + randomForest::partialPlot(model, + pred.data = covariates, + 
x.var = top_predictors[2], + main = paste("Partial Dependence Plot for", top_predictors[2]), + xlab = top_predictors[2], + ylab = paste("Predicted", cp), + col = "steelblue", + lwd = 2) + + dev.off() +} + +print(p_importance) diff --git a/downscale/04_downscaling_documentation_results.qmd b/downscale/04_downscaling_documentation_results.qmd index 4da481f..c52a6d7 100644 --- a/downscale/04_downscaling_documentation_results.qmd +++ b/downscale/04_downscaling_documentation_results.qmd @@ -232,6 +232,20 @@ The following maps illustrate the spatial variation and uncertainty (mean and st ![](figures/county_AGB_carbon_density.png) +### Variable Importance and Partial Dependence + +The following plots show the variable importance from the random forest models used for downscaling, along with partial dependence plots for the top two predictors. + +Variable importance quantifies how useful each covariate is in predicting the carbon stock. Partial dependence plots show the marginal effect of individual predictors on model response after averaging over the other predictors. + +#### TotSoilCarb - Importance and Partial Dependence + +![](figures/TotSoilCarb_importance_partial_plots.png) + +#### AGB - Importance and Partial Dependence + +![](figures/AGB_importance_partial_plots.png) + ### Searchable Table The table below provides a searchable summary of the county-level carbon stocks and densities. From 292a5c7883a510305265fe9fef9a7b6ce957c23e Mon Sep 17 00:00:00 2001 From: David LeBauer Date: Tue, 8 Apr 2025 16:25:14 -0700 Subject: [PATCH 32/49] Update downscale/02_extract_sipnet_output.R Co-authored-by: Chris Black --- downscale/02_extract_sipnet_output.R | 1 - 1 file changed, 1 deletion(-) diff --git a/downscale/02_extract_sipnet_output.R b/downscale/02_extract_sipnet_output.R index 66cba1d..0f8d968 100644 --- a/downscale/02_extract_sipnet_output.R +++ b/downscale/02_extract_sipnet_output.R @@ -29,7 +29,6 @@ design_points <- readr::read_csv(design_pt_csv, show_col_types = FALSE) |> rename(site_id = id) |> # fixed in more recent version of 01 script dplyr::distinct() -## end scratch # Variables to extract variables <- c("AGB", "TotSoilCarb") From e2f93017abeb61608b9d6cf78b9a2de51ef164fa Mon Sep 17 00:00:00 2001 From: David LeBauer Date: Wed, 9 Apr 2025 18:50:45 -0400 Subject: [PATCH 33/49] fixed typos --- .../{03_downscale_and_agregate.R => 03_downscale_and_aggregate.R} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename downscale/{03_downscale_and_agregate.R => 03_downscale_and_aggregate.R} (100%) diff --git a/downscale/03_downscale_and_agregate.R b/downscale/03_downscale_and_aggregate.R similarity index 100% rename from downscale/03_downscale_and_agregate.R rename to downscale/03_downscale_and_aggregate.R From cd973d332121fa179b534d4f4f6eb5a798ad1319 Mon Sep 17 00:00:00 2001 From: David LeBauer Date: Wed, 9 Apr 2025 19:34:35 -0400 Subject: [PATCH 34/49] reverting name change so I don't loose comments --- .../{03_downscale_and_aggregate.R => 03_downscale_and_agregate.R} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename downscale/{03_downscale_and_aggregate.R => 03_downscale_and_agregate.R} (100%) diff --git a/downscale/03_downscale_and_aggregate.R b/downscale/03_downscale_and_agregate.R similarity index 100% rename from downscale/03_downscale_and_aggregate.R rename to downscale/03_downscale_and_agregate.R From fa0a8604515a7d318c13d3b966110cb9f43733a0 Mon Sep 17 00:00:00 2001 From: David LeBauer Date: Wed, 9 Apr 2025 17:55:25 -0700 Subject: [PATCH 35/49] Update downscale/00-prepare.R 
Co-authored-by: Chris Black --- downscale/00-prepare.R | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/downscale/00-prepare.R b/downscale/00-prepare.R index 2134397..7ecc322 100644 --- a/downscale/00-prepare.R +++ b/downscale/00-prepare.R @@ -200,7 +200,8 @@ extract_clim <- function(raster, points_sf) { terra::extract( raster, points_sf |> - sf::st_transform(crs = sf::st_crs(raster))) |> + sf::st_transform(crs = sf::st_crs(raster)) + ) |> tibble::as_tibble() |> select(-ID) |> mutate(site_id = points_sf$site_id) |> From 2c436684c5c6448fd3cb0fb30792b7017d14a7ab Mon Sep 17 00:00:00 2001 From: David LeBauer Date: Wed, 9 Apr 2025 20:58:21 -0400 Subject: [PATCH 36/49] addressing PR review feedback --- downscale/00-prepare.R | 66 +++++-------- downscale/03_downscale_and_agregate.R | 8 +- .../04_downscaling_documentation_results.qmd | 94 ++++++++++++++++++- 3 files changed, 113 insertions(+), 55 deletions(-) diff --git a/downscale/00-prepare.R b/downscale/00-prepare.R index 2134397..078649f 100644 --- a/downscale/00-prepare.R +++ b/downscale/00-prepare.R @@ -20,24 +20,11 @@ #' - Clean up domain code #' - Create a bunch of tables and join all at once at the end #' - Disambiguate the use of 'ca' in object names; currently refers to both California and Cal-Adapt -#' -#' ## Install & Load PEcAn -#' -#' See https://pecanproject.github.io/documentation/develop/ -#' -## ----------------------------------------------------------------------------- - -# options(repos = c( -# pecanproject = 'https://pecanproject.r-universe.dev', -# ropensci = 'https://ropensci.r-universe.dev', -# CRAN = 'https://cloud.r-project.org')) library(PEcAn.all) library(tidyverse) library(sf) library(terra) - -# remotes::install_github("UCANR-IGIS/caladaptr") library(caladaptr) ## Check available compute resources and set up parallel processing @@ -61,16 +48,8 @@ ca_albers_crs <- 3310 #' The landiq2std function will be added to the PEcAn.data.land package, and has been implemented in a Pull Request https://github.com/PecanProject/pecan/pull/3423. The function is a work in progress. Two key work to be done. First `landiq2std` does not currently perform all steps to get from the original LandIQ format to the standard format - some steps related to harmonizing LandIQ across years have been completed manually. Second, the PEcAn 'standard' for such data is under development as we migrate from a Postgres database to a more portable GeoPackage + CSV format. 
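# Illustrative availability check (not part of the committed script): until
# PR 3423 is merged, landiq2std comes from PEcAn.data.land on the shp2gpkg
# branch, so it is worth failing early if the function cannot be found.
# Only packages and functions already referenced in this workflow are used.
if (!exists("landiq2std")) {
  if (requireNamespace("PEcAn.data.land", quietly = TRUE) &&
      "landiq2std" %in% getNamespaceExports("PEcAn.data.land")) {
    landiq2std <- PEcAn.data.land::landiq2std
  } else {
    PEcAn.logger::logger.severe(
      "landiq2std not found; install PEcAn.data.land from the shp2gpkg branch (see README software section)."
    )
  }
}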
#' ## Convert SHP to Geotiff` -## Required until PR 3423 is merged https://github.com/PecanProject/pecan/pull/3423 -# check if PR is merged -devtools::install_github("dlebauer/pecan", -#devtools::install_git("../pecan", - ref = "shp2gpkg", - subdir = "modules/data.land", - upgrade = FALSE -) -devtools::load_all("../pecan/modules/data.land/") +# if these functions aren't available, see software dependency docs input_file = file.path(raw_data_dir, 'i15_Crop_Mapping_2016_SHP/i15_Crop_Mapping_2016.shp') ca_fields_gpkg <- file.path(data_dir, 'ca_fields.gpkg') ca_attributes_csv = file.path(data_dir, 'ca_field_attributes.csv') @@ -91,8 +70,9 @@ ca_attributes <- readr::read_csv(ca_attributes_csv) #' ## ----------------------------------------------------------------------------- ca_fields |> - left_join(ca_attributes |> select(site_id, pft), by = c("site_id")) |> filter(pft == "woody perennial crop") |> + left_join(ca_attributes |> + select(site_id, pft), by = c("site_id")) |> sf::st_transform(crs = ca_albers_crs) |> dplyr::select(site_id, geom) |> sf::st_write(file.path(data_dir, 'ca_woody.gpkg'), @@ -196,35 +176,35 @@ years <- purrr::map_chr(rasters_list, ~ { names(rasters_list) <- years -extract_clim <- function(raster, points_sf) { - terra::extract( +extract_clim <- function(raster, points_sf) { + terra::extract( raster, points_sf |> - sf::st_transform(crs = sf::st_crs(raster))) |> - tibble::as_tibble() |> - select(-ID) |> - mutate(site_id = points_sf$site_id) |> - select(site_id, temp, prec, srad, vapr) -} + sf::st_transform(crs = sf::st_crs(raster)) + ) |> + tibble::as_tibble() |> + select(-ID) |> + mutate(site_id = points_sf$site_id) |> + select(site_id, temp, prec, srad, vapr) +} .tmp <- rasters_list |> furrr::future_map_dfr( ~ extract_clim(.x, ca_fields_pts), - .id = "year", - .options = furrr::furrr_options(seed = 123)) + .id = "year" clim_summaries <- .tmp |> dplyr::mutate( - precip = PEcAn.utils::ud_convert(prec, "second-1", "year-1") -) |> - dplyr::group_by(site_id) |> - dplyr::summarise( - temp = mean(temp), - precip = mean(precip), - srad = mean(srad), - vapr = mean(vapr) - ) - +extract_clim <- function(raster, points_sf) { + terra::extract( + raster, + points_sf |> + sf::st_transform(crs = sf::st_crs(raster)), + bind = TRUE + ) |> + select(site_id, temp, prec, srad, vapr) +} +} #' ## ----join_and_subset---------------------------------------------------------- .all <- clim_summaries |> diff --git a/downscale/03_downscale_and_agregate.R b/downscale/03_downscale_and_agregate.R index 5eff9aa..e603cf7 100644 --- a/downscale/03_downscale_and_agregate.R +++ b/downscale/03_downscale_and_agregate.R @@ -1,5 +1,5 @@ #' --- -#' Title: Downscale and Agregate Woody Crop SOC stocks +#' Title: Downscale and Aggregate Woody Crop SOC stocks #' author: "David LeBauer" #' --- #' @@ -25,12 +25,6 @@ library(patchwork) # for combining plots no_cores <- parallel::detectCores(logical = FALSE) plan(multicore, workers = no_cores - 1) -# while developing PEcAn: -# devtools::load_all(here::here("../pecan/modules/assim.sequential/")) -#remotes::install_git("../pecan@ensemble_downscaling", -remotes::install_github("dlebauer/pecan@ensemble_downscaling", - subdir = "modules/assim.sequential", upgrade = FALSE -) library(PEcAnAssimSequential) datadir <- "/projectnb/dietzelab/ccmmf/data" basedir <- "/projectnb/dietzelab/ccmmf/ccmmf_phase_1b_20250319064759_14859" diff --git a/downscale/04_downscaling_documentation_results.qmd b/downscale/04_downscaling_documentation_results.qmd index 66c0c33..f2ffe34 100644 --- 
a/downscale/04_downscaling_documentation_results.qmd +++ b/downscale/04_downscaling_documentation_results.qmd @@ -33,11 +33,11 @@ This multi-step approach facilitates uncertainty quantification by maintaining e **Steps** -1. **Data Preparation**: Prepares data for clustering and downscaling process. -2. **Design Point Selection**: Uses k-means clustering to select a representative set of fields that represent environmental space, and add these to the anchor sites. -3. **SIPNET Model Runs**: A separate workflow prepares inputs and runs SIPNET simulations for the design points. -4. **Extract SIPNET Output**: Extracts ensemble SIPNET outputs for the design points and converts them into NetCDF, array, and long formats. -5. **Downscale and Aggregate SIPNET Output**: Builds a Random Forest model for each ensemble member to predict SOC and AGB, downscales the predictions to the full set of fields, and aggregates predictions to county-level estimates to produce maps and summary statistics. +* **Data Preparation**: Prepares data for clustering and downscaling process. +* **Design Point Selection**: Uses k-means clustering to select a representative set of fields that represent environmental space, and add these to the anchor sites. +* **SIPNET Model Runs**: A separate workflow prepares inputs and runs SIPNET simulations for the design points. +* **Extract SIPNET Output**: Extracts ensemble SIPNET outputs for the design points and converts them into NetCDF, array, and long formats. +* **Downscale and Aggregate SIPNET Output**: Builds a Random Forest model for each ensemble member to predict SOC and AGB, downscales the predictions to the full set of fields, and aggregates predictions to county-level estimates to produce maps and summary statistics. ## Data Preparation @@ -54,6 +54,90 @@ It reads in the LandIQ crop map, anchor sites, and environmental covariates, and - Groups fields into Cal-Adapt climate regions. - Assigns anchor sites to fields. +**Software Requirements:** + +**Software:** + +Modules + +```{bash eval = FALSE} +# Load system modules +module load R/4.4.0 quarto/1.2.313 \ + gdal/3.10.2 proj/9.5.1 geos/3.13.1 +``` + +R Packages + +Spatial Data +- sf +- terra + +Analysis +- PEcan.all +- randomForest (for random forest modeling) +- factoextra + +Plotting +- knitr (for rendering tables) +- patchwork (for combining ggplot2 plots) +- viridis (for color scales in ggplot2) + +Utilities +- tidyverse (includes dplyr, ggplot2, readr, purrr, etc.) 
+- parallel +- remotes +- here +- lubridate +- furrr (for parallelized purrr functions) +- cluster +- pathviewr +- skimr + +```{r eval = FALSE} +# Define R repositories +options(repos = c( + # https://pecanproject.github.io/pecan-documentation/develop/r-universe.html + pecanproject = 'https://pecanproject.r-universe.dev', + # ropensci = 'https://ropensci.r-universe.dev', + ajlyons = 'https://ajlyons.r-universe.dev', # caladaptr + CRAN = 'https://cloud.r-project.org')) + +# Install CRAN packages + +install.packages(c( + # Utilities + "knitr", "tidyverse", "parallel", "here", "lubridate", "furrr", "cluster", "pathviewr", "caladaptr", "skimr", + + # Analysis + "PEcAn.all", "randomForest", "factoextra", + "sf", "terra", + + # Plotting + "patchwork", "viridis" + +)) + +# Install PEcAn modules from GitHub (with specific branch names) +# This is to use features that are still under development +remotes::install_github("PecanProject/pecan@ensemble_downscaling", + subdir = "modules/assim.sequential", upgrade = FALSE) +remotes::install_github("PecanProject/pecan@shp2gpkg", + subdir = "modules/data.land", upgrade = FALSE) +# Should probably move to using renv +# Initialize renv if not already done +# if (!file.exists("renv.lock")) { +# renv::init(bare = TRUE) +# } + +# Install PEcAn modules from specific branch on GitHub +# renv::install("PecanProject/pecan@ensemble_downscaling", + subdir = "modules/assim.sequential") +# renv::install("PecanProject/pecan@shp2gpkg", + subdir = "modules/data.land") + +# Snapshot the environment to lock dependencies +# renv::snapshot() +``` **Inputs:** From 94824500b6d7ec4809e407b6a2a1710dd9a2ab59 Mon Sep 17 00:00:00 2001 From: David LeBauer Date: Wed, 9 Apr 2025 17:58:41 -0700 Subject: [PATCH 37/49] Update downscale/00-prepare.R Co-authored-by: Chris Black --- downscale/00-prepare.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/downscale/00-prepare.R b/downscale/00-prepare.R index 7ecc322..d851fa9 100644 --- a/downscale/00-prepare.R +++ b/downscale/00-prepare.R @@ -217,7 +217,7 @@ extract_clim <- function(raster, points_sf) { clim_summaries <- .tmp |> dplyr::mutate( precip = PEcAn.utils::ud_convert(prec, "second-1", "year-1") -) |> + ) |> dplyr::group_by(site_id) |> dplyr::summarise( temp = mean(temp), From 6dceea59894ce54ad1c79e5dd01c308d9da0d0c3 Mon Sep 17 00:00:00 2001 From: David LeBauer Date: Fri, 11 Apr 2025 01:07:53 -0400 Subject: [PATCH 38/49] Address PR reviews - lots of changes in one commit --- .future.R | 10 ++ downscale/00-prepare.R | 66 ++++++--- .../01_cluster_and_select_design_points.R | 85 ++++++----- downscale/02_extract_sipnet_output.R | 56 ++++--- downscale/03_downscale_and_agregate.R | 139 ++++++++---------- .../04_downscaling_documentation_results.qmd | 87 +---------- 6 files changed, 204 insertions(+), 239 deletions(-) create mode 100644 .future.R diff --git a/.future.R b/.future.R new file mode 100644 index 0000000..de374d2 --- /dev/null +++ b/.future.R @@ -0,0 +1,10 @@ +# This file will load any time the future package is loaded. 
+# see ?future::plan for more details + +# Auto-detect cores and leave one free +no_cores <- max(future::availableCores( ) - 1, 1) +future::plan(future::multicore, workers = no_cores) + +PEcAn.logger::logger.info(paste("Using", no_cores, "cores for parallel processing")) + +PEcAn.logger::logger.warn(paste("Using", no_cores, "cores for parallel processing")) \ No newline at end of file diff --git a/downscale/00-prepare.R b/downscale/00-prepare.R index 65bfc3e..7999648 100644 --- a/downscale/00-prepare.R +++ b/downscale/00-prepare.R @@ -15,27 +15,25 @@ #' ## TODO #' #' - Use consistent projection(s): -#' - California Albers EPSG:33110 for joins +#' - California Albers EPSG:33110 for joins and spatial operations #' - WGS84 EPSG:4326 for plotting, subsetting rasters? #' - Clean up domain code #' - Create a bunch of tables and join all at once at the end #' - Disambiguate the use of 'ca' in object names; currently refers to both California and Cal-Adapt -library(PEcAn.all) library(tidyverse) library(sf) library(terra) library(caladaptr) -## Check available compute resources and set up parallel processing -no_cores <- parallel::detectCores(logical = FALSE) -future::plan(multicore, workers = no_cores - 2) - +# moved to 000-config.R? data_dir <- "/projectnb2/dietzelab/ccmmf/data" raw_data_dir <- "/projectnb2/dietzelab/ccmmf/data_raw" - ca_albers_crs <- 3310 +PEcAn.logger::logger.info("Starting Data Preparation Workflow") + +#source(here::here("downscale", "000-config.R"), local = TRUE) #' ### LandIQ Woody Polygons #' #' The first step is to convert LandIQ to a open and standard (TBD) format. @@ -49,12 +47,14 @@ ca_albers_crs <- 3310 #' ## Convert SHP to Geotiff` -# if these functions aren't available, see software dependency docs input_file = file.path(raw_data_dir, 'i15_Crop_Mapping_2016_SHP/i15_Crop_Mapping_2016.shp') ca_fields_gpkg <- file.path(data_dir, 'ca_fields.gpkg') ca_attributes_csv = file.path(data_dir, 'ca_field_attributes.csv') if(!file.exists(ca_fields_gpkg) & !file.exists(ca_attributes_csv)) { - landiq2std(input_file, ca_fields_gpkg, ca_attributes_csv) + landiq2std(input_file, ca_fields_gpkg, ca_attributes_csv) # if landiq2std isnt available, see software section of README + PEcAn.logger::logger.info(paste0("Created ca_fields.gpkg and ca_field_attributes.csv in ", data_dir)) +} else { + PEcAn.logger::logger.info("ca_fields.gpkg and ca_field_attributes.csv already exist in ", data_dir) } ca_fields <- sf::st_read(ca_fields_gpkg) |> @@ -69,16 +69,30 @@ ca_attributes <- readr::read_csv(ca_attributes_csv) #' At the same time we will calculate the total percent of California Croplands that are woody perennial crop. 
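# Illustrative sketch only: the committed script does not yet compute this
# percentage. The `area_ha` column name is an assumption about
# ca_field_attributes.csv; if no area column is available, field areas can
# instead be computed from ca_fields with sf::st_area().
pct_woody <- ca_attributes |>
  dplyr::summarise(
    pct_woody = 100 * sum(area_ha[pft == "woody perennial crop"], na.rm = TRUE) /
      sum(area_ha, na.rm = TRUE)
  ) |>
  dplyr::pull(pct_woody)
PEcAn.logger::logger.info(
  "Woody perennial crops cover approximately", round(pct_woody, 1),
  "% of mapped California cropland area"
)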
#' ## ----------------------------------------------------------------------------- -ca_fields |> - filter(pft == "woody perennial crop") |> - left_join(ca_attributes |> - select(site_id, pft), by = c("site_id")) |> - sf::st_transform(crs = ca_albers_crs) |> - dplyr::select(site_id, geom) |> - sf::st_write(file.path(data_dir, 'ca_woody.gpkg'), - delete_dsn = TRUE) -#' +ca_woody_gpkg <- file.path(data_dir, 'ca_woody.gpkg') +if(!file.exists(ca_woody_gpkg)) { + ca_fields |> + left_join( + ca_attributes |> + filter(pft == "woody perennial crop") |> + select(site_id, pft), + by = c("site_id") + ) |> + sf::st_transform(crs = ca_albers_crs) |> + dplyr::select(site_id, geom) |> + sf::st_write( + file.path(data_dir, 'ca_woody.gpkg'), + delete_dsn = TRUE + ) + PEcAn.logger::logger.info("Created ca_woody.gpkg with woody perennial crop fields in ", data_dir) +} else { + PEcAn.logger::logger.info("ca_woody.gpkg already exists in ", data_dir) +} + +#' ### Create California bounding box and polygon for clipping +#' + #' ### Convert Polygons to Points. #' #' For Phase 1, we will use points to query raster data. @@ -192,6 +206,7 @@ extract_clim <- function(raster, points_sf) { furrr::future_map_dfr( ~ extract_clim(.x, ca_fields_pts), .id = "year" + ) clim_summaries <- .tmp |> dplyr::mutate( @@ -281,6 +296,12 @@ anchor_sites_pts <- anchor_sites |> # spatial join find ca_fields that contain anchor site points # takes ~ 1 min on BU cluster w/ "Intel(R) Xeon(R) CPU E5-2670 0 @ 2.60GHz" +# Note: in the next step, pfts are removed from the anchorsites dataframe +# and kept pfts from site_covariates +# the ones in the sites_covariates were generated by the landiq2std function +# TODO we will need to make sure that pfts are consistent b/w landiq and anchorsites +# by identifying and investigating discrepancies + # First subset ca_fields to only include those with covariates # (approx. ) @@ -299,7 +320,8 @@ matched_anchor_sites <- anchor_sites_with_ids |> dplyr::filter(!is.na(site_id)) if (nrow(unmatched_anchor_sites) > 0) { - # Find nearest indices + # TODO Consider if it is more efficient and clear to match all anchor sites using + # st_nearest_feature rather than st_within nearest_indices <- sf::st_nearest_feature(unmatched_anchor_sites, ca_fields) # Get nearest ca_fields @@ -338,7 +360,7 @@ if (any(is.na(anchor_sites_with_ids |> select(site_id, lat, lon)))) { } # Check that all anchor sites have covariates -if (anchor_sites_with_ids |> +n_missing <- anchor_sites_with_ids |> left_join(site_covariates, by = "site_id") |> dplyr::select( site_id, lat, lon, @@ -347,7 +369,9 @@ if (anchor_sites_with_ids |> filter(if_any( everything(), ~ is.na(.x) - )) |> nrow() > 0) { + )) |> nrow() + +if (n_missing > 0) { PEcAn.logger::logger.warn( "Some anchor sites have missing environmental covariates!" ) diff --git a/downscale/01_cluster_and_select_design_points.R b/downscale/01_cluster_and_select_design_points.R index a17fee8..8b973a4 100644 --- a/downscale/01_cluster_and_select_design_points.R +++ b/downscale/01_cluster_and_select_design_points.R @@ -18,27 +18,25 @@ ## ----setup-------------------------------------------------------------------- # general utilities library(tidyverse) - # spatial library(sf) library(terra) - +library(caladaptr) # to plot climate regions # parallel computing -library(cluster) -library(factoextra) -library(pathviewr) #??? library(furrr) library(doParallel) library(dplyr) +# analysis +library(cluster) +library(factoextra) +library(pathviewr) #??? 
-library(caladaptr) # to plot climate regions +# plotting +library(GGally) -# Set up parallel processing with a safe number of cores -no_cores <- parallel::detectCores(logical = FALSE) -plan(multicore, workers = no_cores - 2) -options(future.globals.maxSize = benchmarkme::get_ram() * 0.9) +# settings +set.seed(42) ca_albers_crs <- 3310 # use California Albers project (EPSG:3310) for speed, - data_dir <- "/projectnb/dietzelab/ccmmf/data" #' #' ## Load Site Environmental Data Covariates @@ -58,7 +56,7 @@ data_dir <- "/projectnb/dietzelab/ccmmf/data" site_covariates_csv <- file.path(data_dir, "site_covariates.csv") site_covariates <- readr::read_csv(site_covariates_csv) -file.exists(site_covariates_csv) + #' ## Anchor Site Selection #' #' Load Anchor Sites from UC Davis, UC Riverside, and Ameriflux. @@ -86,8 +84,9 @@ ggsave(p, filename = "downscale/figures/anchor_sites.png", dpi = 300, bg = "whit anchorsites_for_clust <- anchor_sites_with_ids |> - select(-pft) |> # for consistency, only keep pfts from site_covariates - left_join(site_covariates, by = 'site_id') + select(-pft) |> + left_join(site_covariates, by = 'site_id') + #' #' ### Subset LandIQ fields for clustering @@ -99,7 +98,6 @@ anchorsites_for_clust <- #' - Bind anchor sites back to the dataset #' ## ----subset-for-clustering---------------------------------------------------- -set.seed(42) # Set seed for random number generator for reproducibility # 10k works # 2k sufficient for testing sample_size <- 10000 @@ -126,12 +124,21 @@ skimr::skim(data_for_clust) #' #' ### K-means Clustering #' -#' First, create a function `perform_clustering` to perform hierarchical k-means and find optimal clusters. -#' -#' K-means on the numeric columns (temp, precip, clay, possibly ignoring 'crop' -#' or treat 'crop' as categorical by some encoding if needed). -#' -## ----k-means-clustering-function---------------------------------------------- +#' K-means on the numeric columns of covariates +#' Full list printed in message; currently (temp, precip, srad, vapr, clay, ocd, twi) +#' could include 'crop' as categorical with one-hot encoding) i.e. +#' crop1 crop2 crop3 +#' 0 1 0 +#' 1 0 0 +#' by changing function below w/ +# perform_clustering <- function(data, k_range = 2:20) { + # Perform one-hot encoding for the 'crop' variable + # encoded_crop <- model.matrix(~ crop - 1, data = data) + # clust_data <- data |> + # select(where(is.numeric), -ends_with("id")) |> + # cbind(encoded_crop) + # extract metrics + perform_clustering <- function(data, k_range = 2:20) { # Select numeric variables for clustering @@ -161,7 +168,9 @@ perform_clustering <- function(data, k_range = 2:20) { }, .options = furrr_options(seed = TRUE) ) - # extract metrics + + + metrics_df <- data.frame( # see also https://github.com/PecanProject/pecan/blob/b5322a0fc62760b4981b2565aabafc07b848a699/modules/assim.sequential/inst/sda_backup/bmorrison/site_selection/pick_sda_sites.R#L221 k = k_range, @@ -213,7 +222,8 @@ perform_clustering <- function(data, k_range = 2:20) { #' Apply clustering function to the sampled dataset. 
#' ## ----clustering, eval=FALSE--------------------------------------------------- -# + +PEcAn.logger::logger.info("Starting clustering") sites_clustered <- perform_clustering(data_for_clust, k = 5:15) #' @@ -228,13 +238,13 @@ cluster_summary <- sites_clustered |> knitr::kable(cluster_summary, digits = 0) # Plot all pairwise numeric variables -library(GGally) ggpairs_plot <- sites_clustered |> select(-site_id) |> # need small # pfts for ggpairs sample_n(1000) |> ggpairs( - columns = c(1, 2, 4, 5, 6) + 1, + # plot all values except site_id and cluster + columns = setdiff(names(sites_clustered), c("site_id", "cluster")), mapping = aes(color = as.factor(cluster), alpha = 0.8) ) + theme_minimal() @@ -273,17 +283,21 @@ ggsave(cluster_plot, filename = "downscale/figures/cluster_plot.png", dpi = 300) # Check stratification of clusters by categorical factors # cols should be character, factor -crop_ids <- read_csv(file.path(data_dir, "crop_ids.csv"), - col_types = cols( - crop_id = col_factor(), - crop = col_character()) - ) -climregion_ids <- read_csv(file.path(data_dir, "climregion_ids.csv"), - col_types = cols( - climregion_id = col_factor(), - climregion_name = col_character() - )) +crop_ids <- read_csv( + file.path(data_dir, "crop_ids.csv"), + col_types = cols( + crop_id = col_factor(), + crop = col_character() + ) +) +climregion_ids <- read_csv( + file.path(data_dir, "climregion_ids.csv"), + col_types = cols( + climregion_id = col_factor(), + climregion_name = col_character() + ) +) ## ----stratification----------------------------------------------------------- # The goal here is to check the stratification of the clusters by crop and climregion @@ -351,7 +365,6 @@ ca_fields <- sf::st_read(file.path(data_dir, "ca_fields.gpkg")) ## ----design-point-selection--------------------------------------------------- # From the clustered data, remove anchor sites to avoid duplicates in design point selection. -set.seed(2222222) design_points_ids <- sites_clustered |> filter(!site_id %in% anchorsites_for_clust$site_id) |> select(site_id) |> diff --git a/downscale/02_extract_sipnet_output.R b/downscale/02_extract_sipnet_output.R index 0f8d968..e776c8e 100644 --- a/downscale/02_extract_sipnet_output.R +++ b/downscale/02_extract_sipnet_output.R @@ -1,3 +1,13 @@ +# This file processess the output from SIPNET ensemble runs and generates +# three different data formats that comply with Ecological Forecasting Initiative +# (EFI) standard: +# 1. A 4-D array (time, site, ensemble, variable) +# 2. A long format data frame (time, site, ensemble, variable) +# 3. 
A NetCDF file (time, site, ensemble, variable) +# This code can be moved to PEcAn.utils as one or more functions +# I did this so that I could determine which format is easiest to work with +# For now, I am planning to work with the CSV format +# TODO: write out EML metadata in order to be fully EFI compliant library(PEcAn.logger) library(lubridate) library(dplyr) @@ -5,18 +15,15 @@ library(ncdf4) library(furrr) library(stringr) -no_cores <- parallel::detectCores(logical = FALSE) -plan(multicore, workers = no_cores - 1) - # Define base directory for ensemble outputs -basedir <- "/projectnb/dietzelab/ccmmf/ccmmf_phase_1b_20250319064759_14859" +modeloutdir <- "/projectnb/dietzelab/ccmmf/ccmmf_phase_1b_20250319064759_14859" # Read settings file and extract run information -settings <- PEcAn.settings::read.settings(file.path(basedir, "settings.xml")) -outdir <- file.path(basedir, settings$modeloutdir) +settings <- PEcAn.settings::read.settings(file.path(modeloutdir, "settings.xml")) +outdir <- file.path(modeloutdir, settings$modeloutdir) ensemble_size <- settings$ensemble$size |> as.numeric() -start_date <- settings$run$settings.1$start.date # TODO make this unique for each site +start_date <- settings$run$settings.1$start.date start_year <- lubridate::year(start_date) end_date <- settings$run$settings.1$end.date end_year <- lubridate::year(end_date) @@ -64,7 +71,6 @@ variables <- c("AGB", "TotSoilCarb") #' | AGB | Total aboveground biomass | #' | time_bounds | history time interval endpoints | -# Preallocate 3-D array for 98 sites, 2 variables, and 20 ensemble members site_ids <- design_points |> pull(site_id) |> unique() @@ -87,7 +93,9 @@ if (!all(existing_dirs)) { PEcAn.logger::logger.warn("Missing expected ensemble directories: ", paste(missing_dirs, collapse = ", ")) } -# extract output via read.output +# extract output via PEcAn.utils::read.output +# temporarily suppress logging or else it will print a lot of file names +logger_level <- PEcAn.logger::logger.setLevel("OFF") ens_results <- furrr::future_pmap_dfr( ens_dirs, function(ens, site_id, dir) { @@ -106,12 +114,15 @@ ens_results <- furrr::future_pmap_dfr( .options = furrr::furrr_options(seed = TRUE) ) |> group_by(ensemble, site_id, year) |> - filter(year <= end_year) |> + #filter(year <= end_year) |> # not sure why this was necessary; should be taken care of by read.output filter(time == max(time)) |> # only take last value ungroup() |> arrange(ensemble, site_id, year) |> tidyr::pivot_longer(cols = all_of(variables), names_to = "variable", values_to = "prediction") +# restore logging +logger_level <- PEcAn.logger::logger.setLevel(logger_level) + ## Create Ensemble Output For Downscaling ## Below, three different output formats are created: ## 1. 4-D array (time, site, ensemble, variable) @@ -269,21 +280,20 @@ ncvar_put(nc_out, soc_ncvar, ens_arrays[["TotSoilCarb"]]) # Get Run metadata from log filename # ??? is there a more reliable way to do this? -logfile <- dir(basedir, pattern = "log$") -pattern <- "([0-9]{14})_([0-9]+)\\.log$" -forecast_time_string <- stringr::str_match(logfile, "[0-9]{12}") -if (is.na(forecast_time_string)) { - PEcAn.logger::logger.error("Unable to extract forecast time from log filename.") -} -forecast_iteration_id <- forecast_time_string # or ? 
-forecast_time <- lubridate::as_datetime(forecast_time_string, format = "%Y%m%d%H%M%S") +# TODO try parsing STATUS file +forecast_time <- readr::read_tsv( + file.path(basedir, 'output', "STATUS"), + col_names = FALSE +) |> + filter(X1 == "FINISHED") |> + pull(X3) +forecast_iteration_id <- as.numeric(forecast_time) # or is run_id available? obs_flag <- 0 - -ncatt_put(nc_out, 0, "model_name", "SIPNET") -ncatt_put(nc_out, 0, "model_version", "v1.3") +ncatt_put(nc_out, 0, "model_name", settings$model$type) +ncatt_put(nc_out, 0, "model_version", settings$model$revision) ncatt_put(nc_out, 0, "iteration_id", forecast_iteration_id) -ncatt_put(nc_out, 0, "forecast_time", format(forecast_time, "%Y-%m-%d %H:%M:%S")) -ncatt_put(nc_out, 0, "obs_flag", 0) +ncatt_put(nc_out, 0, "forecast_time", forecast_time) +ncatt_put(nc_out, 0, "obs_flag", obs_flag) ncatt_put(nc_out, 0, "creation_date", format(Sys.time(), "%Y-%m-%d")) # Close the netCDF file. nc_close(nc_out) diff --git a/downscale/03_downscale_and_agregate.R b/downscale/03_downscale_and_agregate.R index e603cf7..cf96f06 100644 --- a/downscale/03_downscale_and_agregate.R +++ b/downscale/03_downscale_and_agregate.R @@ -21,9 +21,7 @@ library(sf) library(terra) library(furrr) library(patchwork) # for combining plots - -no_cores <- parallel::detectCores(logical = FALSE) -plan(multicore, workers = no_cores - 1) +library(pdp) # for computing partial dependence plots library(PEcAnAssimSequential) datadir <- "/projectnb/dietzelab/ccmmf/data" @@ -31,6 +29,7 @@ basedir <- "/projectnb/dietzelab/ccmmf/ccmmf_phase_1b_20250319064759_14859" settings <- PEcAn.settings::read.settings(file.path(basedir, "settings.xml")) outdir <- file.path(basedir, settings$modeloutdir) options(readr.show_col_types = FALSE) +set.seed(123) #' ## Get Site Level Outputs @@ -140,7 +139,7 @@ ens_county_preds <- downscale_preds |> ) |> ungroup() |> mutate( - c_density = PEcAn.utils::ud_convert(total_c / total_ha, "Tg/ha", "kg/m2") + c_density = total_c / total_ha ) |> arrange(carbon_pool, county, ensemble) @@ -250,85 +249,77 @@ importance_summary <- map_dfr(cpools, function(cp) { summary_df }) -# Create importance plot -p_importance <- ggplot(importance_summary, aes(x = reorder(predictor, median_importance), y = median_importance)) + - geom_errorbar(aes(ymin = lcl_importance, ymax = ucl_importance), width = 0.2, color = "gray50") + - geom_point(size = 4, color = "steelblue") + - coord_flip() + - facet_wrap(~carbon_pool, scales = "free_y") + - labs( - title = "Variable Importance", - x = "Predictor", - y = "Median Increase MSE (SD)" - ) + - theme_minimal() - -# Save importance plot -ggsave(p_importance, filename = here::here("downscale/figures", "importance_summary.png"), - width = 10, height = 5, bg = "white" -) - # Now create and save combined importance + partial plots for each carbon pool for (cp in cpools) { - # Find top 2 predictors for this carbon pool - top_predictors <- importance_summary |> - filter(carbon_pool == cp) |> - arrange(desc(median_importance)) |> - slice_head(n = 2) |> - pull(predictor) - # Set up a 3-panel plot - png(filename = here::here("downscale/figures", paste0(cp, "_importance_partial_plots.png")), - width = 14, height = 6, units = "in", res = 300, bg = "white") + # Filter importance data for carbon pool cp + cp_importance <- importance_summary |> + dplyr::filter(carbon_pool == cp) + + # Select top 2 predictors + top_predictors <- cp_importance |> + dplyr::arrange(dplyr::desc(median_importance)) |> + dplyr::slice_head(n = 2) |> + dplyr::pull(predictor) - par(mfrow 
= c(1, 3)) + # Build variable importance plot for carbon pool cp + p_importance_cp <- ggplot2::ggplot(cp_importance, + ggplot2::aes(x = reorder(predictor, median_importance), + y = median_importance)) + + ggplot2::geom_errorbar(ggplot2::aes(ymin = lcl_importance, ymax = ucl_importance), + width = 0.2, color = "gray50") + + ggplot2::geom_point(size = 4, color = "steelblue") + + ggplot2::coord_flip() + + ggplot2::labs(title = paste("Variable Importance -", cp), + x = "Predictor", + y = "Median Increase MSE (SD)") + + ggplot2::theme_minimal() - # Panel 1: Show only this carbon pool's importance plot - # Extract just this carbon pool's data - cp_importance <- importance_summary |> filter(carbon_pool == cp) + model <- downscale_output_list[[cp]][["model"]][[1]] - # Create importance plot for just this carbon pool - par(mar = c(5, 10, 4, 2)) # Adjust margins for first panel - with(cp_importance, - dotchart(median_importance, - labels = reorder(predictor, median_importance), - xlab = "Median Increase MSE (SD)", - main = paste("Variable Importance -", cp), - pch = 19, col = "steelblue", cex = 1.2)) + # Bin top predictors in covariates to make partial dependence faster + # Covariates is ~400k rows, so binning will speed up the process + binned_covariates_df <- covariates |> + dplyr::mutate(dplyr::across(dplyr::all_of(top_predictors), ~ cut(.x, breaks = 100, labels = FALSE))) |> + as.data.frame() - # Add error bars - with(cp_importance, - segments(lcl_importance, - seq_along(predictor), - ucl_importance, - seq_along(predictor), - col = "gray50")) + # Compute partial dependence for the top predictors, + # explicitly supplying the predict function via an anonymous function. + pd1 <- pdp::partial(object = model, + pred.var = top_predictors[1], + train = binned_covariates_df, + grid.resolution = 10, + .f = function(object, newdata) stats::predict(object, newdata)) - # Panels 2 & 3: Create partial plots for top 2 predictors - model <- downscale_output_list[[cp]][["model"]][[1]] + pd2 <- as.data.frame( + pdp::partial(object = model, + pred.var = top_predictors[2], + train = binned_covariates_df, + grid.resolution = 10, + .f = function(object, newdata) stats::predict(object, newdata)) + ) - # First top predictor partial plot - par(mar = c(5, 5, 4, 2)) # Reset margins for other panels - randomForest::partialPlot(model, - pred.data = covariates, - x.var = top_predictors[1], - main = paste("Partial Dependence Plot for", top_predictors[1]), - xlab = top_predictors[1], - ylab = paste("Predicted", cp), - col = "steelblue", - lwd = 2) + # Build ggplot-based partial dependence plots for each top predictor + p_pd1 <- ggplot2::ggplot(pd1, ggplot2::aes(x = !!sym(top_predictors[1]), y = yhat)) + + ggplot2::geom_line(linewidth = 1.2, color = "steelblue") + + ggplot2::labs(title = paste("Partial Dependence for", top_predictors[1]), + x = top_predictors[1], + y = paste("Predicted", cp)) + + ggplot2::theme_minimal() - # Second top predictor partial plot - randomForest::partialPlot(model, - pred.data = covariates, - x.var = top_predictors[2], - main = paste("Partial Dependence Plot for", top_predictors[2]), - xlab = top_predictors[2], - ylab = paste("Predicted", cp), - col = "steelblue", - lwd = 2) + p_pd2 <- ggplot2::ggplot(pd2, ggplot2::aes(x = !!sym(top_predictors[2]), y = yhat)) + + ggplot2::geom_line(linewidth = 1.2, color = "steelblue") + + ggplot2::labs(title = paste("Partial Dependence for", top_predictors[2]), + x = top_predictors[2], + y = paste("Predicted", cp)) + + ggplot2::theme_minimal() + + # Combine the 
importance and partial dependence plots using patchwork + combined_plot <- p_importance_cp + p_pd1 + p_pd2 + + patchwork::plot_layout(ncol = 3) + + ggplot2::ggsave(filename = here::here("downscale/figures", paste0(cp, "_importance_partial_plots.png")), + plot = combined_plot, + width = 14, height = 6, bg = "white") - dev.off() } - -print(p_importance) diff --git a/downscale/04_downscaling_documentation_results.qmd b/downscale/04_downscaling_documentation_results.qmd index f2ffe34..74de3dd 100644 --- a/downscale/04_downscaling_documentation_results.qmd +++ b/downscale/04_downscaling_documentation_results.qmd @@ -54,90 +54,6 @@ It reads in the LandIQ crop map, anchor sites, and environmental covariates, and - Groups fields into Cal-Adapt climate regions. - Assigns anchor sites to fields. -**Software Requirements:** - -**Software:** - -Modules - -```{bash eval = FALSE} -# Load system modules -module load R/4.4.0 quarto/1.2.313 \ - gdal/3.10.2 proj/9.5.1 geos/3.13.1 -``` - -R Packages - -Spatial Data -- sf -- terra - -Analysis -- PEcan.all -- randomForest (for random forest modeling) -- factoextra - -Plotting -- knitr (for rendering tables) -- patchwork (for combining ggplot2 plots) -- viridis (for color scales in ggplot2) - -Utilities -- tidyverse (includes dplyr, ggplot2, readr, purrr, etc.) -- parallel -- remotes -- here -- lubridate -- furrr (for parallelized purrr functions) -- cluster -- pathviewr -- skimr - -```{r eval = FALSE} -# Define R repositories -options(repos = c( - # https://pecanproject.github.io/pecan-documentation/develop/r-universe.html - pecanproject = 'https://pecanproject.r-universe.dev', - # ropensci = 'https://ropensci.r-universe.dev', - ajlyons = 'https://ajlyons.r-universe.dev', # caladaptr - CRAN = 'https://cloud.r-project.org')) - -# Install CRAN packages - -install.packages(c( - # Utilities - "knitr", "tidyverse", "parallel", "here", "lubridate", "furrr", "cluster", "pathviewr", "caladaptr", "skimr", - - # Analysis - "PEcAn.all", "randomForest", "factoextra", - "sf", "terra", - - # Plotting - "patchwork", "viridis" - -)) - -# Install PEcAn modules from GitHub (with specific branch names) -# This is to use features that are still under development -remotes::install_github("PecanProject/pecan@ensemble_downscaling", - subdir = "modules/assim.sequential", upgrade = FALSE) -remotes::install_github("PecanProject/pecan@shp2gpkg", - subdir = "modules/data.land", upgrade = FALSE) -# Should probably move to using renv -# Initialize renv if not already done -# if (!file.exists("renv.lock")) { -# renv::init(bare = TRUE) -# } - -# Install PEcAn modules from specific branch on GitHub -# renv::install("PecanProject/pecan@ensemble_downscaling", - subdir = "modules/assim.sequential") -# renv::install("PecanProject/pecan@shp2gpkg", - subdir = "modules/data.land") - -# Snapshot the environment to lock dependencies -# renv::snapshot() -``` **Inputs:** @@ -322,7 +238,7 @@ The following plots show the variable importance from the random forest models u Variable importance quantifies how useful each covariate is in predicting the carbon stock. Partial dependence plots show the marginal effect of individual predictors on model response after averaging over the other predictors. 
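For reference, a single partial dependence curve can be produced directly from one ensemble member's fitted random forest. The sketch below assumes the objects created in `03_downscale_and_agregate.R` (`downscale_output_list` and the `covariates` data frame) and uses `precip` purely as an example predictor; a subsample of fields keeps the computation quick.

```{r eval = FALSE}
# Sketch: partial dependence of predicted TotSoilCarb on precipitation
# for the first ensemble member's random forest model.
model <- downscale_output_list[["TotSoilCarb"]][["model"]][[1]]
cov_sample <- dplyr::slice_sample(as.data.frame(covariates), n = 2000)

randomForest::partialPlot(
  model,
  pred.data = cov_sample,
  x.var = "precip",
  main = "Partial dependence of TotSoilCarb on precip",
  xlab = "precip",
  ylab = "Predicted TotSoilCarb (kg C m-2)"
)
```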
-#### TotSoilCarb - Importance and Partial Dependence +#### Variable Importance and Partial Dependence ![](figures/TotSoilCarb_importance_partial_plots.png) @@ -330,6 +246,7 @@ Variable importance quantifies how useful each covariate is in predicting the ca ![](figures/AGB_importance_partial_plots.png) + ### Searchable Table The table below provides a searchable summary of the county-level carbon stocks and densities. From 0958654037d5fb04f41c62e89611f8e78efe2451 Mon Sep 17 00:00:00 2001 From: David LeBauer Date: Fri, 11 Apr 2025 01:25:41 -0400 Subject: [PATCH 39/49] round lat,lon in anchor_sites_ids.csv --- downscale/00-prepare.R | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/downscale/00-prepare.R b/downscale/00-prepare.R index 7999648..405c6fa 100644 --- a/downscale/00-prepare.R +++ b/downscale/00-prepare.R @@ -20,7 +20,9 @@ #' - Clean up domain code #' - Create a bunch of tables and join all at once at the end #' - Disambiguate the use of 'ca' in object names; currently refers to both California and Cal-Adapt - +#' - decide if we need both Ameriflux shortnames and full names in anchor_sites.csv +#' (prob. yes, b/c both are helpful); if so, come up w/ better name than 'location' +#' - make sure anchor_sites_ids.csv fields are defined in README library(tidyverse) library(sf) library(terra) @@ -382,5 +384,10 @@ if (n_missing > 0) { anchor_sites_with_ids |> sf::st_drop_geometry() |> dplyr::select(site_id, lat, lon, external_site_id, site_name, crops, pft) |> + # save lat, lon with 5 decimal places + dplyr::mutate( + lat = round(lat, 5), + lon = round(lon, 5) + ) |> readr::write_csv(file.path(data_dir, "anchor_sites_ids.csv")) From b71741ee07ea3a6a6f7b96394608f4c85f48b5e8 Mon Sep 17 00:00:00 2001 From: David LeBauer Date: Fri, 11 Apr 2025 13:06:22 -0700 Subject: [PATCH 40/49] Update downscale/00-prepare.R Co-authored-by: Chris Black --- downscale/00-prepare.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/downscale/00-prepare.R b/downscale/00-prepare.R index 405c6fa..73e5c2b 100644 --- a/downscale/00-prepare.R +++ b/downscale/00-prepare.R @@ -361,7 +361,7 @@ if (any(is.na(anchor_sites_with_ids |> select(site_id, lat, lon)))) { ) } -# Check that all anchor sites have covariates +# Check for anchor sites with any covariate missing n_missing <- anchor_sites_with_ids |> left_join(site_covariates, by = "site_id") |> dplyr::select( From 3c054d6ceab54a1a61bd0e5505bd055da7014402 Mon Sep 17 00:00:00 2001 From: David LeBauer Date: Fri, 11 Apr 2025 13:07:55 -0700 Subject: [PATCH 41/49] Update downscale/00-prepare.R --- downscale/00-prepare.R | 1 - 1 file changed, 1 deletion(-) diff --git a/downscale/00-prepare.R b/downscale/00-prepare.R index 73e5c2b..bf17baa 100644 --- a/downscale/00-prepare.R +++ b/downscale/00-prepare.R @@ -305,7 +305,6 @@ anchor_sites_pts <- anchor_sites |> # by identifying and investigating discrepancies # First subset ca_fields to only include those with covariates -# (approx. 
) ca_fields_with_covariates <- ca_fields |> dplyr::right_join(site_covariates |> select(site_id), by = "site_id") From 84c135c0bd455904753de2ccacbafc58a44d4597 Mon Sep 17 00:00:00 2001 From: David LeBauer Date: Fri, 11 Apr 2025 13:13:32 -0700 Subject: [PATCH 42/49] Update downscale/02_extract_sipnet_output.R Co-authored-by: Chris Black --- downscale/02_extract_sipnet_output.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/downscale/02_extract_sipnet_output.R b/downscale/02_extract_sipnet_output.R index e776c8e..50ed2d1 100644 --- a/downscale/02_extract_sipnet_output.R +++ b/downscale/02_extract_sipnet_output.R @@ -108,7 +108,7 @@ ens_results <- furrr::future_pmap_dfr( dataframe = TRUE, verbose = FALSE ) |> - dplyr::mutate(site_id = site_id, ensemble = as.numeric(ens)) |> + dplyr::mutate(site_id = .env$site_id, ensemble = as.numeric(.env$ens)) |> dplyr::rename(time = posix) }, .options = furrr::furrr_options(seed = TRUE) From 48d79d4c28ed91fa9460ecfd29c36742d92a5c93 Mon Sep 17 00:00:00 2001 From: David LeBauer Date: Fri, 11 Apr 2025 13:17:15 -0700 Subject: [PATCH 43/49] Update downscale/02_extract_sipnet_output.R --- downscale/02_extract_sipnet_output.R | 1 - 1 file changed, 1 deletion(-) diff --git a/downscale/02_extract_sipnet_output.R b/downscale/02_extract_sipnet_output.R index 50ed2d1..f8c69e5 100644 --- a/downscale/02_extract_sipnet_output.R +++ b/downscale/02_extract_sipnet_output.R @@ -280,7 +280,6 @@ ncvar_put(nc_out, soc_ncvar, ens_arrays[["TotSoilCarb"]]) # Get Run metadata from log filename # ??? is there a more reliable way to do this? -# TODO try parsing STATUS file forecast_time <- readr::read_tsv( file.path(basedir, 'output', "STATUS"), col_names = FALSE From 537754538b75a22842983631b9615b3a1adc0862 Mon Sep 17 00:00:00 2001 From: David LeBauer Date: Fri, 11 Apr 2025 19:00:21 -0400 Subject: [PATCH 44/49] Refactor output file names for consistency and clarity in downscaling scripts --- downscale/00-prepare.R | 7 ------- downscale/02_extract_sipnet_output.R | 9 +++++---- downscale/04_downscaling_documentation_results.qmd | 4 ++++ 3 files changed, 9 insertions(+), 11 deletions(-) diff --git a/downscale/00-prepare.R b/downscale/00-prepare.R index 405c6fa..2aac9cd 100644 --- a/downscale/00-prepare.R +++ b/downscale/00-prepare.R @@ -277,13 +277,6 @@ PEcAn.logger::logger.info( readr::write_csv(site_covariates, file.path(data_dir, "site_covariates.csv")) -# Final output for targets; if not in targets, suppress return -if (exists("IN_TARGETS") && IN_TARGETS) { - site_covariates -} else { - invisible(site_covariates) -} - #' #' ## Anchor Sites #' diff --git a/downscale/02_extract_sipnet_output.R b/downscale/02_extract_sipnet_output.R index e776c8e..0e335c5 100644 --- a/downscale/02_extract_sipnet_output.R +++ b/downscale/02_extract_sipnet_output.R @@ -166,14 +166,14 @@ for (var in variables) { ens_arrays[[var]] <- arr } -saveRDS(ens_arrays, file = file.path(outdir, "efi_ens_arrays.rds")) +saveRDS(ens_arrays, file = file.path(outdir, "ensemble_output.rds")) # --- 2. Create EFI Standard v1.0 long format data frame --- efi_long <- ens_results |> rename(datetime = time) |> select(datetime, site_id, ensemble, variable, prediction) -readr::write_csv(efi_long, file.path(outdir, "efi_ens_long.csv")) +readr::write_csv(efi_long, file.path(outdir, "ensemble_output.csv")) ####--- 3. 
Create EFI Standard v1.0 NetCDF files library(ncdf4) @@ -256,7 +256,7 @@ nc_vars <- list( TotSoilCarb = soc_ncvar ) -nc_file <- file.path(outdir, "efi_forecast.nc") +nc_file <- file.path(outdir, "ensemble_output.nc") if (file.exists(nc_file)) { file.remove(nc_file) @@ -289,6 +289,7 @@ forecast_time <- readr::read_tsv( pull(X3) forecast_iteration_id <- as.numeric(forecast_time) # or is run_id available? obs_flag <- 0 + ncatt_put(nc_out, 0, "model_name", settings$model$type) ncatt_put(nc_out, 0, "model_version", settings$model$revision) ncatt_put(nc_out, 0, "iteration_id", forecast_iteration_id) @@ -298,4 +299,4 @@ ncatt_put(nc_out, 0, "creation_date", format(Sys.time(), "%Y-%m-%d")) # Close the netCDF file. nc_close(nc_out) -PEcAn.logger::logger.info("EFI-compliant netCDF file 'efi_forecast.nc' created.") +PEcAn.logger::logger.info("EFI-compliant netCDF file 'ensemble_output.nc' created.") diff --git a/downscale/04_downscaling_documentation_results.qmd b/downscale/04_downscaling_documentation_results.qmd index 74de3dd..a69fe99 100644 --- a/downscale/04_downscaling_documentation_results.qmd +++ b/downscale/04_downscaling_documentation_results.qmd @@ -275,6 +275,10 @@ county_summaries <- county_summaries |> dplyr::select(`Carbon Pool`, `County`, `Mean Total C (Tg/ county)`, `Mean C Density (kg/ha)`) # Create Table +# TODO +# - Fix point w/ missing county +# - Add n to table + htmlwidgets::setWidgetIdSeed(123) # required to embed table self-contained in html options(htmlwidgets.TEMP_DIR = "htmlwidgets") From 42a4f3e0ebadf8ba8b18a09f8fd999291c5f0fe6 Mon Sep 17 00:00:00 2001 From: David LeBauer Date: Tue, 15 Apr 2025 12:41:43 -0700 Subject: [PATCH 45/49] Update downscale/02_extract_sipnet_output.R Co-authored-by: Chris Black --- downscale/02_extract_sipnet_output.R | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/downscale/02_extract_sipnet_output.R b/downscale/02_extract_sipnet_output.R index 4fb7509..6bf137a 100644 --- a/downscale/02_extract_sipnet_output.R +++ b/downscale/02_extract_sipnet_output.R @@ -111,6 +111,11 @@ ens_results <- furrr::future_pmap_dfr( dplyr::mutate(site_id = .env$site_id, ensemble = as.numeric(.env$ens)) |> dplyr::rename(time = posix) }, + # Avoids warning "future unexpectedly generated random numbers", + # which apparently originates from actions taken inside the `units` package + # when its namespace is loaded by ud_convert inside read.output. + # The warning is likely spurious, but looks scary and setting seed to + # silence it does not hurt anything. .options = furrr::furrr_options(seed = TRUE) ) |> group_by(ensemble, site_id, year) |> From 96c604c3a885644df8757a3b195cc751a7954e99 Mon Sep 17 00:00:00 2001 From: David LeBauer Date: Tue, 15 Apr 2025 13:37:01 -0700 Subject: [PATCH 46/49] Update downscale/02_extract_sipnet_output.R --- downscale/02_extract_sipnet_output.R | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/downscale/02_extract_sipnet_output.R b/downscale/02_extract_sipnet_output.R index 6bf137a..a040962 100644 --- a/downscale/02_extract_sipnet_output.R +++ b/downscale/02_extract_sipnet_output.R @@ -115,7 +115,8 @@ ens_results <- furrr::future_pmap_dfr( # which apparently originates from actions taken inside the `units` package # when its namespace is loaded by ud_convert inside read.output. # The warning is likely spurious, but looks scary and setting seed to - # silence it does not hurt anything. + # silence it does not hurt anything. 
+ # Associated bug report: https://github.com/r-quantities/units/issues/409 .options = furrr::furrr_options(seed = TRUE) ) |> group_by(ensemble, site_id, year) |> From 77fe472181cda4610a583a86d4ccbf8948325115 Mon Sep 17 00:00:00 2001 From: David LeBauer Date: Tue, 15 Apr 2025 19:37:32 -0400 Subject: [PATCH 47/49] fixing plots, including fixing units and removing partial dependency plots until I finish debugging --- downscale/00-prepare.R | 3 +- downscale/02_extract_sipnet_output.R | 20 +- downscale/03_downscale_and_agregate.R | 356 +++++++++++++----- .../04_downscaling_documentation_results.qmd | 48 +-- 4 files changed, 309 insertions(+), 118 deletions(-) diff --git a/downscale/00-prepare.R b/downscale/00-prepare.R index 1dc344e..5afb7f4 100644 --- a/downscale/00-prepare.R +++ b/downscale/00-prepare.R @@ -282,6 +282,7 @@ readr::write_csv(site_covariates, file.path(data_dir, "site_covariates.csv")) #' ## ----anchor-sites------------------------------------------------------------- # Anchor sites from UC Davis, UC Riverside, and Ameriflux. +# TODO rename raw_data/anchor_sites --> anchor_locations.csv anchor_sites <- readr::read_csv("data_raw/anchor_sites.csv") anchor_sites_pts <- anchor_sites |> sf::st_as_sf(coords = c("lon", "lat"), crs = 4326) |> @@ -381,5 +382,5 @@ anchor_sites_with_ids |> lat = round(lat, 5), lon = round(lon, 5) ) |> - readr::write_csv(file.path(data_dir, "anchor_sites_ids.csv")) + readr::write_csv(file.path(data_dir, "anchor_sites.csv")) diff --git a/downscale/02_extract_sipnet_output.R b/downscale/02_extract_sipnet_output.R index 4fb7509..06c41fa 100644 --- a/downscale/02_extract_sipnet_output.R +++ b/downscale/02_extract_sipnet_output.R @@ -33,7 +33,7 @@ end_year <- lubridate::year(end_date) # data/design_points.csv design_pt_csv <- "https://raw.githubusercontent.com/ccmmf/workflows/46a61d58a7b0e43ba4f851b7ba0d427d112be362/data/design_points.csv" design_points <- readr::read_csv(design_pt_csv, show_col_types = FALSE) |> - rename(site_id = id) |> # fixed in more recent version of 01 script + rename(site_id = id) |> # TODO remove; has already been renamed in source file dplyr::distinct() # Variables to extract @@ -43,6 +43,9 @@ variables <- c("AGB", "TotSoilCarb") #' This list is from the YYYY.nc.var files, and may change, #' e.g. if we write out less information in order to save time and storage space #' See SIPNET parameters.md for more details +#' PEcAn standard units follow Climate Forecasting standards: +#' Carbon and water pools are in units of kg / m2 +#' Fluxes are in units of kg / m2 / s-1 #' #' | Variable | Description | #' |-------------------------------|------------------------------------------| @@ -120,6 +123,8 @@ ens_results <- furrr::future_pmap_dfr( arrange(ensemble, site_id, year) |> tidyr::pivot_longer(cols = all_of(variables), names_to = "variable", values_to = "prediction") +# After extraction, ens_results$prediction is in kg C m-2 for both AGB and TotSoilCarb + # restore logging logger_level <- PEcAn.logger::logger.setLevel(logger_level) @@ -166,14 +171,18 @@ for (var in variables) { ens_arrays[[var]] <- arr } -saveRDS(ens_arrays, file = file.path(outdir, "ensemble_output.rds")) +# ens_arrays: each array is [time, site, ensemble], values in kg C m-2 + +saveRDS(ens_arrays, file = file.path(outdir, "ensemble_output.rds")) # units: kg C m-2 # --- 2. 
Create EFI Standard v1.0 long format data frame --- efi_long <- ens_results |> rename(datetime = time) |> select(datetime, site_id, ensemble, variable, prediction) +# NOTE: prediction is in kg C m-2 readr::write_csv(efi_long, file.path(outdir, "ensemble_output.csv")) +# Consider: write a README or add a comment about units in the CSV ####--- 3. Create EFI Standard v1.0 NetCDF files library(ncdf4) @@ -218,13 +227,13 @@ dims <- list(time_dim, site_dim, ensemble_dim) # Define forecast variables: agb_ncvar <- ncvar_def( name = "AGB", - units = "kg C m-2", + units = "kg C m-2", # correct units dim = dims, longname = "Total aboveground biomass" ) soc_ncvar <- ncvar_def( name = "TotSoilCarb", - units = "kg C m-2", + units = "kg C m-2", # correct units dim = dims, longname = "Total Soil Carbon" ) @@ -281,7 +290,7 @@ ncvar_put(nc_out, soc_ncvar, ens_arrays[["TotSoilCarb"]]) # Get Run metadata from log filename # ??? is there a more reliable way to do this? forecast_time <- readr::read_tsv( - file.path(basedir, 'output', "STATUS"), + file.path(modeloutdir, 'output', "STATUS"), col_names = FALSE ) |> filter(X1 == "FINISHED") |> @@ -299,3 +308,4 @@ ncatt_put(nc_out, 0, "creation_date", format(Sys.time(), "%Y-%m-%d")) nc_close(nc_out) PEcAn.logger::logger.info("EFI-compliant netCDF file 'ensemble_output.nc' created.") +PEcAn.logger::logger.info("All ensemble outputs (RDS, CSV, NetCDF) are in kg C m-2 for AGB and TotSoilCarb.") diff --git a/downscale/03_downscale_and_agregate.R b/downscale/03_downscale_and_agregate.R index cf96f06..a7abc9f 100644 --- a/downscale/03_downscale_and_agregate.R +++ b/downscale/03_downscale_and_agregate.R @@ -21,7 +21,6 @@ library(sf) library(terra) library(furrr) library(patchwork) # for combining plots -library(pdp) # for computing partial dependence plots library(PEcAnAssimSequential) datadir <- "/projectnb/dietzelab/ccmmf/data" @@ -29,7 +28,6 @@ basedir <- "/projectnb/dietzelab/ccmmf/ccmmf_phase_1b_20250319064759_14859" settings <- PEcAn.settings::read.settings(file.path(basedir, "settings.xml")) outdir <- file.path(basedir, settings$modeloutdir) options(readr.show_col_types = FALSE) -set.seed(123) #' ## Get Site Level Outputs @@ -79,9 +77,11 @@ downscale_output_list <- purrr::map( # not using furrr b/c it is used inside dow ## Save to make it easier to restart # saveRDS(downscale_output, file = here::here("cache/downscale_output.rds")) +PEcAn.logger::logger.info("Downscaling complete") +PEcAn.logger::logger.info("Downscaling model results for each ensemble member:") metrics <- lapply(downscale_output_list, downscale_metrics) -print(metrics) +knitr::kable(metrics) median_metrics <- purrr::map(metrics, function(m) { m |> @@ -95,6 +95,7 @@ median_metrics <- purrr::map(metrics, function(m) { ) }) +PEcAn.logger::logger.info("Median downscaling model metrics:") bind_rows(median_metrics, .id = "carbon_pool") |> knitr::kable() @@ -107,6 +108,7 @@ bind_rows(median_metrics, .id = "carbon_pool") |> # ca_fields <- readr::read_csv(here::here("data/ca_field_attributes.csv")) |> # dplyr::select(id, lat, lon) |> # rename(site = id) +PEcAn.logger::logger.info("Aggregating to County Level") ca_fields_full <- sf::read_sf(file.path(datadir, "ca_fields.gpkg")) @@ -125,42 +127,52 @@ get_downscale_preds <- function(downscale_output_list) { downscale_preds <- purrr::map(downscale_output_list, get_downscale_preds) |> dplyr::bind_rows(.id = "carbon_pool") |> - # Convert kg / ha to tonne (Mg) / field level totals - # first convert scale - mutate(c_density = PEcAn.utils::ud_convert(prediction, 
"kg/m2", "Tg/ha")) |> - mutate(total_c = c_density * area_ha) + # Convert kg/m2 to Mg/ha using PEcAn.utils::ud_convert + mutate(c_density_Mg_ha = PEcAn.utils::ud_convert(prediction, "kg/m2", "Mg/ha")) |> + # Calculate total Mg per field: c_density_Mg_ha * area_ha + mutate(total_c_Mg = c_density_Mg_ha * area_ha) ens_county_preds <- downscale_preds |> # Now aggregate to get county level totals for each pool x ensemble group_by(carbon_pool, county, ensemble) |> summarize( - total_c = sum(total_c), + n = n(), + total_c_Mg = sum(total_c_Mg), # total Mg C per county total_ha = sum(area_ha) ) |> ungroup() |> mutate( - c_density = total_c / total_ha + total_c_Tg = PEcAn.utils::ud_convert(total_c_Mg, "Mg", "Tg"), + mean_c_density_Mg_ha = total_c_Mg / total_ha ) |> arrange(carbon_pool, county, ensemble) +# Check number of ensemble members per county/carbon_pool +ens_county_preds |> + group_by(carbon_pool, county) |> + summarize(n_vals = n_distinct(total_c_Mg)) |> + pull(n_vals) |> + unique() + + county_summaries <- ens_county_preds |> group_by(carbon_pool, county) |> summarize( - n = n(), - mean_total_c = mean(total_c), - #median_total_c = median(total_c), - sd_total_c = sd(total_c), - mean_c_density = mean(c_density), - sd_c_density = sd(c_density) + n = max(n), # Number of fields in county should be same for each ensemble member + co_mean_c_total_Tg = mean(total_c_Tg), + co_sd_c_total_Tg = sd(total_c_Tg), + co_mean_c_density_Mg_ha = mean(mean_c_density_Mg_ha), + co_sd_c_density_Mg_ha = sd(mean_c_density_Mg_ha) ) readr::write_csv( county_summaries, file.path(outdir, "county_summaries.csv") ) - -# Lets plot the results! +PEcAn.logger::logger.info("County summaries written to", file.path(outdir, "county_summaries.csv")) +## Plot the results! +PEcAn.logger::logger.info("Plotting County Level Summaries") county_boundaries <- st_read(here::here("data/counties.gpkg")) |> filter(state_name == "California") |> select(name) @@ -169,13 +181,13 @@ co_preds_to_plot <- county_summaries |> right_join(county_boundaries, by = c("county" = "name")) |> arrange(county, carbon_pool) |> pivot_longer( - cols = c(mean_total_c, sd_total_c, mean_c_density, sd_c_density), + cols = c(mean_total_c_Tg, sd_total_c_Tg, mean_c_density_Mg_ha, sd_c_density_Mg_ha), names_to = "stat", values_to = "value" ) |> mutate(units = case_when( str_detect(stat, "total_c") ~ "Carbon Stock (Tg)", - str_detect(stat, "c_density") ~ "Carbon Density (kg/m2)" + str_detect(stat, "c_density") ~ "Carbon Density (Mg/ha)" ), stat = case_when( str_detect(stat, "mean") ~ "Mean", str_detect(stat, "sd") ~ "SD" @@ -206,7 +218,7 @@ p <- purrr::map2(pool_x_units$carbon_pool, pool_x_units$units, function(pool, un ) unit <- ifelse(unit == "Carbon Stock (Tg)", "stock", - ifelse(unit == "Carbon Density (kg/m2)", "density", NA) + ifelse(unit == "Carbon Density (Mg/ha)", "density", NA) ) plotfile <- here::here("downscale/figures", paste0("county_", pool, "_carbon_", unit, ".png")) @@ -249,77 +261,241 @@ importance_summary <- map_dfr(cpools, function(cp) { summary_df }) -# Now create and save combined importance + partial plots for each carbon pool +# TODO consider separating out plotting +####---Create checkpoint---#### +# system.time(save(downscale_output_list, importance_summary, covariates, cpools# these are ~500MB +# file = file.path(outdir, "checkpoint.RData"), +# compress = FALSE +# )) +# outdir <- "/projectnb/dietzelab/ccmmf/ccmmf_phase_1b_20250319064759_14859/output/out" +# load(file.path(outdir, "checkpoint.RData")) +#### ---End checkpoint---#### + 
+covariate_names <- names(covariates |> select(where(is.numeric))) +covariates_df <- as.data.frame(covariates) |> + # TODO pass scaled covariates from ensemble_downscale function + # this will ensure that the scaling is the same. + mutate(covariates, across(all_of(covariate_names), scale)) + +##### Subset data for plotting (speed + visible rug plots) ####### +subset_inputs <- TRUE +# Subset data for testing / speed purposes +if (subset_inputs) { + # cpools <- c("AGB") + + # Subset covariates for testing (take only a small percentage of rows) + n_test_samples <- min(5000, nrow(covariates_df)) + set.seed(123) # For reproducibility + test_indices <- sample(1:nrow(covariates_df), n_test_samples) + covariates_full <- covariates_df + covariates_df <- covariates_df[test_indices, ] + + # For each model, subset the predictions to match the test indices + for (cp in cpools) { + if (length(downscale_output_list[[cp]]$predictions) > 0) { + downscale_output_list[[cp]]$predictions <- + lapply(downscale_output_list[[cp]]$predictions, function(x) x[test_indices]) + } + } +} + +##### End Subsetting Code####### + +##### Importance Plots ##### + + for (cp in cpools) { - - # Filter importance data for carbon pool cp + png( + filename = here::here("downscale/figures", paste0(cp, "_importance_partial_plots.png")), + width = 14, height = 6, units = "in", res = 300, bg = "white" + ) + + # Variable importance plot cp_importance <- importance_summary |> - dplyr::filter(carbon_pool == cp) - - # Select top 2 predictors - top_predictors <- cp_importance |> - dplyr::arrange(dplyr::desc(median_importance)) |> - dplyr::slice_head(n = 2) |> - dplyr::pull(predictor) - - # Build variable importance plot for carbon pool cp - p_importance_cp <- ggplot2::ggplot(cp_importance, - ggplot2::aes(x = reorder(predictor, median_importance), - y = median_importance)) + - ggplot2::geom_errorbar(ggplot2::aes(ymin = lcl_importance, ymax = ucl_importance), - width = 0.2, color = "gray50") + - ggplot2::geom_point(size = 4, color = "steelblue") + - ggplot2::coord_flip() + - ggplot2::labs(title = paste("Variable Importance -", cp), - x = "Predictor", - y = "Median Increase MSE (SD)") + - ggplot2::theme_minimal() - - model <- downscale_output_list[[cp]][["model"]][[1]] - - # Bin top predictors in covariates to make partial dependence faster - # Covariates is ~400k rows, so binning will speed up the process - binned_covariates_df <- covariates |> - dplyr::mutate(dplyr::across(dplyr::all_of(top_predictors), ~ cut(.x, breaks = 100, labels = FALSE))) |> - as.data.frame() - - # Compute partial dependence for the top predictors, - # explicitly supplying the predict function via an anonymous function. 
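Since the in-tree partial-dependence code is removed or commented out below while it is debugged, a known-working minimal pattern for pdp::partial() on a small randomForest fit may be a useful reference; mtcars and "wt" stand in for the scaled covariates and a top predictor:

library(randomForest)
library(pdp)
library(ggplot2)

set.seed(1)
rf_toy <- randomForest(mpg ~ ., data = mtcars, importance = TRUE)

pd_wt <- pdp::partial(
  object = rf_toy,
  pred.var = "wt",
  train = mtcars,
  grid.resolution = 10
) # returns a data frame with columns `wt` and `yhat`

ggplot(pd_wt, aes(x = wt, y = yhat)) +
  geom_line(color = "steelblue", linewidth = 1.2) +
  geom_rug(data = mtcars, aes(x = wt), inherit.aes = FALSE, sides = "b", alpha = 0.5) +
  labs(x = "wt", y = "Predicted mpg") +
  theme_minimal()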
- pd1 <- pdp::partial(object = model, - pred.var = top_predictors[1], - train = binned_covariates_df, - grid.resolution = 10, - .f = function(object, newdata) stats::predict(object, newdata)) - - pd2 <- as.data.frame( - pdp::partial(object = model, - pred.var = top_predictors[2], - train = binned_covariates_df, - grid.resolution = 10, - .f = function(object, newdata) stats::predict(object, newdata)) + filter(carbon_pool == cp) + with( + cp_importance, + dotchart(median_importance, + labels = reorder(predictor, median_importance), + xlab = "Median Increase MSE (SD)", + main = paste("Importance -", cp), + pch = 19, col = "steelblue", cex = 1.2 + ) ) - - # Build ggplot-based partial dependence plots for each top predictor - p_pd1 <- ggplot2::ggplot(pd1, ggplot2::aes(x = !!sym(top_predictors[1]), y = yhat)) + - ggplot2::geom_line(linewidth = 1.2, color = "steelblue") + - ggplot2::labs(title = paste("Partial Dependence for", top_predictors[1]), - x = top_predictors[1], - y = paste("Predicted", cp)) + - ggplot2::theme_minimal() - - p_pd2 <- ggplot2::ggplot(pd2, ggplot2::aes(x = !!sym(top_predictors[2]), y = yhat)) + - ggplot2::geom_line(linewidth = 1.2, color = "steelblue") + - ggplot2::labs(title = paste("Partial Dependence for", top_predictors[2]), - x = top_predictors[2], - y = paste("Predicted", cp)) + - ggplot2::theme_minimal() - - # Combine the importance and partial dependence plots using patchwork - combined_plot <- p_importance_cp + p_pd1 + p_pd2 + - patchwork::plot_layout(ncol = 3) - - ggplot2::ggsave(filename = here::here("downscale/figures", paste0(cp, "_importance_partial_plots.png")), - plot = combined_plot, - width = 14, height = 6, bg = "white") - + with( + cp_importance, + segments(lcl_importance, + seq_along(predictor), + ucl_importance, + seq_along(predictor), + col = "gray50" + ) + ) + dev.off() } + +##### Importance and Partial Plots ##### + +## Using pdp + ggplot2 +# # Loop over carbon pools +# for (cp in cpools) { +# # Top 2 predictors for this carbon pool +# top_predictors <- importance_summary |> +# filter(carbon_pool == cp) |> +# arrange(desc(median_importance)) |> +# slice_head(n = 2) |> +# pull(predictor) + +# # Retrieve model and covariate data +# model <- downscale_output_list[[cp]][["model"]][[1]] +# cov_df <- covariates_df # Already scaled + +# ## 1. Create Variable Importance Plot with ggplot2 +# cp_importance <- importance_summary |> +# filter(carbon_pool == cp) + +# p_importance <- ggplot(cp_importance, aes(x = median_importance, y = reorder(predictor, median_importance))) + +# geom_point(color = "steelblue", size = 3) + +# geom_errorbarh(aes(xmin = lcl_importance, xmax = ucl_importance), +# height = 0.2, +# color = "gray50" +# ) + +# labs( +# title = paste("Importance -", cp), +# x = "Median Increase in MSE (SD)", +# y = "" +# ) + +# theme_minimal() + +# ## 2. 
Create Partial Dependence Plot for the top predictor +# pd_data1 <- pdp::partial( +# object = model, +# pred.var = top_predictors[1], +# pred.data = cov_df, +# train = cov_df, +# plot = FALSE +# ) +# ## Partial dependence for predictor 1 +# p_partial1 <- ggplot(pd_data1, aes_string(x = top_predictors[1], y = "yhat")) + +# geom_line(color = "steelblue", size = 1.2) + +# geom_rug( +# data = cov_df, aes_string(x = top_predictors[1]), +# sides = "b", alpha = 0.5 +# ) + +# labs( +# title = paste("Partial Dependence -", top_predictors[1]), +# x = top_predictors[1], +# y = paste("Predicted", cp) +# ) + +# theme_minimal() + +# ## Partial dependence for predictor 2 +# pd_data2 <- pdp::partial( +# object = model, +# pred.var = top_predictors[2], +# pred.data = cov_df, +# plot = TRUE, +# train = cov_df, +# parallel = TRUE +# ) + +# p_partial2 <- ggplot(pd_data2, aes_string(x = top_predictors[2], y = "yhat")) + +# geom_line(color = "steelblue", size = 1.2) + +# geom_rug( +# data = cov_df, aes_string(x = top_predictors[2]), +# sides = "b", alpha = 0.5 +# ) + +# labs( +# title = paste("Partial Dependence -", top_predictors[2]), +# x = top_predictors[2], +# y = paste("Predicted", cp) +# ) + +# theme_minimal() + +# combined_plot <- p_importance + p_partial1 + p_partial2 + plot_layout(ncol = 3) + +# output_file <- here("downscale/figures", paste0(cp, "_importance_partial_plots.png")) +# ggsave( +# filename = output_file, +# plot = combined_plot, +# width = 14, height = 6, dpi = 300, bg = "white" +# ) + +# # also save pdp-generated plot +# pdp_plots <- p_data1 + p_data2 +# ggsave(pdp_plots, +# filename = here::here("downscale/figures", paste0(cp, "_PDP_", +# top_predictors[1], "_", top_predictors[2], ".png")), +# width = 6, height = 4, dpi = 300, bg = "white" +# ) +# } + +## Using randomForest::partialPlot() +# Combined importance + partial plots for each carbon pool + + +# for (cp in cpools) { +# # Top 2 predictors for this carbon pool +# top_predictors <- importance_summary |> +# filter(carbon_pool == cp) |> +# arrange(desc(median_importance)) |> +# slice_head(n = 2) |> +# pull(predictor) + +# # Prepare model and subset of covariates for plotting +# model <- downscale_output_list[[cp]][["model"]][[1]] +# cov_df <- covariates_df + +# # Set up PNG for three panel plot +# png( +# filename = here::here("downscale/figures", paste0(cp, "_importance_partial_plots.png")), +# width = 14, height = 6, units = "in", res = 300, bg = "white" +# ) +# par(mfrow = c(1, 3)) + +# # Panel 1: Variable importance plot +# cp_importance <- importance_summary |> filter(carbon_pool == cp) +# par(mar = c(5, 10, 4, 2)) +# with( +# cp_importance, +# dotchart(median_importance, +# labels = reorder(predictor, median_importance), +# xlab = "Median Increase MSE (SD)", +# main = paste("Importance -", cp), +# pch = 19, col = "steelblue", cex = 1.2 +# ) +# ) +# with( +# cp_importance, +# segments(lcl_importance, +# seq_along(predictor), +# ucl_importance, +# seq_along(predictor), +# col = "gray50" +# ) +# ) + +# # Panel 2: Partial plot for top predictor +# par(mar = c(5, 5, 4, 2)) +# randomForest::partialPlot(model, +# pred.data = cov_df, +# x.var = top_predictors[1], +# main = paste("Partial Dependence -", top_predictors[1]), +# xlab = top_predictors[1], +# ylab = paste("Predicted", cp), +# col = "steelblue", +# lwd = 2 +# ) + +# # Panel 3: Partial plot for second predictor +# randomForest::partialPlot(model, +# pred.data = cov_df, +# x.var = top_predictors[2], +# main = paste("Partial Dependence -", top_predictors[2]), +# xlab = 
top_predictors[2], +# ylab = paste("Predicted", cp), +# col = "steelblue", +# lwd = 2 +# ) +# dev.off() # Save combined figure +# } diff --git a/downscale/04_downscaling_documentation_results.qmd b/downscale/04_downscaling_documentation_results.qmd index a69fe99..245057d 100644 --- a/downscale/04_downscaling_documentation_results.qmd +++ b/downscale/04_downscaling_documentation_results.qmd @@ -211,7 +211,6 @@ Builds a Random Forest model for each ensemble member of each output to predict ## Results -**County Carbon Stocks and Densities** ### County-Level Carbon Stock and Density Maps The following maps illustrate the spatial variation and uncertainty (mean and standard deviation) of the predicted carbon pools at the county level. @@ -238,6 +237,9 @@ The following plots show the variable importance from the random forest models u Variable importance quantifies how useful each covariate is in predicting the carbon stock. Partial dependence plots show the marginal effect of individual predictors on model response after averaging over the other predictors. +![](figures/importance.png) + + ### Searchable Table The table below provides a searchable summary of the county-level carbon stocks and densities. ```{r} + outdir <- "/projectnb/dietzelab/ccmmf/ccmmf_phase_1b_20250319064759_14859/output/out" # Load county summaries data -county_summaries <- read.csv(file.path(outdir, "county_summaries.csv")) - +county_summaries <- readr::read_csv(file.path(outdir, "county_summaries.csv"), + show_col_types = FALSE) +#colnames(county_summaries) # Combine mean and SD into a single column for carbon density -county_summaries <- county_summaries |> - dplyr::mutate( - `Mean Total C (Tg/ county)` = paste0( - signif(mean_total_c, 2), - " (", signif(sd_total_c, 2), ")" - ), - `Mean C Density (kg/ha)` = paste0( - signif(mean_c_density, 2), - " (", signif(sd_c_density, 2), ")" - ) - ) |> - dplyr::rename( - `Carbon Pool` = carbon_pool, - `County` = county - ) |> - dplyr::select(`Carbon Pool`, `County`, `Mean Total C (Tg/ county)`, `Mean C Density (kg/ha)`) +county_summaries_table <- county_summaries |> + dplyr::mutate( + `Mean Total C (Tg/county)` = paste0( + signif(co_mean_c_total_Tg, 2), + " (", signif(co_sd_c_total_Tg, 2), ")" + ), + `Mean C Density (Mg/ha)` = paste0( + signif(co_mean_c_density_Mg_ha, 2), + " (", signif(co_sd_c_density_Mg_ha, 2), ")" + ) + ) |> + dplyr::rename( + `Carbon Pool` = carbon_pool, + `County` = county, + `# Fields` = n + ) |> + dplyr::select(`Carbon Pool`, `County`, `# Fields`, `Mean Total C (Tg/county)`, `Mean C Density (Mg/ha)`) # Create Table # TODO # - Fix point w/ missing county -# - Add n to table htmlwidgets::setWidgetIdSeed(123) # required to embed table self-contained in html options(htmlwidgets.TEMP_DIR = "htmlwidgets") DT::datatable( - county_summaries, + county_summaries_table, options = list( pageLength = 10, searchHighlight = TRUE From 4c38cb10c6578424bdf3ff679cff47646924623e Mon Sep 17 00:00:00 2001 From: David LeBauer Date: Wed, 16 Apr 2025 09:31:18 -0700 Subject: [PATCH 48/49] Update downscale/01_cluster_and_select_design_points.R Co-authored-by: Chris Black --- downscale/01_cluster_and_select_design_points.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/downscale/01_cluster_and_select_design_points.R b/downscale/01_cluster_and_select_design_points.R index 8b973a4..8ca3b97 100644 --- a/downscale/01_cluster_and_select_design_points.R +++ b/downscale/01_cluster_and_select_design_points.R @@ -36,7 +36,7 @@ library(GGally) # settings set.seed(42) 
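One way to handle the "point with missing county" noted in the TODO above is a nearest-feature fallback after the intersection join. A minimal sketch with toy points, assuming the workflow's county layer is already loaded as county_boundaries with its name column; the point coordinates and object names are hypothetical:

library(sf)
library(dplyr)

# toy points; in the workflow these would be field centroids in EPSG:3310
pts <- st_as_sf(
  data.frame(id = 1:2, x = c(-121.9, -121.5), y = c(38.7, 38.6)),
  coords = c("x", "y"), crs = 4326
) |>
  st_transform(3310)

counties <- county_boundaries |> st_transform(3310)

joined <- st_join(pts, counties, join = st_intersects, left = TRUE)

# fall back to the nearest county for any point that missed every polygon
missing <- is.na(joined$name)
if (any(missing)) {
  nearest <- st_nearest_feature(joined[missing, ], counties)
  joined$name[missing] <- counties$name[nearest]
}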
-ca_albers_crs <- 3310 # use California Albers project (EPSG:3310) for speed,
+ca_albers_crs <- 3310 # use California Albers projection (EPSG:3310) for speed
 data_dir <- "/projectnb/dietzelab/ccmmf/data"
 #'
 #' ## Load Site Environmental Data Covariates

From 2271d7d5b6d91bf411eafac4f0658de4e611485b Mon Sep 17 00:00:00 2001
From: David LeBauer
Date: Wed, 16 Apr 2025 17:30:25 -0400
Subject: [PATCH 49/49] moved .future.R and fixed spelling in 03_ **aggregate**

---
 .future.R => downscale/.future.R                                   | 0
 .../{03_downscale_and_agregate.R => 03_downscale_and_aggregate.R}  | 0
 2 files changed, 0 insertions(+), 0 deletions(-)
 rename .future.R => downscale/.future.R (100%)
 rename downscale/{03_downscale_and_agregate.R => 03_downscale_and_aggregate.R} (100%)

diff --git a/.future.R b/downscale/.future.R
similarity index 100%
rename from .future.R
rename to downscale/.future.R
diff --git a/downscale/03_downscale_and_agregate.R b/downscale/03_downscale_and_aggregate.R
similarity index 100%
rename from downscale/03_downscale_and_agregate.R
rename to downscale/03_downscale_and_aggregate.R
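For reference, .future.R is the future package's optional per-project startup script, sourced when the package is attached if present; its contents are not shown in this patch series, so the following is only a plausible sketch of what such a file typically contains, not the actual file:

# plausible contents of downscale/.future.R (illustrative only)
library(future)
plan(multisession, workers = max(1, parallelly::availableCores() - 1))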