PHSKC-APDE · rwbuie · Mar 31, 2025 · Mar 18, 2025 · Mar 19, 2025 · Mar 24, 2025
diff --git a/R/chi_drop_illogical_ages.R b/R/chi_drop_illogical_ages.R
@@ -79,14 +79,13 @@ chi_drop_illogical_ages <- function(ph.data, agevar = 'chi_age') {
 
     # Create a standardized version of the age group
     ph.data[, (temp_catgroup) := data.table::fcase(
-      get(catgroup) == '<1', '0-0',
+      get(catgroup) == '<1' & grepl(' age$|^age$', get(catnum), ignore.case = T), '0-0',
 
-      grepl("<", get(catgroup)), gsub("<", "0-", get(catgroup)),
+      grepl("<", get(catgroup)) &grepl(' age$|^age$', get(catnum), ignore.case = T), gsub("<", "0-", get(catgroup)),
 
-      grepl("\\+", get(catgroup)), gsub("\\+", "-120", get(catgroup)),
+      grepl("\\+", get(catgroup))& grepl(' age$|^age$', get(catnum), ignore.case = T), gsub("\\+", "-120", get(catgroup)),
 
-      grepl('-', get(catgroup)), get(catgroup)
-    )]
+      grepl('-', get(catgroup))& grepl(' age$|^age$', get(catnum), ignore.case = T), as.character(get(catgroup)))]
 
     # Extract min and max age
     ph.data[, "min_age" := as.numeric(gsub("-.*", "", get(temp_catgroup)))]

diff --git a/R/chi_generate_tro_shell.R b/R/chi_generate_tro_shell.R
@@ -26,7 +26,6 @@
 #'   }
 #'
 #' @param ph.analysis_set name of data.table to parse
-#' @param start.year the earliest year to be used for estimates
 #' @param end.year the latest year to be used for aggregate estimates
 #' @param year.span the number of years to be included in a single non-trend period
 #' @param trend.span the number of years to be included in a single trend period
@@ -54,52 +53,50 @@
 #' @export
 #'
 chi_generate_tro_shell <- function(ph.analysis_set,
-                                      start.year,
-                                      end.year,
-                                      year.span = NULL,
-                                      trend.span = NULL,
-                                      trend.periods = NULL){
+                                   end.year,
+                                   year.span = NULL,
+                                   trend.span = NULL,
+                                   trend.periods = NULL){
 
   # Input validation
-    if (missing(ph.analysis_set)) stop("\n\U1F6D1 ph.analysis_set must be provided")
-    if (!is.data.frame(ph.analysis_set)) stop("\n\U1F6D1 ph.analysis_set must be a data.frame or data.table")
-
-    if (missing(start.year)) stop("\n\U1F6D1 start.year must be provided")
-    if (!is.numeric(start.year) || length(start.year) != 1) stop("\n\U1F6D1 start.year must be a single numeric value")
+  if (missing(ph.analysis_set)) stop("\n\U1F6D1 ph.analysis_set must be provided")
+  if (!is.data.frame(ph.analysis_set)) stop("\n\U1F6D1 ph.analysis_set must be a data.frame or data.table")
+  if (!("set" %in% names(ph.analysis_set)) | anyNA(ph.analysis_set$set)) {
+    stop("\n\u1F6D1 set number must be provided for all rows")
+  }
+  if (missing(end.year)) stop("\n\U1F6D1 end.year must be provided")
+  if (!is.numeric(end.year) || length(end.year) != 1) stop("\n\U1F6D1 end.year must be a single numeric value")
 
-    if (missing(end.year)) stop("\n\U1F6D1 end.year must be provided")
-    if (!is.numeric(end.year) || length(end.year) != 1) stop("\n\U1F6D1 end.year must be a single numeric value")
+  if (!is.null(year.span) && (!is.numeric(year.span) || length(year.span) != 1)) {
+    stop("\n\U1F6D1 year.span must be NULL or a single numeric value")
+  }
 
-    if (!is.null(year.span) && (!is.numeric(year.span) || length(year.span) != 1)) {
-      stop("\n\U1F6D1 year.span must be NULL or a single numeric value")
-    }
+  if (!is.null(trend.span) && (!is.numeric(trend.span) || length(trend.span) != 1)) {
+    stop("\n\U1F6D1 trend.span must be NULL or a single numeric value")
+  }
 
-    if (!is.null(trend.span) && (!is.numeric(trend.span) || length(trend.span) != 1)) {
-      stop("\n\U1F6D1 trend.span must be NULL or a single numeric value")
-    }
+  if (!is.null(trend.periods) && (!is.numeric(trend.periods) || length(trend.periods) != 1)) {
+    stop("\n\U1F6D1 trend.periods must be NULL or a single numeric value")
+  }
 
-    if (!is.null(trend.periods) && (!is.numeric(trend.periods) || length(trend.periods) != 1)) {
-      stop("\n\U1F6D1 trend.periods must be NULL or a single numeric value")
-    }
 
-    # Convert to data.table if needed
-    if (!is.data.table(ph.analysis_set)) setDT(ph.analysis_set)
+  # Convert to data.table if needed
+  if (!is.data.table(ph.analysis_set)) setDT(ph.analysis_set)
 
-  #parameterization checks
+  # parameterization checks
   if("x" %in% ph.analysis_set$trends & (is.null(trend.span) | is.null(trend.periods))) {stop("you have indicated that a trends analysis is to be conducted, but have not indicated both the span and number of periods for this analysis.")}
 
-  #ph.analysis_set checks
-
-
-  #advisory messages
+  # advisory messages
   if("x" %in% ph.analysis_set$trends) {message("Note: trends are applied backwards from end.year")}
 
   # Race / ethnicity is a chronic headache with CHI. Need to remove rows for race4 & Ethnicity because should be Race/ethnicity
   ph.analysis_set <- ph.analysis_set[!(cat1_varname == 'race4' & cat1 == 'Ethnicity')]
 
   # apply the template generating function
+  # generate vector of sets
+  sets <- unique(ph.analysis_set$set)
   template <- rbindlist(
-    lapply(X = seq(1, length(unique(ph.analysis_set$set))),
+    lapply(X = sets,
            FUN = chi_process_nontrends, ph.analysis_set = ph.analysis_set))
 
   # split trends from other tabs because processed for multiple years
@@ -117,9 +114,9 @@ chi_generate_tro_shell <- function(ph.analysis_set,
   # add years to template (trends)
   if(nrow(template.trends) > 0){
     trend.years <- chi_process_trends(indicator_key = intersect(unique(template$indicator_key), unique(template.trends$indicator_key)),
-                                            trend.span = trend.span,
-                                            end.year = end.year,
-                                            trend.periods = trend.periods)
+                                      trend.span = trend.span,
+                                      end.year = end.year,
+                                      trend.periods = trend.periods)
     template.trends <- merge(template.trends, trend.years, by = 'indicator_key', all = T, allow.cartesian = T)
 
     # append trends template to main template

diff --git a/R/chi_get_cols.R b/R/chi_get_cols.R
@@ -0,0 +1,25 @@
+#' Get CHI variable column names
+#'
+#' @description
+#' Returns a character vector of column names defined in the CHI YAML reference file.
+#' This helper function provides easy access to the standardized CHI variable names.
+#'
+#' @param metadata returns metadata column names instead of primary data
+#'
+#' @return A character vector of column names for the chi data (Default) or metadata
+#' @importFrom yaml read_yaml
+#' @export
+#'
+#' @examples
+#' cols <- chi_get_cols()
+chi_get_cols <- function(metadata = FALSE) {
+  chi.yaml.filepath <- system.file("ref", "chi_qa.yaml", package = "apde.chi.tools")
+  if (chi.yaml.filepath == "") {
+    stop("Could not find reference file chi_qa.yaml")
+  }
+  chi.yaml <- yaml::read_yaml(chi.yaml.filepath)
+  if(metadata){
+    return(names(chi.yaml$metadata))
+  }
+  return(names(chi.yaml$vars))
+}
diff --git a/R/chi_get_yaml.R b/R/chi_get_yaml.R
@@ -1,23 +1,3 @@
-#' Get CHI variable column names
-#'
-#' Returns a character vector of column names defined in the CHI YAML reference file.
-#' This helper function provides easy access to the standardized CHI variable names.
-#'
-#' @return A character vector of column names
-#' @importFrom yaml read_yaml
-#' @export
-#'
-#' @examples
-#' cols <- chi_get_cols()
-chi_get_cols <- function() {
-  chi.yaml.filepath <- system.file("ref", "chi_qa.yaml", package = "apde.chi.tools")
-  if (chi.yaml.filepath == "") {
-    stop("Could not find reference file chi_qa.yaml")
-  }
-  chi.yaml <- yaml::read_yaml(chi.yaml.filepath)
-  return(names(chi.yaml$vars))
-}
-
 #' Get CHI YAML configuration
 #'
 #' Returns the complete CHI YAML configuration as a list.

diff --git a/R/chi_qa_tro.R b/R/chi_qa_tro.R
@@ -148,7 +148,7 @@ chi_qa_tro <- function(CHIestimates,
     }
   }
 
-  missing.var <- setdiff(names(CHImetadata), names(unlist(chi_get_yaml()$metadata)))
+  missing.var <- setdiff(names(CHImetadata), chi_get_cols(metadata = TRUE))
   if(length(missing.var) > 0){
     status <- 0
     if(verbose){

diff --git a/inst/ref/chi_qa.yaml_pre2025.yaml b/inst/ref/chi_qa.yaml_pre2025.yaml
@@ -0,0 +1,62 @@
+schema: APDE_WIP
+table:
+years:
+
+vars:
+    data_source: varchar(255)
+    indicator_key: varchar(255)
+    tab: varchar(255)
+    year: varchar(255)
+    cat1: varchar(255)
+    cat1_group: nvarchar(2000)
+    cat1_varname: varchar(255)
+    cat2: varchar(255)
+    cat2_group: nvarchar(2000)
+    cat2_varname: varchar(255)
+    result: float
+    lower_bound: float
+    upper_bound: float
+    se: float
+    rse: float
+    comparison_with_kc: varchar(255)
+    time_trends: varchar(255)
+    significance: varchar(255)
+    caution: varchar(255)
+    suppression: varchar(255)
+    numerator: float
+    denominator: float
+    chi: tinyint
+    source_date: date
+    run_date: date
+
+metadata:
+    data_source: varchar(255)
+    indicator_key: varchar(255)
+    result_type: varchar(255)
+    valence: varchar(255)
+    latest_year: int
+    latest_year_result: float
+    latest_year_kc_pop: int
+    latest_year_count: int
+    map_type: varchar(255)
+    unit: varchar(255)
+    valid_years: varchar(255)
+    chi: tinyint
+    run_date: date
+
+toc:
+    data_source: varchar(255)
+    indicator_key: varchar(255)
+    topic_chi: varchar(255)
+    topic_bsk: varchar(255)
+    topic_bskhs: varchar(255)
+    title_toc: varchar(255)
+    description: varchar(1000)
+    url_backup: varchar(500)
+    latest_year_bk: varchar(255)
+    latest_rate_bk: varchar(255)
+    toc_bsk: float
+    toc_bskhs: float
+    toc_chi: float
+    toc_cc: float
+
diff --git a/man/chi_generate_tro_shell.Rd b/man/chi_generate_tro_shell.Rd
diff --git a/man/chi_get_cols.Rd b/man/chi_get_cols.Rd
diff --git a/quarto_docs/Calculating_Prevalences.qmd b/quarto_docs/Calculating_Prevalences.qmd
@@ -132,7 +132,6 @@ To analyze our data consistently, we need to generate a structured set of calcul
 
 myinstructions <- chi_generate_tro_shell(
   ph.analysis_set = analysis_sets,
-  start.year = latest_year-4, # earliest year to be used for estimates
   end.year = latest_year,     # latest year to be used for aggregate estimates
   year.span = 5,              # number of years included in a single period
   trend.span = 3,             # number of years included in a trend single period

diff --git a/tests/testthat/test-chi_generate_tro_shell.R b/tests/testthat/test-chi_generate_tro_shell.R
@@ -2,8 +2,8 @@ test_that("chi_generate_tro_shell validates inputs correctly", {
   test_data <- setup_test_data()
 
   expect_error(chi_generate_tro_shell(), "ph.analysis_set must be provided")
-  expect_error(chi_generate_tro_shell(data.frame(), start.year = "2023"),
-               "start.year must be a single numeric value")
+  expect_error(chi_generate_tro_shell(data.frame()),
+               "set number must be provided for all rows")
 })
 
 test_that("ingest template format", {
@@ -29,7 +29,6 @@ test_that("ingest template format", {
                          trends,
                          set_indicator_keys)
   DT <- chi_generate_tro_shell(ph.analysis_set = template,
-                               start.year = 2021,
                                end.year = 2022,
                                year.span = 5,
                                trend.span = 3,