diff --git a/DESCRIPTION b/DESCRIPTION index 05cc95d..556ff91 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -35,6 +35,7 @@ Imports: Remotes: github::PHSKC-APDE/rads, github::PHSKC-APDE/dtsurvey Suggests: + devtools, httr, keyring, knitr, diff --git a/NAMESPACE b/NAMESPACE index 6248b5c..bec5b5c 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -15,6 +15,7 @@ export(chi_get_proper_pop) export(chi_get_yaml) export(chi_keep_proper_ages) export(chi_qa_tro) +export(chi_suppress_results) export(chi_update_sql) import(data.table) import(dtsurvey) @@ -30,10 +31,12 @@ importFrom(DBI,dbGetQuery) importFrom(DBI,dbWriteTable) importFrom(data.table,"%between%") importFrom(data.table,":=") +importFrom(data.table,':=') importFrom(data.table,.GRP) importFrom(data.table,.SD) importFrom(data.table,CJ) importFrom(data.table,`:=`) +importFrom(data.table,alloc.col) importFrom(data.table,as.data.table) importFrom(data.table,between) importFrom(data.table,copy) @@ -46,18 +49,23 @@ importFrom(data.table,rbindlist) importFrom(data.table,set) importFrom(data.table,setDT) importFrom(data.table,setcolorder) +importFrom(data.table,setkey) importFrom(data.table,setnames) importFrom(data.table,setorder) importFrom(data.table,setorderv) importFrom(data.table,tstrsplit) importFrom(data.table,uniqueN) +importFrom(future,plan) importFrom(future.apply,future_lapply) importFrom(glue,glue) importFrom(glue,glue_sql) importFrom(odbc,odbc) +importFrom(progressr,handler_progress) importFrom(progressr,handlers) importFrom(progressr,progressor) importFrom(progressr,with_progress) +importFrom(qs,qread) +importFrom(qs,qsave) importFrom(rads,calc) importFrom(rads,round2) importFrom(rads,string_clean) diff --git a/R/chi_calc.R b/R/chi_calc.R index 7d0fb0d..0d06d75 100644 --- a/R/chi_calc.R +++ b/R/chi_calc.R @@ -267,15 +267,32 @@ chi_calc <- function(ph.data = NULL, # Tidy results ---- - # When there are no observation in a row, rads::calc() gives NA for numerator, but want zero ---- - tempCHIest[is.na(numerator), `:=` 
( + # When we have no events, but a valid denominator ---- + tempCHIest[ + (is.na(numerator) | numerator == 0) & !is.na(denominator) & denominator > 0, + `:=` ( result = 0, se = 0, lower_bound = 0, upper_bound = 0, - numerator = 0, + numerator = ifelse(is.na(numerator), 0, numerator), rse = NA # undefined because mean will be zero - )] + ) + ] + + # When we have no events in the denominator ---- + tempCHIest[ + (is.na(denominator) | denominator == 0), + `:=` ( + result = NA, + se = NA, + lower_bound = NA, + upper_bound = NA, + numerator = 0, # This might not be necessary depending on your needs + denominator = 0, # Setting NA denominator to 0 + rse = NA + ) + ] # drop when cat1_group is missing (e.g., cat1 == 'Regions' and region is NA) ---- tempCHIest <- tempCHIest[!is.na(cat1_group)] @@ -367,10 +384,10 @@ chi_calc <- function(ph.data = NULL, tempCHIest[, data_source := source_name] if(small_num_suppress == TRUE){ - tempCHIest <- rads::suppress(sup_data = tempCHIest, - suppress_range = c(suppress_low, suppress_high), - secondary = T, - secondary_exclude = cat1_varname != 'race3') + tempCHIest <- apde.chi.tools::chi_suppress_results(ph.data = tempCHIest, + suppress_range = c(suppress_low, suppress_high), + secondary = T, + secondary_exclude = cat1_varname != 'race3') } else {tempCHIest[, suppression := NA_character_]} tempCHIest[rse>=30 | numerator == 0, caution := "!"] diff --git a/R/chi_compare_estimates.R b/R/chi_compare_estimates.R index d784505..f247124 100644 --- a/R/chi_compare_estimates.R +++ b/R/chi_compare_estimates.R @@ -84,8 +84,8 @@ chi_compare_estimates <- function(OLD = NULL, NEW = NULL, OLD.year = NULL, NEW.y # calculate percent differences between old (x) and new(y) comp[, relative.diff := round2(abs((result.x - result.y) / result.x)*100, 1)] - comp[result_type != "rate", absolute.diff := round2(abs(result.x - result.y)*100, 1)] - comp[result_type == "rate", absolute.diff := round2(abs(result.x - result.y), 1)] + comp[grepl("mean|proportion", 
result_type, ignore.case = T), absolute.diff := round2(abs(result.x - result.y)*100, 1)] + comp[grepl("rate", result_type, ignore.case = T), absolute.diff := round2(abs(result.x - result.y), 1)] comp <- comp[!is.na(absolute.diff)] # drop if absolute difference is NA # order variables @@ -111,18 +111,18 @@ chi_compare_estimates <- function(OLD = NULL, NEW = NULL, OLD.year = NULL, NEW.y # First, identify whether to use absolute or relative change for each indicator_key qa_type = NEW[tab=='_kingcounty'] qa_type[, qa_type := 'relative'] - qa_type[result_type=='proportion' & result >= 0.05, qa_type := 'absolute'] - qa_type[result_type=='rate' & result >= 5, qa_type := 'absolute'] + qa_type[grepl('mean|proportion', result_type, ignore.case = T) & result >= 0.05, qa_type := 'absolute'] + qa_type[grepl('rate', result_type, ignore.case = T) & result >= 5, qa_type := 'absolute'] qa_type <- qa_type[, list(indicator_key, qa_type)] # Merge on qa_type comp <- merge(comp, qa_type, by = 'indicator_key', all = T) # Identify notable changes - comp[qa_type == 'absolute' & result_type=='proportion' & absolute.diff >= 3, notable := 1] # absolute difference was multiplied by 100, so assess with >= 3 - comp[qa_type == 'absolute' & result_type=='rate' & absolute.diff >= 3, notable := 1] - comp[qa_type == 'relative' & result_type=='proportion' & relative.diff >= 50, notable := 1] # absolute difference was multiplied by 100, so assess with >= 3 - comp[qa_type == 'relative' & result_type=='rate' & relative.diff >= 50, notable := 1] + comp[qa_type == 'absolute' & grepl('mean|proportion', result_type, ignore.case = T) & absolute.diff >= 3, notable := 1] # absolute difference was multiplied by 100, so assess with >= 3 + comp[qa_type == 'absolute' & grepl('rate', result_type, ignore.case = T) & absolute.diff >= 3, notable := 1] + comp[qa_type == 'relative' & grepl('mean|proportion', result_type, ignore.case = T) & relative.diff >= 50, notable := 1] # absolute difference was multiplied by 100, so 
assess with >= 3 + comp[qa_type == 'relative' & grepl('rate', result_type, ignore.case = T) & relative.diff >= 50, notable := 1] # Return object ---- return(comp) diff --git a/R/chi_get_proper_pop.R b/R/chi_get_proper_pop.R index 2f673f9..d521204 100644 --- a/R/chi_get_proper_pop.R +++ b/R/chi_get_proper_pop.R @@ -9,533 +9,252 @@ #' @param pop.template A data.table produced by \code{\link{chi_generate_instructions_pop}}, #' containing instructions for population data retrieval with the following columns: #' \itemize{ -#' \item year: Year range (e.g., "2019-2021" or single year) -#' \item cat1, cat1_varname: Primary stratification variables -#' \item cat2, cat2_varname: Secondary stratification variables -#' \item tab: Visualization tab type -#' \item start, stop: Start and end years parsed from the year range -#' \item race_type: Race categorization type ('race', 'race_eth', or 'race_aic') -#' \item geo_type: Geographic level for analysis ('kc', 'hra', 'region', 'blk', 'zip', 'wa') -#' \item group_by1, group_by2: Race/eth grouping specifications ('race_eth', 'race') +#' \item \code{year}: Year range (e.g., "2019-2021" or single year) +#' \item \code{cat1}, \code{cat1_varname}: Primary stratification variables +#' \item \code{cat2}, \code{cat2_varname}: Secondary stratification variables +#' \item \code{tab}: Visualization tab type +#' \item \code{start}, \code{stop}: Start and end years parsed from the year range +#' \item \code{race_type}: Race categorization type ('race', 'race_eth', or 'race_aic') +#' \item \code{geo_type}: Geographic level for analysis ('kc', 'hra', 'region', 'blk', 'zip', 'wa') +#' \item \code{group_by1}, group_by2: Race/eth grouping specifications ('race_eth', 'race') #' } #' @param pop.genders Optional character vector specifying which genders to include. -#' Valid values are 'f', 'female', 'm', 'male'. If NULL (default), both genders are included. +#' Valid values are \code{'f'}, \code{'female'}, \code{'m'}, \code{'male'}. 
If NULL (default), both genders are included. #' @param pop.ages Optional integer vector specifying which ages to include. -#' If NULL (default), ages 0-100 are included. +#' If \code{NULL} (default), ages 0-100 are included. +#' @param is_chars Logical (T|F). If \code{TRUE}, will calculate King County population based +#' on ZIP codes beginning with 980 or 981, which is necessary for CHARS data. +#' Default \code{is_chars = FALSE}. #' #' @return A data.table containing population counts with columns: #' \itemize{ -#' \item chi_age: Single year age -#' \item year: Year -#' \item cat1, cat1_varname, cat1_group: Primary stratification variables -#' \item cat2, cat2_varname, cat2_group: Secondary stratification variables -#' \item tab: Visualization tab type -#' \item pop: Population count +#' \item \code{chi_age}: Single year age +#' \item \code{year}: Year +#' \item \code{cat1}, cat1_varname, cat1_group: Primary stratification variables +#' \item \code{cat2}, cat2_varname, cat2_group: Secondary stratification variables +#' \item \code{tab}: Visualization tab type +#' \item \code{pop}: Population count #' } #' #' @details -#' The function performs multiple steps to generate proper population denominators: -#' 1. Validates input parameters -#' 2. Creates population tables for each row of the template using rads::get_population -#' 3. Handles special cases for various geographic aggregations and crosswalks -#' 4. Returns a comprehensive, tidy dataset with population counts +#' The function: +#' +#' 1. Validates inputs +#' +#' 2. Batches similar population queries based on key parameters +#' +#' 3. Makes minimal database calls to cover all required year ranges +#' +#' 4. Post-processes data in R to match the original template requirements +#' +#' 5. 
When is_chars=TRUE, uses a different approach for King County that aggregates +#' populations from ZIP codes starting with 980/981 instead of using the standard +#' King County boundary +#' +#' For better performance, we recommended using futures for parallel processing. Here is one suggested set up: +#' +#' \preformatted{ +#' library(future) +#' +#' options(future.globals.maxSize = 3000 * 1024^2) # this sets a 3GB limit per future +#' +#' num.cores <- future::availableCores() - 1 # use one less than the available cores +#' +#' plan(multisession(workers = num.cores[1])) # set up a multisession plan +#' } #' #' @seealso #' \code{\link{chi_generate_instructions_pop}} which generates the instructions used as input -#' to this function +#' to this function, \code{\link{chi_get_proper_pop}} the original implementation #' -#' @importFrom data.table setDT rbindlist `:=` +#' @importFrom data.table alloc.col copy rbindlist set setkey uniqueN `:=` +#' @importFrom qs qread qsave +#' @importFrom future plan #' @importFrom future.apply future_lapply #' @importFrom tools toTitleCase -#' @importFrom progressr handlers progressor with_progress +#' @importFrom progressr handlers progressor with_progress handler_progress #' @export #' -chi_get_proper_pop <- function(pop.template = NULL, pop.genders = NULL, pop.ages = NULL) { - # Validation of arguments ---- - # pop.template - if (is.null(pop.template)) { - stop("\n\U1F6D1 pop.template parameter is required") - } - - if (!is.data.frame(pop.template)) { - stop("\n\U1F6D1 pop.template must be a data.frame or data.table") - } else { - pop.template <- setDT(copy(pop.template)) - } - - # Check for required columns - required_columns <- c("year", "cat1", "cat1_varname", "cat2", "cat2_varname", - "start", "stop", "race_type", "geo_type", "tab") - missing_columns <- required_columns[!required_columns %in% names(pop.template)] - - if (length(missing_columns) > 0) { - stop("\n\U1F6D1 pop.template is missing required columns: ", - 
paste(missing_columns, collapse = ", ")) - } - - # pop.genders - if (is.null(pop.genders)) { - gender_values <- c("f", "m") - } else { - if (!tolower(pop.genders) %in% c('f', 'female', 'm', 'male')) { - stop("\n\U0001f47f if pop.genders is specified, it is limited to one of the following values: 'F', 'f', 'Female', 'female', 'M', 'm', 'Male', or 'male'") - } else { - gender_values <- pop.genders - } - } - # pop.ages - if (is.null(pop.ages)) { - age_values <- c(0:100) - } else { - if (!is.integer(pop.ages)) { - stop("\n\U0001f47f if pop.ages is specified it must be vector of integers, e.g., c(0:65)") - } else { - age_values <- pop.ages - } +chi_get_proper_pop <- function(pop.template = NULL, + pop.genders = NULL, + pop.ages = NULL, + is_chars = FALSE) { + # STEP 1: Validate and prepare inputs ---- + validated <- validate_inputs(pop.template, pop.genders, pop.ages, is_chars) + pop.template <- validated$pop.template + gender_values <- validated$gender_values + age_values <- validated$age_values + + # STEP 2: Batch similar queries ---- + pop.template <- standardize_category_names(pop.template) + pop.template <- create_query_keys(pop.template) + batched <- batch_population_queries(pop.template) + pop.template <- batched$pop.template + batched_queries <- batched$batched_queries + + # STEP 3: Get population data for batched queries ---- + message("\U023F3 Pulling ", nrow(batched_queries), " population tables from SQL ... 
be patient!", + "\n Key: cat1||cat1_varname||cat2||cat2_varname||race_type||geo_type||group_by1||group_by2") + + # If futures are NOT used: run sequentially ---- + if (inherits(future::plan(), "sequential")) { + message("Processing queries sequentially...") + all_population_data <- data.table::rbindlist( + lapply( + X = batched_queries$batched_id, + FUN = function(query_id) { + message(paste0("Processing ", query_id, " of ", + data.table::uniqueN(batched_queries$batched_id), ": ", + batched_queries[batched_id == query_id]$query_key)) + get_batched_population(query_id, + pop.template, + batched_queries, + gender_values, + age_values, + is_chars) + } + ), + fill = TRUE + ) } - # Define core population extraction function ---- - get_population_for_template_row <- function(row_index, pop.template = NULL) { - # Status update with progress indicator ---- - print(paste0("Process ID ", Sys.getpid(), ": Getting population ", row_index, " out of ", nrow(pop.template))) - - # Extract current template row ---- - current_row <- pop.template[row_index, ] - - # Standardize category names ---- - # Remove birthing person prefixes to standardize maternal data categories - pop.template[grepl("birthing person", cat1, ignore.case = TRUE), - cat1 := tools::toTitleCase(gsub("Birthing person's ", "", cat1))] - pop.template[grepl("birthing person", cat2, ignore.case = TRUE), - cat2 := tools::toTitleCase(gsub("Birthing person's ", "", cat2))] - - # Build grouping parameters for pop query ---- - grouping_vars <- unique(c( - c("ages", "geo_id"), - setdiff(c(current_row$group_by1, current_row$group_by2), c(NA)) - )) - - # Use rads::get_population() ---- - if (is.na(current_row$geo_type)) { - population_data <- rads::get_population( - group_by = grouping_vars, - race_type = current_row$race_type, - years = current_row$start:current_row$stop, - genders = gender_values, - ages = age_values, - round = FALSE - ) - } else { - population_data <- rads::get_population( - group_by = grouping_vars, - 
geo_type = current_row$geo_type, - race_type = current_row$race_type, - years = current_row$start:current_row$stop, - genders = gender_values, - ages = age_values, - round = FALSE + # If futures ARE used: run in parallel ---- + if (!inherits(future::plan(), "sequential")) { + progressr::handlers(progressr::handler_progress()) + progressr::with_progress({ + p <- progressr::progressor(nrow(batched_queries)) + all_population_data <- data.table::rbindlist( + future.apply::future_lapply( + X = batched_queries$batched_id, + FUN = function(query_id) { + p(paste0("Processing ", query_id, " of ", + data.table::uniqueN(batched_queries$batched_id), ": ", + batched_queries[batched_id == query_id]$query_key )) + get_batched_population(query_id, + pop.template, + batched_queries, + gender_values, + age_values, + is_chars) + }, + future.seed = 98104 # Set seed for parallel processes ) - } - - # Process demographic categories ---- - # Apply category grouping logic for both primary (cat1) and secondary (cat2) categories - for (catnum in c("cat1", "cat2")) { - ## Add category information from template to result ---- - catvarname <- paste0(catnum, '_varname') - catgroup <- paste0(catnum, '_group') - temp.groupby <- paste0("group_by", gsub('cat', '', catnum)) - - # had to use set function because regular := syntax caused errors b/c used catnum differently on both sides of := - data.table::set(population_data, - j = catnum, - value = current_row[[catnum]]) - - data.table::set(population_data, - j = catvarname, - value = current_row[[catvarname]]) - - data.table::set(population_data, - j = catgroup, - value = current_row[[temp.groupby]]) - - ## Process geographic categories ---- - # King County - population_data[get(catnum) == "King County", - c(catgroup) := "King County"] - - # Washington State - population_data[get(catnum) == "Washington State", - c(catgroup) := "Washington State"] - - # Handle NA values - suppressWarnings( - population_data[get(catnum) == "NA" | is.na(get(catnum)), - 
c(catnum, catgroup, catvarname) := "NA"] - ) - - # Cities/neighborhoods and Regions - population_data[get(catnum) %in% c("Cities/neighborhoods", "Regions") & - current_row$geo_type != 'blk', - c(catgroup) := geo_id] - - ## Process gender ---- - population_data[get(catnum) %in% c("Gender"), c(catgroup) := gender] - - ## Process 'Overall' ---- - population_data[get(catnum) %in% c("Overall"), c(catgroup) := "Overall"] - - ## Process race/ethnicity categories ---- - population_data[get(catnum) == "Ethnicity" | get(catvarname) %in% c('race4'), - c(catgroup) := race_eth] - - population_data[get(catnum) == 'Race' & get(catvarname) %in% c('race3'), - c(catgroup) := race] - - population_data <- population_data[get(catnum) != "Ethnicity" | (get(catnum) == "Ethnicity" & get(catgroup) == 'Hispanic'), ] - - population_data[get(catgroup) == "Multiple race", - c(catgroup) := "Multiple"] - - ## Process race_aic (alone or in combination) categories ---- - if (current_row$race_type == 'race_aic') { - # Filter to keep only relevant race_aic combinations - population_data <- population_data[ - !(grepl('_aic_', get(catvarname)) & - !((get(catvarname) == 'chi_race_aic_aian' & race_aic == 'AIAN') | - (get(catvarname) == 'chi_race_aic_asian' & race_aic == 'Asian') | - (get(catvarname) == 'chi_race_aic_black' & race_aic == 'Black') | - (get(catvarname) == 'chi_race_aic_his' & race_aic == 'Hispanic') | - (get(catvarname) == 'chi_race_aic_nhpi' & race_aic == 'NHPI') | - (get(catvarname) == 'chi_race_aic_wht' & race_aic == 'White')) - ) - ] - - # Assign race_aic value to group - population_data[grep('_aic', get(catvarname)), - c(catgroup) := race_aic] - } - - ## Process HRAs ---- - if (population_data[1, geo_type] == 'blk' & population_data[1, get(catnum)] == 'Cities/neighborhoods') { - - hra_crosswalk <- rads.data::spatial_block20_to_hra20_to_region20[, list(geo_id = GEOID20, hra20_name)] - - population_data <- merge(population_data, - hra_crosswalk, - by = "geo_id", - all.x = TRUE, - all.y 
= FALSE) - - population_data[, c(catgroup) := hra20_name] - } - - ## Process Regions ---- - # Block to Region crosswalk - if (population_data[1, geo_type] == 'blk' & population_data[1, get(catnum)] == 'Regions') { - - region_crosswalk <- rads.data::spatial_block20_to_hra20_to_region20[, list(geo_id = GEOID20, region_name)] - - population_data <- merge(population_data, - region_crosswalk, - by = 'geo_id', - all.x = TRUE, - all.y = FALSE) - - population_data[, c(catgroup) := region_name] - } - - # HRA to Region crosswalk - if (population_data[1, geo_type] == 'hra' & population_data[1, get(catnum)] == 'Regions') { - - region_crosswalk <- rads.data::spatial_hra20_to_region20[, list(geo_id = hra20_name, region_name)] - - population_data <- merge(population_data, - region_crosswalk, - by = 'geo_id', - all.x = TRUE, - all.y = FALSE) - - population_data[, c(catgroup) := region_name] - } - - # ZIP to Region crosswalk with population weighting - if (population_data[1, geo_type] == 'zip' & population_data[1, get(catnum)] == 'Regions') { - - # Create ZIP to region crosswalk with population weights - zip_region_crosswalk <- rads.data::spatial_zip_to_hra20_pop - - zip_region_crosswalk <- merge(zip_region_crosswalk, - rads.data::spatial_hra20_to_region20[, list(hra20_name, region = region_name)], - by = 'hra20_name', - all = TRUE) - - # Aggregate fractional populations to region level - zip_region_crosswalk <- zip_region_crosswalk[,list(s2t_fraction = sum(s2t_fraction)), - list(geo_id = as.character(source_id), region)] - - # Apply population weighting by region - population_data <- merge(population_data, - zip_region_crosswalk, - by = "geo_id", - all.x = TRUE, - all.y = FALSE, - allow.cartesian = TRUE) - population_data[, pop := pop * s2t_fraction] # Apply weight to population - population_data[, c(catgroup) := region] - } - - ## Process Big Cities ---- - if (population_data[1, get(catnum)] == 'Big cities') { - # Block to big city crosswalk - if (population_data[1, geo_type] == 
'blk') { - - # Two-step crosswalk: block to HRA to big city - block_to_hra <- rads.data::spatial_block20_to_hra20_to_region20[, list(geo_id = GEOID20, hra20_name)] - hra_to_bigcity <- rads.data::spatial_hra20_to_bigcities[, list(hra20_name, bigcity)] - - block_to_bigcity <- merge(hra_to_bigcity, - block_to_hra, - by = 'hra20_name', - all.x = T, - all.y = F)[, hra20_name := NULL] - - population_data <- merge(population_data, - block_to_bigcity, - by = "geo_id", - all.x = TRUE, - all.y = FALSE) - } - - # HRA to big city crosswalk - if (population_data[1, geo_type] == 'hra') { - hra_to_bigcity <- rads.data::spatial_hra20_to_bigcities[, list(hra20_name, bigcity)] - population_data <- merge(population_data, - hra_to_bigcity, - by.x = 'geo_id', - by.y = 'hra20_name', - all.x = TRUE, - all.y = FALSE) - } - - # Assign big city name to group - population_data[, c(catgroup) := bigcity] - } - - ## Process age groupings ---- - # Age 6 groups: <18, 18-24, 25-44, 45-64, 65-74, 75+ - population_data[get(catvarname) == "age6" & age %in% 0:17, - c(catgroup) := "<18"] - population_data[get(catvarname) == "age6" & age %in% 18:24, - c(catgroup) := "18-24"] - population_data[get(catvarname) == "age6" & age %in% 25:44, - c(catgroup) := "25-44"] - population_data[get(catvarname) == "age6" & age %in% 45:64, - c(catgroup) := "45-64"] - population_data[get(catvarname) == "age6" & age %in% 65:74, - c(catgroup) := "65-74"] - population_data[get(catvarname) == "age6" & age >= 75, - c(catgroup) := "75+"] - - # Maternal age 5 groups: 10-17, 18-24, 25-34, 35-44, 45+ - population_data[get(catvarname) == "mage5" & age %in% 10:17, - c(catgroup) := "10-17"] - population_data[get(catvarname) == "mage5" & age %in% 18:24, - c(catgroup) := "18-24"] - population_data[get(catvarname) == "mage5" & age %in% 25:34, - c(catgroup) := "25-34"] - population_data[get(catvarname) == "mage5" & age %in% 35:44, - c(catgroup) := "35-44"] - population_data[get(catvarname) == "mage5" & age >= 45, - c(catgroup) := "45+"] 
- - # Youth age 4 groups: 0-4, 5-9, 10-14, 15-17 - population_data[get(catvarname) == "yage4" & age %in% 0:4, - c(catgroup) := "0-4"] - population_data[get(catvarname) == "yage4" & age %in% 5:9, - c(catgroup) := "5-9"] - population_data[get(catvarname) == "yage4" & age %in% 10:14, - c(catgroup) := "10-14"] - population_data[get(catvarname) == "yage4" & age %in% 15:17, - c(catgroup) := "15-17"] - - ## Process poverty groupings ---- - # Block level poverty - if (population_data[1, geo_type] == 'blk' & - grepl("poverty$", population_data[1, get(catnum)], ignore.case = TRUE)) { - # Extract tract ID from block ID (first 11 characters) - population_data[, geo_tract2020 := substr(geo_id, 1, 11)] - - # Join poverty group data - population_data <- merge( - population_data, - rads.data::misc_poverty_groups[geo_type == 'Tract'][, list(geo_tract2020 = geo_id, pov200grp)], - by = "geo_tract2020", - all.x = TRUE, - all.y = FALSE - ) - - # Assign poverty group - population_data[, c(catgroup) := pov200grp] - } - - # ZIP level poverty - if (population_data[1, geo_type] == 'zip' & - grepl("poverty$", population_data[1, get(catnum)], ignore.case = TRUE)) { - # Join poverty group data - population_data <- merge( - population_data, - rads.data::misc_poverty_groups[geo_type == 'ZCTA'][, list(geo_id, pov200grp)], - by = 'geo_id', - all.x = TRUE, - all.y = FALSE - ) - - # Assign poverty group - population_data[, c(catgroup) := pov200grp] - } - - } # close looping over cat1/cat2 - - # Filter and clean results ---- - # Remove rows with missing primary category group - population_data <- population_data[!is.na(cat1_group)] - - # Remove rows with missing secondary category group (when category exists) - population_data <- population_data[is.na(cat2) | cat2 == 'NA' | - (!is.na(cat2) & cat2 != 'NA' & !is.na(cat2_group) & cat2_group != 'NA'),] - - # Aggregate population data ---- - # Collapse to one row per demographic combination with sum of population - population_data <- population_data[, 
list(pop = sum(pop)), - list(chi_age = age, - year, - cat1, cat1_varname, cat1_group, - cat2, cat2_varname, cat2_group)] - - # Generate complete demographic combinations ---- - # Function to handle age group processing and demographic merging - process_age_category <- function(population_data, cat_num) { - # Define prefix and complementary prefix - cat_prefix <- paste0("cat", cat_num) - other_cat_prefix <- if(cat_num == 1) "cat2" else "cat1" # need to select the cat that does not have the age var - - # Get the variable name for this category - cat_varname <- population_data[1][[paste0(cat_prefix, "_varname")]] - - # Create appropriate age groups based on cat_varname - if (cat_varname == 'age6') { - age_groups <- data.table( - chi_age = 0:100 - ) - age_groups[, paste0(cat_prefix, "_group") := cut( - chi_age, - breaks = c(-1, 17, 24, 44, 64, 74, 120), - labels = c("<18", "18-24", "25-44", "45-64", "65-74", "75+") - )] - } else if (cat_varname == 'mage5') { - age_groups <- data.table( - chi_age = 10:100 - ) - age_groups[, paste0(cat_prefix, "_group") := cut( - chi_age, - breaks = c(9, 17, 24, 34, 44, 120), - labels = c('10-17', '18-24', '25-34', '35-44', '45+') - )] - } else if (cat_varname == 'yage4') { - age_groups <- data.table( - chi_age = 0:17 - ) - age_groups[, paste0(cat_prefix, "_group") := cut( - chi_age, - breaks = c(-1, 4, 9, 14, 17), - labels = c('0-4', '5-9', '10-14', '15-17') - )] - } - - # Add category info - age_groups[, (cat_prefix) := "Age"] - age_groups[, paste0(cat_prefix, "_varname") := cat_varname] - - # Combine demographic dimensions with age groups - cols_to_select <- c("year", other_cat_prefix, - paste0(other_cat_prefix, "_varname"), - paste0(other_cat_prefix, "_group")) - - unique_pop_data <- unique(population_data[, cols_to_select, with = FALSE][, mykey := 1]) - - complete_demographics <- unique_pop_data[age_groups[, mykey := 1], on = "mykey", allow.cartesian = TRUE] - complete_demographics[, mykey := NULL] - - return(complete_demographics) 
- } - - # Use function to handle age group processing - if (population_data[1]$cat1 == "Age") { - complete_demographics <- process_age_category(population_data, 1) - } - - if (population_data[1]$cat2 == "Age") { - complete_demographics <- process_age_category(population_data, 2) - } - - if (population_data[1]$cat1 != "Age" & population_data[1]$cat2 != "Age"){ - # Get unique cat1 groups - cat1_groups <- unique(population_data[, list(cat1, cat1_varname, cat1_group, mykey = 1)]) - - # Get unique cat2 groups - cat2_groups <- unique(population_data[, list(cat2, cat2_varname, cat2_group, mykey = 1)]) - - # All combos of cat1 and cat2 groups - complete_demographics <- cat1_groups[cat2_groups, on = "mykey", allow.cartesian = TRUE] - - # Create year and age combos - year_age <- data.table(year = as.character(current_row$year), chi_age = 0:100, mykey = 1) - - # Get combos for each year/age cat1/cat2 combo - complete_demographics <- complete_demographics[year_age, on = "mykey", allow.cartesian = TRUE] + , fill = TRUE) + }) + } - # Drop key - complete_demographics[, mykey := NULL] + # STEP 4: Process each template row ---- + message("\U023F3 Allocating the population data for each row in pop.template. 
Be patient!") + + # Convert years to to integers for comparisons + data.table::set(all_population_data, j = "year", value = as.integer(all_population_data[["year"]])) + if(is.character(pop.template$start)) pop.template[, start := as.integer(start)] + if(is.character(pop.template$stop)) pop.template[, stop := as.integer(stop)] + + # If futures are NOT used: run sequentially ---- + if (inherits(future::plan(), "sequential")) { + message("Processing rows sequentially...") + final_population_data <- data.table::rbindlist( + lapply( + X = seq_len(nrow(pop.template)), + FUN = function(row_index) { + message(paste0("Processing row ", row_index, " of ", nrow(pop.template))) + + # Extract data for this row directly + current_row <- pop.template[row_index, ] + filtered_population <- all_population_data[ + batched_id == current_row$batched_id & + year >= current_row$start & + year <= current_row$stop + ] + + # Process without temp files + process_template_row(row_index, filtered_population, pop.template) } + ) + ) + } + # If futures ARE used: run in parallel ---- + if (!inherits(future::plan(), "sequential")) { + # Allocate some extra column slots to prevent data.table errors during non-equi joins + invisible(data.table::alloc.col(all_population_data, 5)) + invisible(data.table::alloc.col(pop.template, 5)) + + # Add row index to pop.template for identifying which rows to process + pop.template[, row_index := .I] + + # Use a non-equi join to match all_population_data rows with pop.template rows + # based on batched_id and proper year range + population_subsets <- all_population_data[ + pop.template[, .(batched_id, start, stop, row_index)], + on = .(batched_id == batched_id, year >= start, year <= stop), + nomatch = 0 # Drop rows that don't match + ] + + # Remove any year columns from the population data (was only needed for the join) + population_subsets[, grep('year', names(population_subsets), value = T) := NULL] + + # Create temporary directory to store filtered data files 
+ # This allows each future worker to load only what it needs - + # avoids the persistent memory issues of sending all the pop data to each future + temp_dir <- file.path(tempdir(), "filtered_data") + dir.create(temp_dir, showWarnings = FALSE, recursive = TRUE) + + # Save each subset of data as a separate file in the temporary directory + message("\U023F3 Saving filtered population data to temporary files...") + for(i in unique(pop.template$row_index)) { + # Extract and save just the data needed for this template row + + qs::qsave(population_subsets[row_index == i], # faster alternative to saveRDS + file = file.path(temp_dir, paste0("row_", i, ".qs")), + preset = "fast") + + if(i %% 5 == 0) {message('Saved ', i, ' of ', max(pop.template$row_index), ' population tables')} + } + # Free up memory by removing the population data that is no longer needed + rm(list = c('population_subsets', 'all_population_data')) + gc() - # Merge population data with complete demographics grid ---- - population_data <- suppressWarnings(merge(population_data, complete_demographics, all = TRUE)) - - population_data[is.na(pop), pop := 0] # Fill missing population values with zero - - # Add tab column and finalize ---- - population_data[, tab := current_row$tab] - - # Convert placeholder "NA" strings back to true NA values ---- - population_data[cat2 == "NA", c("cat2", "cat2_varname", "cat2_group") := NA] - - # Return completed population dataset ---- - return(population_data) - } - - # Process all template rows in parallel ---- - # Combine results from all template rows using future_lapply for parallel processing - progressr::handlers(handler_progress()) + # Process each row of pop.template in parallel + message("\U023F3 Producing a combined population table with standard CHI columns") + progressr::handlers(progressr::handler_progress()) progressr::with_progress({ p <- progressr::progressor(nrow(pop.template)) - all_population_data <- rbindlist( - future_lapply( - X = as.list(seq(1, 
nrow(pop.template))), - FUN = function(row_index) { - p(paste0("Processing row ", row_index, " of ", nrow(pop.template) )) - set.seed(98104) # Set consistent seed for reproducibility - get_population_for_template_row(row_index, pop.template = pop.template) - }, - future.seed = 98104 # Set seed for parallel processes - ) + + final_population_data <- data.table::rbindlist( + future.apply::future_lapply( + X = seq_len(nrow(pop.template)), + FUN = function(row_index) { + # Update progress + p(paste0("Post-processing row ", row_index, " of ", nrow(pop.template))) + + # Load only the data needed for this specific row (faster than readRDS) + filtered_population <- qs::qread(file.path(temp_dir, paste0("row_", row_index, ".qs"))) + + # Process the data using the template row + result <- process_template_row( + row_index = row_index, + population_data = filtered_population, + pop.template = pop.template) + + return(result) + }, + future.seed = 98104 # Ensure reproducible results across parallel processes ) - }) # closet progressr::with_progress() + ) + }) + + # Clean up temporary files + message("Cleaning up temporary files...") + unlink(temp_dir, recursive = TRUE) + } - # Remove duplicate rows in final dataset - all_population_data <- unique(all_population_data) + # STEP 5: Finalize and return result ---- + # Remove duplicate rows in final dataset + final_population_data <- unique(final_population_data) - # Return final population dataset ---- - return(all_population_data) + # Return final population dataset + return(final_population_data) } diff --git a/R/chi_get_proper_pop_helpers.r b/R/chi_get_proper_pop_helpers.r new file mode 100644 index 0000000..ef51e45 --- /dev/null +++ b/R/chi_get_proper_pop_helpers.r @@ -0,0 +1,691 @@ +######################################### ---- +# HELPER FUNCTIONS for chi_get_proper_pop ---- +######################################### ---- +# Input Validation: validate_inputs() ---- +#' Input Validation: validate_inputs() ---- +#' @noRd + 
validate_inputs <- function(pop.template, pop.genders, pop.ages, is_chars) { + # pop.template + if (is.null(pop.template)) { + stop("\n\U1F6D1 pop.template parameter is required") + } + + if (!is.data.frame(pop.template)) { + stop("\n\U1F6D1 pop.template must be a data.frame or data.table") + } else { + pop.template <- data.table::setDT(data.table::copy(pop.template)) + } + + # Check for required columns + required_columns <- c("year", "cat1", "cat1_varname", "cat2", "cat2_varname", + "start", "stop", "race_type", "geo_type", "tab") + missing_columns <- required_columns[!required_columns %in% names(pop.template)] + + if (length(missing_columns) > 0) { + stop("\n\U1F6D1 pop.template is missing required columns: ", + paste(missing_columns, collapse = ", ")) + } + + # pop.genders + if (is.null(pop.genders)) { + gender_values <- c("f", "m") + } else { + if (!tolower(pop.genders) %in% c('f', 'female', 'm', 'male')) { + stop("\n\U0001f47f if pop.genders is specified, it is limited to one of the following values: 'F', 'f', 'Female', 'female', 'M', 'm', 'Male', or 'male'") + } else { + gender_values <- pop.genders + } + } + + # pop.ages + if (is.null(pop.ages)) { + age_values <- c(0:100) + } else { + if (!is.integer(pop.ages)) { + stop("\n\U0001f47f if pop.ages is specified it must be vector of integers, e.g., c(0:65)") + } else { + age_values <- pop.ages + } + } + + # is_chars + if(!is.logical(is_chars)) { + stop("\n\U0001f47f The `is_chars` argument must be a logical, i.e., TRUE | FALSE") + } + if(length(is_chars) != 1) { + stop("\n\U0001f47f The `is_chars` argument must be of length 1") + } + + return(list( + pop.template = pop.template, + gender_values = gender_values, + age_values = age_values, + is_chars = is_chars + )) + } + +# Standardize Category Names: standardize_category_names() ---- +#' Standardize Category Names: standardize_category_names() ---- +#' @noRd + standardize_category_names <- function(pop.template) { + # Remove birthing person prefixes to 
standardize maternal data categories + pop.template[grepl("birthing person", cat1, ignore.case = TRUE), + cat1 := tools::toTitleCase(gsub("Birthing person's ", "", cat1))] + pop.template[grepl("birthing person", cat2, ignore.case = TRUE), + cat2 := tools::toTitleCase(gsub("Birthing person's ", "", cat2))] + + return(pop.template) + } + +# Create Query Keys for Consolidation: create_query_keys() ---- +#' Create Query Keys for Consolidation: create_query_keys() ---- +#' @noRd + create_query_keys <- function(pop.template) { + # Create a key to identify unique population query patterns + pop.template[, query_key := paste( + cat1, cat1_varname, cat2, cat2_varname, + race_type, geo_type, + ifelse(is.na(group_by1), "NA", group_by1), + ifelse(is.na(group_by2), "NA", group_by2), + sep = "||" + )] + + data.table::setorder(pop.template, query_key) # for consistent representative_row_index + + return(pop.template) + } + +# Batch Similar Queries: batch_population_queries() ---- +#' Batch Similar Queries: batch_population_queries() ---- +#' @noRd + batch_population_queries <- function(pop.template) { + # For each unique query key, find the min start year and max stop year + batched_queries <- pop.template[, list( + min_start = min(start), + max_stop = max(stop), + representative_row_index = .I[1] # Keep representative row per pattern + ), by = query_key] + + # Add a unique ID for each batched query + batched_queries[, batched_id := .I] + + # Join back to original template + pop.template <- merge( + pop.template, + batched_queries[, list(query_key, batched_id, min_start, max_stop)], + by = "query_key" + ) + + return(list( + pop.template = pop.template, + batched_queries = batched_queries + )) + } + +# Get Population Data for a batched Query: get_batched_population() ---- +#' Get Population Data for a batched Query: get_batched_population() ---- +#' @noRd + get_batched_population <- function(query_id, + pop.template, + batched_queries, + gender_values, + age_values, + is_chars) {
# Get the batched query info + current_query <- batched_queries[batched_id == query_id] + + # Get a representative row from the original template + template_row <- pop.template[current_query$representative_row_index] + + # Show progress + message(paste0("Process ID ", Sys.getpid(), ": get_population call ", + query_id, " (covers ", sum(pop.template$batched_id == query_id), + " original queries: ", current_query$query_key, ")")) + + # Build grouping parameters for population query + grouping_vars <- unique(c( + c("ages", 'years', "geo_id"), + setdiff(c(template_row$group_by1, template_row$group_by2), c(NA)) + )) + + # Call get_population with the batched year range + if (is_chars && template_row$geo_type == 'kc') { + # For CHARS data, use ZIP aggregation method for King County + population_data <- rads::get_population( + kingco = FALSE, # intentionally FALSE to retrieve ALL ZIP codes, which we'll filter later + group_by = grouping_vars, + geo_type = 'zip', # note this is purposefully not 'kc' + race_type = template_row$race_type, + years = current_query$min_start:current_query$max_stop, + genders = gender_values, + ages = age_values, + round = FALSE + ) + + # Filter to Define King County as ZIPs that begin with 980/981 + population_data <- population_data[grepl('^980|^981', geo_id)] + + # Sum the population across all ZIP codes while maintaining other dimensions + group_cols <- setdiff(names(population_data), c("geo_id", "pop")) + population_data <- population_data[, list(pop = sum(pop)), by = group_cols] + + # Add King County identifier for consistent processing + population_data[, geo_type := 'kc'] + population_data[, geo_id := 'King County'] + } else if (is.na(template_row$geo_type)) { + population_data <- rads::get_population( + group_by = grouping_vars, + race_type = template_row$race_type, + years = current_query$min_start:current_query$max_stop, + genders = gender_values, + ages = age_values, + round = FALSE + ) + } else { + population_data <- 
rads::get_population( + group_by = grouping_vars, + geo_type = template_row$geo_type, + race_type = template_row$race_type, + years = current_query$min_start:current_query$max_stop, + genders = gender_values, + ages = age_values, + round = FALSE + ) + } + + # Add batched_id to population data for later processing + population_data[, batched_id := query_id] + + return(population_data) + } + +# Process Age Group Patterns: process_age_patterns() ---- +#' Process Age Group Patterns: process_age_patterns() ---- +#' @noRd + process_age_patterns <- function(age_var) { + # Check if this age variable exists in misc_chi_byvars + chi_age_groups <- rads.data::misc_chi_byvars[cat == "Age" & varname == age_var] + + if (nrow(chi_age_groups) == 0) { + stop(paste0("\n\U1F6D1 Age variable ", age_var, " not found in rads.data::misc_chi_byvars[cat == \"Age\"]")) + } + + # Create a result table with age ranges + age_ranges <- data.table(group_value = character(), min_age = numeric(), max_age = numeric()) + + # Process each age group + for (g in unique(chi_age_groups$group)) { + # Pattern: "<#" (e.g., "<18") + if (grepl("^<\\d+$", g)) { + max_age <- as.numeric(gsub("<", "", g)) - 1 + age_ranges <- rbind(age_ranges, data.table(group_value = g, min_age = 0, max_age = max_age)) + } + # Pattern: "#-#" (e.g., "18-24") + else if (grepl("^\\d+-\\d+$", g)) { + range_parts <- as.numeric(strsplit(g, "-")[[1]]) + min_age <- range_parts[1] + max_age <- range_parts[2] + age_ranges <- rbind(age_ranges, data.table(group_value = g, min_age = min_age, max_age = max_age)) + } + # Pattern: "#+", (e.g., "75+") + else if (grepl("^\\d+\\+$", g)) { + min_age <- as.numeric(gsub("\\+", "", g)) + age_ranges <- rbind(age_ranges, data.table(group_value = g, min_age = min_age, max_age = Inf)) + } + # Other valid groups like "All", "Adults", "Children", "Seniors" + else if (g %in% c("All", "Adults", "Children", "Seniors")) { + if (g == "All") { + age_ranges <- rbind(age_ranges, data.table(group_value = g, min_age = 
0, max_age = Inf)) + } else if (g == "Adults") { + age_ranges <- rbind(age_ranges, data.table(group_value = g, min_age = 18, max_age = Inf)) + } else if (g == "Children") { + age_ranges <- rbind(age_ranges, data.table(group_value = g, min_age = 0, max_age = 17)) + } else if (g == "Seniors") { + age_ranges <- rbind(age_ranges, data.table(group_value = g, min_age = 65, max_age = Inf)) + } + } + # Error for unrecognized patterns + else { + stop(paste0("\n\U1F6D1 Age group ", g, " in age_var = ", age_var, + " does not follow the expected pattern (<#, #-#, or #+) and cannot be used", + "\n Valid options are found in rads.data::misc_chi_byvars[cat == \"Age\"]")) + } + } + + return(age_ranges) + } + +# Assign Race/Ethnicity Categories: assign_race_ethnicity() ---- +#' Assign Race/Ethnicity Categories: assign_race_ethnicity() ---- +#' @noRd + assign_race_ethnicity <- function(population_data, catnum, catvarname, catgroup, race_type) { + # Standardize race_eth into race + if('race' %in% names(population_data)) { + population_data[, race := as.character(race)] + } + + if('race_eth' %in% names(population_data)) { + population_data[, race_eth := as.character(race_eth)] + } + + if('race' %in% names(population_data) & 'race_eth' %in% names(population_data)) { + population_data[is.na(race) | race == 'NA', race := race_eth] + } + + if(!'race' %in% names(population_data) & 'race_eth' %in% names(population_data)) { + data.table::setnames(population_data, 'race_eth', 'race') + } + + # Process specific race/ethnicity groups + population_data[get(catvarname) %in% c('race4', 'race3'), (catgroup) := race] + + # Filter out Ethnicity unless the group == 'Hispanic' + population_data <- population_data[get(catnum) != "Ethnicity" | + (get(catnum) == "Ethnicity" & + get(catgroup) == 'Hispanic'), ] + + # Standardize "Multiple" label + population_data[get(catgroup) == "Multiple race", (catgroup) := "Multiple"] + + # Process race_aic (alone or in combination) categories + if (race_type == 
'race_aic') { + # Filter to keep only relevant race_aic combinations + population_data <- population_data[ + !(grepl('_aic_', get(catvarname)) & + !((get(catvarname) == 'chi_race_aic_aian' & race_aic == 'AIAN') | + (get(catvarname) == 'chi_race_aic_asian' & race_aic == 'Asian') | + (get(catvarname) == 'chi_race_aic_black' & race_aic == 'Black') | + (get(catvarname) == 'chi_race_aic_his' & race_aic == 'Hispanic') | + (get(catvarname) == 'chi_race_aic_nhpi' & race_aic == 'NHPI') | + (get(catvarname) == 'chi_race_aic_wht' & race_aic == 'White')) + ) + ] + + # Assign race_aic value to group + population_data[grep('_aic', get(catvarname)), (catgroup) := race_aic] + } + + return(population_data) + } + +# Assign Age Groupings using misc_chi_byvars:assign_age_groupings() ---- +#' Assign Age Groupings using misc_chi_byvars:assign_age_groupings() ---- +#' @noRd + assign_age_groupings <- function(population_data, catnum, catvarname, catgroup) { + # Check if this is an age category + if (population_data[1, get(catnum)] == "Age") { + # Get the age variable name + age_var <- population_data[1, get(catvarname)] + + # Get age ranges + age_ranges <- process_age_patterns(age_var) + + # Assign each age range to the population data + for (i in 1:nrow(age_ranges)) { + group_value <- age_ranges[i, group_value] + min_age <- age_ranges[i, min_age] + max_age <- age_ranges[i, max_age] + + if (is.infinite(max_age)) { + population_data[get(catvarname) == age_var & age >= min_age, (catgroup) := group_value] + } else { + population_data[get(catvarname) == age_var & age >= min_age & age <= max_age, + (catgroup) := group_value] + } + } + } + + return(population_data) + } + +# Assign Geographic Crosswalks: assign_geographic_crosswalks() ---- +#' Assign Geographic Crosswalks: assign_geographic_crosswalks() ---- +#' @noRd + assign_geographic_crosswalks <- function(population_data, catnum, catgroup, catvarname, geo_type) { + # Process HRAs + if (geo_type == 'blk' && population_data[1, get(catnum)] == 
'Cities/neighborhoods') { + hra_crosswalk <- rads.data::spatial_block20_to_hra20_to_region20[, + list(geo_id = GEOID20, + hra20_name)] + + population_data <- merge(population_data, + hra_crosswalk, + by = "geo_id", + all.x = TRUE, + all.y = FALSE) + + population_data[, (catgroup) := hra20_name] + } + + # Process Region crosswalks + # Block to Region + if (geo_type == 'blk' && population_data[1, get(catnum)] == 'Regions') { + region_crosswalk <- rads.data::spatial_block20_to_hra20_to_region20[,list(geo_id = GEOID20, region_name)] + + population_data <- merge(population_data, + region_crosswalk, + by = 'geo_id', + all.x = TRUE, + all.y = FALSE) + + population_data[, (catgroup) := region_name] + } + + # HRA to Region + if (geo_type == 'hra' && population_data[1, get(catnum)] == 'Regions') { + region_crosswalk <- rads.data::spatial_hra20_to_region20[, list(geo_id = hra20_name, region_name)] + + population_data <- merge(population_data, + region_crosswalk, + by = 'geo_id', + all.x = TRUE, + all.y = FALSE) + + population_data[, (catgroup) := region_name] + } + + # ZIP to Region with population weighting + if (geo_type == 'zip' && population_data[1, get(catnum)] == 'Regions') { + # Create ZIP to region crosswalk with population weights + zip_region_crosswalk <- rads.data::spatial_zip_to_hra20_pop + + zip_region_crosswalk <- merge(zip_region_crosswalk, + rads.data::spatial_hra20_to_region20[, list(hra20_name, region = region_name)], + by = 'hra20_name', + all = TRUE) + + # Aggregate fractional populations to region level + zip_region_crosswalk <- zip_region_crosswalk[, + list(s2t_fraction = sum(s2t_fraction)), + list(geo_id = as.character(source_id), region)] + + # Assign population weighting by region + population_data <- merge(population_data, + zip_region_crosswalk, + by = "geo_id", + all.x = TRUE, + all.y = FALSE, + allow.cartesian = TRUE) + population_data[, pop := pop * s2t_fraction] # Assign weight to population + population_data[, (catgroup) := region] + } + + # 
Process Big Cities + if (population_data[1, get(catnum)] == 'Big cities') { + # Block to big city crosswalk + if (geo_type == 'blk') { + # Two-step crosswalk: block to HRA to big city + block_to_hra <- rads.data::spatial_block20_to_hra20_to_region20[, list(geo_id = GEOID20, hra20_name)] + hra_to_bigcity <- rads.data::spatial_hra20_to_bigcities[, list(hra20_name, bigcity)] + + block_to_bigcity <- merge(hra_to_bigcity, + block_to_hra, + by = 'hra20_name', + all.x = TRUE, + all.y = FALSE)[, hra20_name := NULL] + + population_data <- merge(population_data, + block_to_bigcity, + by = "geo_id", + all.x = TRUE, + all.y = FALSE) + } + + # HRA to big city crosswalk + if (geo_type == 'hra') { + hra_to_bigcity <- rads.data::spatial_hra20_to_bigcities[, list(hra20_name, bigcity)] + population_data <- merge(population_data, + hra_to_bigcity, + by.x = 'geo_id', + by.y = 'hra20_name', + all.x = TRUE, + all.y = FALSE) + } + + # Assign big city name to group + population_data[, (catgroup) := bigcity] + } + + # Process poverty groupings + # Block level poverty + if (geo_type == 'blk' && population_data[1, get(catvarname)] == 'pov200grp') { + # Extract tract ID from block ID (first 11 characters) + population_data[, geo_tract2020 := substr(geo_id, 1, 11)] + + # Join poverty group data + population_data <- merge( + population_data, + rads.data::misc_poverty_groups[geo_type == 'Tract'][, list(geo_tract2020 = geo_id, pov200grp)], + by = "geo_tract2020", + all.x = TRUE, + all.y = FALSE + ) + + # Assign poverty group + population_data[, (catgroup) := pov200grp] + } + + # ZIP level poverty + if (geo_type == 'zip' && population_data[1, get(catvarname)] == 'pov200grp') { + # Join poverty group data + population_data <- merge( + population_data, + rads.data::misc_poverty_groups[geo_type == 'ZCTA'][,list(geo_id, pov200grp)], + by = 'geo_id', + all.x = TRUE, + all.y = FALSE + ) + + # Assign poverty group + population_data[, (catgroup) := pov200grp] + } + + return(population_data) + } + +# 
Create demographic shell for population analysis: create_demographic_shell() ---- +#' Create demographic shell for population analysis: create_demographic_shell() ---- +#' @noRd + create_demographic_shell <- function(population_data, template_row) { + # Function to handle age group processing + process_age_category <- function(population_data, cat_num) { + # Define prefix and complementary prefix + cat_prefix <- paste0("cat", cat_num) + other_cat_prefix <- if(cat_num == 1) "cat2" else "cat1" + + # Get the variable name for this category + cat_varname <- population_data[1][[paste0(cat_prefix, "_varname")]] + + # Create a data table with all possible ages + age_groups <- data.table(chi_age = 0:100) + + # Get age ranges + age_ranges <- process_age_patterns(cat_varname) + + # Assign each age range to the age groups table + for (i in 1:nrow(age_ranges)) { + group_value <- age_ranges[i, group_value] + min_age <- age_ranges[i, min_age] + max_age <- age_ranges[i, max_age] + + if (is.infinite(max_age)) { + age_groups[chi_age >= min_age, paste0(cat_prefix, "_group") := group_value] + } else { + age_groups[chi_age >= min_age & chi_age <= max_age, + paste0(cat_prefix, "_group") := group_value] + } + } + + # Filter out ages without assigned groups + age_groups <- age_groups[!is.na(get(paste0(cat_prefix, "_group")))] + + # Add category info + age_groups[, (cat_prefix) := "Age"] + age_groups[, paste0(cat_prefix, "_varname") := cat_varname] + + # Combine demographic dimensions with age groups + cols_to_select <- c("year", other_cat_prefix, + paste0(other_cat_prefix, "_varname"), + paste0(other_cat_prefix, "_group")) + + unique_pop_data <- unique(population_data[, cols_to_select, with = FALSE][, mykey := 1]) + + complete_demographics <- merge(unique_pop_data, + age_groups[, mykey := 1], + by = 'mykey', + allow.cartesian = TRUE) + + complete_demographics[, mykey := NULL] + + return(complete_demographics) + } + + # Use function to handle age group processing + if 
(population_data[1]$cat1 == "Age") { + complete_demographics <- process_age_category(population_data, 1) + } else if (population_data[1]$cat2 == "Age") { + complete_demographics <- process_age_category(population_data, 2) + } else { + # Get unique cat1 groups + cat1_groups <- unique(population_data[, list(cat1, cat1_varname, cat1_group, mykey = 1)]) + + # Get unique cat2 groups + cat2_groups <- unique(population_data[, list(cat2, cat2_varname, cat2_group, mykey = 1)]) + + # All combos of cat1 and cat2 groups + complete_demographics <- merge(cat1_groups, + cat2_groups, + by = 'mykey', + allow.cartesian = TRUE) + + # Create year and age combos + year_age <- data.table(year = as.character(template_row$year), + chi_age = 0:100, + mykey = 1) + + # Get combos for each year/age cat1/cat2 combo + complete_demographics <- merge(complete_demographics, + year_age, + by = 'mykey', + allow.cartesian = TRUE) + + # Drop key + complete_demographics[, mykey := NULL] + } + + return(complete_demographics) + } + +# Process a single template row: process_template_row() ---- +#' Process a single template row: process_template_row() ---- +#' @noRd + process_template_row <- function(row_index, population_data, pop.template) { + # Basic subsetting tidying ---- + current_row <- pop.template[row_index, ] + + # Set the correct year format + population_data[, year := current_row$year] + + # Process demographic categories ---- + for (catnum in c("cat1", "cat2")) { + # Define variable names + catvarname <- paste0(catnum, '_varname') + catgroup <- paste0(catnum, '_group') + temp.groupby <- paste0("group_by", gsub('cat', '', catnum)) + + # Set basic category info from template + population_data[, (catnum) := current_row[[catnum]]] + population_data[, (catvarname) := current_row[[catvarname]]] + population_data[, (catgroup) := current_row[[temp.groupby]]] + + # Process standard geographic categories + # King County + population_data[get(catnum) == "King County", (catgroup) := "King County"] + + # 
Washington State + population_data[get(catnum) == "Washington State", (catgroup) := "Washington State"] + + # Handle NA values + suppressWarnings( + population_data[get(catnum) == "NA" | is.na(get(catnum)), + c(catnum, catgroup, catvarname) := "NA"] + ) + + # Cities/neighborhoods and Regions + population_data[get(catnum) %in% c("Cities/neighborhoods", "Regions") & + current_row$geo_type != 'blk', (catgroup) := geo_id] + + # Process gender + population_data[get(catnum) %in% c("Gender"), (catgroup) := gender] + + # Process 'Overall' + population_data[get(catnum) %in% c("Overall"), (catgroup) := "Overall"] + + # Process race/ethnicity categories + if (!is.na(population_data[1, get(catnum)]) && + (population_data[1, get(catnum)] %in% c("Race", "Race/ethnicity", "Race/Ethnicity", "Ethnicity") || + (current_row$race_type == 'race_aic' && grepl('_aic_', get(catvarname))))) { + population_data <- assign_race_ethnicity(population_data, + catnum, + catvarname, + catgroup, + current_row$race_type) + } + + # Assign geographic crosswalks + if (!is.na(population_data[1, get(catnum)]) && + (population_data[1, get(catnum)] %in% c("Cities/neighborhoods", "Regions", "Big cities") || + population_data[1, get(catvarname)] == 'pov200grp')) { # pov200grp is neighborhood poverty, so geography based + population_data <- assign_geographic_crosswalks(population_data, + catnum, + catgroup, + catvarname, + current_row$geo_type) + } + + # Assign age groupings + if (!is.na(population_data[1, get(catnum)]) && population_data[1, get(catnum)] == "Age") { + population_data <- assign_age_groupings(population_data, + catnum, + catvarname, + catgroup) + } + + } + + # Filter and clean results ---- + # Remove rows with missing primary category group + population_data <- population_data[!is.na(cat1_group)] + + # Remove rows with missing secondary category group (when category exists) + population_data <- population_data[is.na(cat2) | cat2 == 'NA' | + (!is.na(cat2) & cat2 != 'NA' & + !is.na(cat2_group) & 
cat2_group != 'NA'),] + + # Aggregate population data + population_data <- population_data[, list(pop = sum(pop)), + list(chi_age = age, + year, + cat1, cat1_varname, cat1_group, + cat2, cat2_varname, cat2_group)] + + # Generate complete demographic combinations + complete_demographics <- create_demographic_shell(population_data, current_row) + + # Merge population data with complete demographics grid + population_data <- suppressWarnings(merge(population_data, + complete_demographics, + all = TRUE)) + + # Fill missing population values with zero + population_data[is.na(pop), pop := 0] + + # Add tab column + population_data[, tab := current_row$tab] + + # Convert placeholder "NA" strings back to true NA values + population_data[cat2 == "NA", c("cat2", "cat2_varname", "cat2_group") := NA] + + return(population_data) + } + diff --git a/R/chi_suppress_results.R b/R/chi_suppress_results.R new file mode 100644 index 0000000..e8815c0 --- /dev/null +++ b/R/chi_suppress_results.R @@ -0,0 +1,304 @@ +#' Suppress data due to small numbers and create a caution flag due to reliability +#' +#' @description +#' This function applies primary data suppression based on the numerator and +#' denominator and secondary suppression based on the numerator. It also flags +#' low reliability. Suppressed data are noted by \code{suppression = '^'} and +#' unreliable data are noted by \code{caution = '!'}. 
+#' +#' @details +#' Data source specific suppression ranges can be found in +#' \href{https://kc1.sharepoint.com/teams/DPH-APDEData/Shared\%20Documents/General/data_presentation_algorithm/APDE_SmallNumberUpdate.xlsx}{APDE_SmallNumberUpdate.xlsx} +#' +#' +#' Please review the +#' \href{https://kc1.sharepoint.com/teams/DPH-APDEData/Shared\%20Documents/General/data_presentation_algorithm/APDEDataPresentationAlgorithm_2020_Approved.pptx}{APDE} +#' and \href{https://www.doh.wa.gov/Portals/1/Documents/1500/SmallNumbers.pdf}{DOH} +#' suppression guidelines for details on the logic used in this function. +#' +#' This function expects data that has already been formatted for CHI. However, +#' the user can modify the default parameter settings for use with most tables of +#' summary data. +#' +#' @param ph.data A data.table or data.frame. Must contain the data to be suppressed +#' with standard metric names, +#' i.e., mean, median, sum, rate, lower, upper, se, rse, numerator, denominator, +#' proportion +#' @param suppress_range Integer vector of length 2. They specify the minimum and +#' maximum range for suppression. +#' @param secondary Logical (\code{T}, \code{TRUE}, \code{F}, or \code{FALSE}) +#' indicating whether secondary suppression should be run +#' @param secondary_ids Character vector of column names which are used to define +#' groups for secondary suppression. +#' Note, this should not include the most granular level. For example, if you wanted +#' secondary suppression for race/ethnicity within HRAs +#' where \code{cat1_varname == 'race4'} and \code{cat1_group == 'AIAN'}, +#' \code{cat1_group == 'Asian'}, \code{cat1_group == 'Black'}, etc., you should +#' have \code{secondary_ids = c('hra20_name', 'cat1_varname')} rather than +#' \code{secondary_ids = c('hra20_name', 'cat1_varname', 'cat1_group')} +#' @param secondary_exclude An unquoted expression that evaluates to a logical vector +#' indicating which rows to include in secondary suppression analysis. 
Use this parameter +#' to exclude categories that are not mutually exclusive (e.g., overlapping demographic +#' groups). The expression should use column names from the dataset and evaluate to TRUE +#' for rows to include. For example: `secondary_exclude = cat1_varname != "race3"` would +#' exclude all rows where cat1_varname equals "race3" from secondary suppression. +#' @param flag_only Logical (\code{T}, \code{TRUE}, \code{F}, or \code{FALSE}) +#' indicating whether data to be +#' suppressed should be flagged without setting estimates to NA +#' @param numerator_col Character string with the name of the numerator column. Default is "numerator". +#' @param denominator_col Character string with the name of the denominator column. Default is "denominator". +#' @param rse_col Character string with the name of the relative standard error column. Default is "rse". +#' @param columns_to_suppress Character vector of column names to be suppressed (set to NA) when +#' suppression is flagged. Default includes common result columns. +#' +#' @return A data.table with suppression applied to CHI standard columns. 
+#' +#' @export +#' +#' @keywords suppression +#' +#' @importFrom data.table ':=' data.table is.data.table setDT fsetdiff setorder setorderv copy +#' +#' @examples +#' \dontrun{ +#' set.seed(98104) +#' dt <- data.table::data.table( +#' chi_year = rep(2018, 100), +#' result = rnorm(100, .25, .05), +#' numerator = round(rnorm(100, 20, 9), 0), +#' denominator = round(rnorm(100, 100, 30), 0), +#' rse = sample(1:100, 100, replace = TRUE) +#' ) +#' +#' # Basic suppression with default parameters +#' newdt <- chi_suppress_results(ph.data = dt, +#' suppress_range = c(1, 9), +#' secondary = FALSE) +#' +#' nrow(dt[numerator %in% 1:9 | denominator %in% 1:9]) # rows needed suppression +#' nrow(newdt[suppression=='^']) # rows suppressed +#' nrow(newdt[rse >= 30 | numerator == 0]) # rows needing caution +#' nrow(newdt[caution=='!']) # rows with caution +#' +#' # With secondary suppression +#' dt$region <- sample(c("North", "South", "East", "Seattle"), 100, replace = TRUE) +#' dt$category <- sample(c("A", "B", "C"), 100, replace = TRUE) +#' +#' newdt2 <- chi_suppress_results(ph.data = dt, +#' suppress_range = c(1, 9), +#' secondary = TRUE, +#' secondary_ids = c("region", "category")) +#' +#' nrow(newdt[suppression=='^']) # only primary suppression +#' nrow(newdt2[suppression=='^']) # with secondary suppression +#' +#' # Using custom column names +#' dt2 <- data.table::data.table( +#' chi_year = rep(2018, 100), +#' mean = rnorm(100, .25, .05), +#' num = round(rnorm(100, 20, 9), 0), +#' denom = round(rnorm(100, 100, 30), 0), +#' rel_se = sample(1:100, 100, replace = TRUE) +#' ) +#' +#' newdt3 <- chi_suppress_results(ph.data = dt2, +#' suppress_range = c(1, 9), +#' numerator_col = "num", +#' denominator_col = "denom", +#' rse_col = "rel_se", +#' columns_to_suppress = c("mean", "num", "denom")) +#' +#' nrow(dt2[num %in% 1:9 | denom %in% 1:9]) # rows need suppression +#' nrow(newdt3[suppression == '^']) # rows suppressed +#' } +#' + +chi_suppress_results <- function(ph.data = NULL, + 
suppress_range = c(1, 9), + secondary = FALSE, + secondary_ids = c("tab", "indicator_key", "cat1", "cat2_group", "year"), + secondary_exclude, + flag_only = FALSE, + numerator_col = "numerator", + denominator_col = "denominator", + rse_col = "rse", + columns_to_suppress = c("result", "lower_bound", "upper_bound", "se", "rse", + "numerator", "denominator")){ + + ## Global variables used by data.table declared as NULL here to play nice with devtools::check() + numerator <- denominator <- suppression <- my.group <- my.order <- my.rowct <- + suppressed.group <- my.flag <- rse <- caution <- rows.unsuppressed <- + result <- lower_bound <- upper_bound <- se <- NULL + + # ---- Validate 'ph.data' ---- + if(is.null(ph.data)){ + stop("\n\U1F6D1 You must specify a dataset (i.e., 'ph.data' must be defined)") + } + + if(!data.table::is.data.table(ph.data)){ + if(is.data.frame(ph.data)){ + data.table::setDT(ph.data) + } else { + stop(paste0("\n\U1F6D1 <{ph.data}> must be the name of a data.frame or data.table.")) + } + } + + # ---- Validate 'suppress_range' ---- + if(is.null(suppress_range)){ + suppress_range <- c(0, 9) + } + + if(!is.null(suppress_range) & + (length(suppress_range) != 2 | suppress_range[1] %% 1 != 0 | suppress_range[2] %% 1 != 0 | + suppress_range[1] < 0 | suppress_range[2] < 0)){ + stop("\n\U1F6D1 must be comprised of two non-negative integers (i.e., 'c(0, 9)'") + } + + # ---- Validate 'secondary' ---- + if(!is.logical(secondary)){ + stop("\n\U1F6D1 'secondary' must be specified as a logical (i.e., TRUE, T, FALSE, or F)") + } + + # ---- Validate 'secondary_ids' ---- + if(secondary == TRUE & length(setdiff(secondary_ids, names(ph.data))) > 0){ + stop("\n\U1F6D1 At least one name in 'secondary_ids' is not found among the column names in 'ph.data'") + } + + # ---- Validate 'secondary_exclude' ---- + if(!missing(secondary_exclude)){ + call = match.call() + + if(is.character(call[['secondary_exclude']])){ + where = str2lang(call[['secondary_exclude']]) + 
warning('\u26A0\ufe0f `secondary_exclude` is a string. It was converted so that it would work, but in the future, this might turn into an error. + In the future, please pass unquoted commands that will resolve to a logical') + } else { + where = copy(call[['secondary_exclude']]) + } + + e <- substitute(expr = where) # get parse tree expression `where` + myfilter <- eval(expr = e, envir = ph.data, enclos = parent.frame()) # evaluate + stopifnot('`where` does not resolve to a logical' = is.logical(myfilter)) + if(nrow(ph.data[myfilter,]) < 1){ + stop(paste0("\n\U1F6D1 Your 'secondary_exclude' argument filters out all rows of data. Please revise and submit again")) + } + } + + # ---- Validate 'flag_only' ---- + if(!is.logical(flag_only)){ + stop("\n\U1F6D1 'flag_only' must be specified as a logical (i.e., TRUE, T, FALSE, or F)") + } + + # ---- Validate 'numerator_col' ---- + if(!numerator_col %in% names(ph.data)){ + stop(paste0("\n\U1F6D1 Required column '", numerator_col, "' is missing from the dataset")) + } + + # ---- Validate 'denominator_col' ---- + if(!denominator_col %in% names(ph.data)){ + warning(paste0("\u26A0\ufe0f Column '", denominator_col, "' is missing from the dataset. Only numerator-based suppression will be applied.")) + } + + # ---- Validate 'rse_col' ---- + if(!rse_col %in% names(ph.data)){ + warning(paste0("\u26A0\ufe0f Column '", rse_col, "', the value of `rse_col`, is missing from the dataset. 
`caution` flag for reliability will not be generated.")) + } + + # ---- Validate 'columns_to_suppress' ---- + missing_cols <- setdiff(columns_to_suppress, names(ph.data)) + if(length(missing_cols) > 0){ + warning(paste0("\u26A0\ufe0f The following columns specified in 'columns_to_suppress' are missing from the dataset and will be ignored: ", + paste(missing_cols, collapse = ", "))) + columns_to_suppress <- intersect(columns_to_suppress, names(ph.data)) + } + + + # ---- Copy ph.data to avoid changing the underlying data.table due to modification by references---- + temp.dt <- data.table::setDT(copy(ph.data)) + + # ---- Check for existing suppression and caution columns ---- + if("suppression" %in% names(temp.dt)){ + warning("\u26A0\ufe0f Existing 'suppression' column will be overwritten") + temp.dt[, suppression := NULL] # Remove existing column + } + + if("caution" %in% names(temp.dt)){ + warning("\u26A0\ufe0f Existing 'caution' column will be overwritten") + temp.dt[, caution := NULL] # Remove existing column + } + + # ---- Identify primary suppression ---- + # Check both numerator and denominator for suppression + temp.dt[, suppression := NA_character_] + + temp.dt[get(numerator_col) >= suppress_range[1] & get(numerator_col) <= suppress_range[2], suppression := "^"] + + if(denominator_col %in% names(temp.dt)){ # above used warning, not stop, if denominator_col didn't exist + temp.dt[get(denominator_col) >= suppress_range[1] & get(denominator_col) <= suppress_range[2], suppression := "^"] + } + + # ---- Identify secondary suppression (only based on numerator) ---- + if(isTRUE(secondary)){ + + # Apply secondary_exclude argument + if(!missing(secondary_exclude)){ + myfilter <- eval(expr = e, envir = temp.dt, enclos = parent.frame()) + temp.dt.aside <- data.table::fsetdiff(temp.dt, temp.dt[myfilter,]) + temp.dt <- temp.dt[myfilter,] + } + + # Create group id for each set of secondary_ids + temp.dt[, my.group := .GRP, by = secondary_ids] + 
data.table::setorder(temp.dt, my.group)
+
+    # Identify max number of rows per group defined by secondary_ids
+    temp.dt[, my.rowct := .N, by = secondary_ids]
+
+    # Identify groups that had initial suppression
+    temp.dt[, suppressed.group := FALSE]
+    temp.dt[my.group %in% unique(temp.dt[suppression == "^"]$my.group), suppressed.group := TRUE]
+
+    # Within groups that had suppression, count the number of rows that were not suppressed
+    temp.dt[my.group %in% unique(temp.dt[suppressed.group == TRUE]$my.group) & is.na(suppression),
+            rows.unsuppressed := .N, by = secondary_ids]
+    suppressWarnings(temp.dt[, rows.unsuppressed := max(rows.unsuppressed, na.rm = TRUE), by = my.group])
+
+    # Identify when the number of un-suppressed rows (in groups that had suppression) is max rows minus 1
+    # (these need secondary suppression)
+    temp.dt[is.na(suppression) & rows.unsuppressed == my.rowct - 1,
+            my.flag := "group needs secondary suppression"]
+
+    # Sort table so the smallest numerator per group that needs secondary suppression is first
+    data.table::setorderv(temp.dt, c('my.group', numerator_col), na.last = TRUE)
+
+    # Suppress row with smallest numerator among groups needing secondary suppression
+    if(nrow(temp.dt[my.flag == "group needs secondary suppression"]) > 0){
+      temp.dt[my.flag == "group needs secondary suppression", my.order := 1:.N, by = my.group]
+      temp.dt[my.order == 1, suppression := "^"]
+    }
+
+    # Drop all temporary variables
+    temp.dt[, (intersect(c("my.group", "suppressed.group", "my.rowct", "my.flag", "my.order",
+                           "rows.unsuppressed"), names(temp.dt))) := NULL]
+
+    # Combine back with data filtered out by secondary_exclude
+    if(exists("temp.dt.aside")){
+      temp.dt <- rbind(temp.dt, temp.dt.aside)
+      rm(temp.dt.aside)
+    }
+  }
+
+  # ---- Apply suppression to columns_to_suppress unless flag_only = TRUE ----
+  # Use validated columns_to_suppress
+  if(isFALSE(flag_only)){
+    temp.dt[suppression == "^", (columns_to_suppress) := NA]
+  }
+
+  # ---- Apply caution flag 
if possible ---- + if(rse_col %in% names(temp.dt)){ + temp.dt[get(rse_col) >= 30 | get(numerator_col) == 0, caution := "!"] + } + + return(temp.dt) + +} diff --git a/R/globals.R b/R/globals.R index e51db41..133abd0 100644 --- a/R/globals.R +++ b/R/globals.R @@ -4,12 +4,13 @@ utils::globalVariables(c( "age", "AgeMax", "AgeMin", + "batched_id", "bigcity", + "catvarname", "cat1", "cat1_group", "cat1_varname", "cat2", - "cat2", "cat2_group", "cat2_varname", "caution", @@ -36,6 +37,7 @@ utils::globalVariables(c( "group", "group_by1", "group_by2", + "group_value", "hospitalizations", "hospitalizations_icd9", "hospitalizations_icd10", @@ -55,9 +57,11 @@ utils::globalVariables(c( "level", "lower_bound", "mechanism", + "min_age", + "min_start", "min1", "min2", - "min_age", + "max_stop", "max1", "max2", "max_age", @@ -69,6 +73,7 @@ utils::globalVariables(c( "pattern", "pop", "pov200grp", + "query_key", "race", "race_aic", "race_eth", @@ -77,11 +82,13 @@ utils::globalVariables(c( "region", "region_name", "relative.diff", + "representative_row_index", "result", "result_type", "result.x", "result.y", "round2", + "row_index", "rse", "run_date", "run_datex", @@ -90,6 +97,7 @@ utils::globalVariables(c( "source_id", "span", "start", + "stop", "suppression", "tab", "tempbv", diff --git a/R/zzz.R b/R/zzz.R new file mode 100644 index 0000000..d591e0b --- /dev/null +++ b/R/zzz.R @@ -0,0 +1,41 @@ +.onAttach <- function(libname, pkgname) { + tryCatch({ + if (!requireNamespace("httr", quietly = TRUE)) { + packageStartupMessage("apde.chi.tools update check skipped: \n'httr' package is needed to check for updates and is not installed.") + return() + } + + url <- "https://raw.githubusercontent.com/PHSKC-APDE/apde.chi.tools/main/DESCRIPTION" + resp <- httr::GET(url) + + if (httr::status_code(resp) == 200) { # 200 means is okay / was successful + desc_text <- httr::content(resp, "text") + + # use Debian Control Format (.dcf) because the DESCRIPTION file in R uses dcf + desc_vals <- 
tryCatch(read.dcf(textConnection(desc_text)), error = function(e) NULL) + + remote_version <- NULL + if (!is.null(desc_vals) && "Version" %in% colnames(desc_vals)) { + remote_version <- package_version(desc_vals[1, "Version"]) + } + + local_version <- utils::packageVersion(pkgname) + + if (!is.null(remote_version) && remote_version > local_version) { + packageStartupMessage("-----------------------------------------------------") + packageStartupMessage("\u26A0\ufe0f A newer version of apde.chi.tools is available: ", remote_version) + packageStartupMessage("Your version: ", local_version) + update_msg <- ifelse(requireNamespace("devtools", quietly = TRUE), + '\U0001f504 To update, run: devtools::install_github("PHSKC-APDE/apde.chi.tools", auth_token = NULL)', + '\U0001f504 To update, install devtools and then run: devtools::install_github("PHSKC-APDE/apde.chi.tools", auth_token = NULL)' + ) + packageStartupMessage(update_msg) + packageStartupMessage("-----------------------------------------------------") + } + } else { + packageStartupMessage("Could not retrieve version info from GitHub (status: ", httr::status_code(resp), ")") + } + }, error = function(e) { + packageStartupMessage("Version check skipped: could not connect to GitHub") + }) +} diff --git a/man/chi_get_proper_pop.Rd b/man/chi_get_proper_pop.Rd index ab46819..53e6432 100644 --- a/man/chi_get_proper_pop.Rd +++ b/man/chi_get_proper_pop.Rd @@ -1,40 +1,49 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/chi_get_proper_pop.R +% Please edit documentation in R/chi_get_proper_pop.r \name{chi_get_proper_pop} \alias{chi_get_proper_pop} \title{Get Population Denominators for CHI Analysis} \usage{ -chi_get_proper_pop(pop.template = NULL, pop.genders = NULL, pop.ages = NULL) +chi_get_proper_pop( + pop.template = NULL, + pop.genders = NULL, + pop.ages = NULL, + is_chars = FALSE +) } \arguments{ \item{pop.template}{A data.table produced by 
\code{\link{chi_generate_instructions_pop}}, containing instructions for population data retrieval with the following columns: \itemize{ - \item year: Year range (e.g., "2019-2021" or single year) - \item cat1, cat1_varname: Primary stratification variables - \item cat2, cat2_varname: Secondary stratification variables - \item tab: Visualization tab type - \item start, stop: Start and end years parsed from the year range - \item race_type: Race categorization type ('race', 'race_eth', or 'race_aic') - \item geo_type: Geographic level for analysis ('kc', 'hra', 'region', 'blk', 'zip', 'wa') - \item group_by1, group_by2: Race/eth grouping specifications ('race_eth', 'race') + \item \code{year}: Year range (e.g., "2019-2021" or single year) + \item \code{cat1}, \code{cat1_varname}: Primary stratification variables + \item \code{cat2}, \code{cat2_varname}: Secondary stratification variables + \item \code{tab}: Visualization tab type + \item \code{start}, \code{stop}: Start and end years parsed from the year range + \item \code{race_type}: Race categorization type ('race', 'race_eth', or 'race_aic') + \item \code{geo_type}: Geographic level for analysis ('kc', 'hra', 'region', 'blk', 'zip', 'wa') + \item \code{group_by1}, group_by2: Race/eth grouping specifications ('race_eth', 'race') }} \item{pop.genders}{Optional character vector specifying which genders to include. -Valid values are 'f', 'female', 'm', 'male'. If NULL (default), both genders are included.} +Valid values are \code{'f'}, \code{'female'}, \code{'m'}, \code{'male'}. If NULL (default), both genders are included.} \item{pop.ages}{Optional integer vector specifying which ages to include. -If NULL (default), ages 0-100 are included.} +If \code{NULL} (default), ages 0-100 are included.} + +\item{is_chars}{Logical (T|F). If \code{TRUE}, will calculate King County population based +on ZIP codes beginning with 980 or 981, which is necessary for CHARS data. 
+Default \code{is_chars = FALSE}.} } \value{ A data.table containing population counts with columns: \itemize{ - \item chi_age: Single year age - \item year: Year - \item cat1, cat1_varname, cat1_group: Primary stratification variables - \item cat2, cat2_varname, cat2_group: Secondary stratification variables - \item tab: Visualization tab type - \item pop: Population count + \item \code{chi_age}: Single year age + \item \code{year}: Year + \item \code{cat1}, cat1_varname, cat1_group: Primary stratification variables + \item \code{cat2}, cat2_varname, cat2_group: Secondary stratification variables + \item \code{tab}: Visualization tab type + \item \code{pop}: Population count } } \description{ @@ -44,13 +53,33 @@ categories, geographic levels, and time periods to create appropriate population denominators for use in CHI rate calculations. } \details{ -The function performs multiple steps to generate proper population denominators: -1. Validates input parameters -2. Creates population tables for each row of the template using rads::get_population -3. Handles special cases for various geographic aggregations and crosswalks -4. Returns a comprehensive, tidy dataset with population counts +The function: + +1. Validates inputs + +2. Batches similar population queries based on key parameters + +3. Makes minimal database calls to cover all required year ranges + +4. Post-processes data in R to match the original template requirements + +5. When is_chars=TRUE, uses a different approach for King County that aggregates + populations from ZIP codes starting with 980/981 instead of using the standard + King County boundary + +For better performance, we recommended using futures for parallel processing. 
Here is one suggested set up: + +\preformatted{ +library(future) + +options(future.globals.maxSize = 3000 * 1024^2) # this sets a 3GB limit per future + +num.cores <- future::availableCores() - 1 # use one less than the available cores + +plan(multisession(workers = num.cores[1])) # set up a multisession plan +} } \seealso{ \code{\link{chi_generate_instructions_pop}} which generates the instructions used as input -to this function +to this function, \code{\link{chi_get_proper_pop}} the original implementation } diff --git a/man/chi_suppress_results.Rd b/man/chi_suppress_results.Rd new file mode 100644 index 0000000..a6c283e --- /dev/null +++ b/man/chi_suppress_results.Rd @@ -0,0 +1,139 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/chi_suppress_results.R +\name{chi_suppress_results} +\alias{chi_suppress_results} +\title{Suppress data due to small numbers and create a caution flag due to reliability} +\usage{ +chi_suppress_results( + ph.data = NULL, + suppress_range = c(1, 9), + secondary = FALSE, + secondary_ids = c("tab", "indicator_key", "cat1", "cat2_group", "year"), + secondary_exclude, + flag_only = FALSE, + numerator_col = "numerator", + denominator_col = "denominator", + rse_col = "rse", + columns_to_suppress = c("result", "lower_bound", "upper_bound", "se", "rse", + "numerator", "denominator") +) +} +\arguments{ +\item{ph.data}{A data.table or data.frame. Must contain the data to be suppressed +with standard metric names, +i.e., mean, median, sum, rate, lower, upper, se, rse, numerator, denominator, +proportion} + +\item{suppress_range}{Integer vector of length 2. They specify the minimum and +maximum range for suppression.} + +\item{secondary}{Logical (\code{T}, \code{TRUE}, \code{F}, or \code{FALSE}) +indicating whether secondary suppression should be run} + +\item{secondary_ids}{Character vector of column names which are used to define +groups for secondary suppression. 
+Note, this should not include the most granular level. For example, if you wanted +secondary suppression for race/ethnicity within HRAs +where \code{cat1_varname == 'race4'} and \code{cat1_group == 'AIAN'}, +\code{cat1_group == 'Asian'}, \code{cat1_group == 'Black'}, etc., you should +have \code{secondary_ids = c('hra20_name', 'cat1_varname')} rather than +\code{secondary_ids = c('hra20_name', 'cat1_varname', 'cat1_group')}} + +\item{secondary_exclude}{An unquoted expression that evaluates to a logical vector +indicating which rows to include in secondary suppression analysis. Use this parameter +to exclude categories that are not mutually exclusive (e.g., overlapping demographic +groups). The expression should use column names from the dataset and evaluate to TRUE +for rows to include. For example: `secondary_exclude = cat1_varname != "race3"` would +exclude all rows where cat1_varname equals "race3" from secondary suppression.} + +\item{flag_only}{Logical (\code{T}, \code{TRUE}, \code{F}, or \code{FALSE}) +indicating whether data to be +suppressed should be flagged without setting estimates to NA} + +\item{numerator_col}{Character string with the name of the numerator column. Default is "numerator".} + +\item{denominator_col}{Character string with the name of the denominator column. Default is "denominator".} + +\item{rse_col}{Character string with the name of the relative standard error column. Default is "rse".} + +\item{columns_to_suppress}{Character vector of column names to be suppressed (set to NA) when +suppression is flagged. Default includes common result columns.} +} +\value{ +A data.table with suppression applied to CHI standard columns. +} +\description{ +This function applies primary data suppression based on the numerator and +denominator and secondary suppression based on the numerator. It also flags +low reliability. Suppressed data are noted by \code{suppression = '^'} and +unreliable data are noted by \code{caution = '!'}. 
+} +\details{ +Data source specific suppression ranges can be found in +\href{https://kc1.sharepoint.com/teams/DPH-APDEData/Shared\%20Documents/General/data_presentation_algorithm/APDE_SmallNumberUpdate.xlsx}{APDE_SmallNumberUpdate.xlsx} + + +Please review the +\href{https://kc1.sharepoint.com/teams/DPH-APDEData/Shared\%20Documents/General/data_presentation_algorithm/APDEDataPresentationAlgorithm_2020_Approved.pptx}{APDE} +and \href{https://www.doh.wa.gov/Portals/1/Documents/1500/SmallNumbers.pdf}{DOH} +suppression guidelines for details on the logic used in this function. + +This function expects data that has already been formatted for CHI. However, +the user can modify the default parameter settings for use with most tables of +summary data. +} +\examples{ +\dontrun{ +set.seed(98104) +dt <- data.table::data.table( + chi_year = rep(2018, 100), + result = rnorm(100, .25, .05), + numerator = round(rnorm(100, 20, 9), 0), + denominator = round(rnorm(100, 100, 30), 0), + rse = sample(1:100, 100, replace = TRUE) +) + +# Basic suppression with default parameters +newdt <- chi_suppress_results(ph.data = dt, + suppress_range = c(1, 9), + secondary = FALSE) + +nrow(dt[numerator \%in\% 1:9 | denominator \%in\% 1:9]) # rows needed suppression +nrow(newdt[suppression=='^']) # rows suppressed +nrow(newdt[rse >= 30 | numerator == 0]) # rows needing caution +nrow(newdt[caution=='!']) # rows with caution + +# With secondary suppression +dt$region <- sample(c("North", "South", "East", "Seattle"), 100, replace = TRUE) +dt$category <- sample(c("A", "B", "C"), 100, replace = TRUE) + +newdt2 <- chi_suppress_results(ph.data = dt, + suppress_range = c(1, 9), + secondary = TRUE, + secondary_ids = c("region", "category")) + +nrow(newdt[suppression=='^']) # only primary suppression +nrow(newdt2[suppression=='^']) # with secondary suppression + +# Using custom column names +dt2 <- data.table::data.table( + chi_year = rep(2018, 100), + mean = rnorm(100, .25, .05), + num = round(rnorm(100, 
20, 9), 0), + denom = round(rnorm(100, 100, 30), 0), + rel_se = sample(1:100, 100, replace = TRUE) +) + +newdt3 <- chi_suppress_results(ph.data = dt2, + suppress_range = c(1, 9), + numerator_col = "num", + denominator_col = "denom", + rse_col = "rel_se", + columns_to_suppress = c("mean", "num", "denom")) + +nrow(dt2[num \%in\% 1:9 | denom \%in\% 1:9]) # rows need suppression +nrow(newdt3[suppression == '^']) # rows suppressed +} + +} +\keyword{suppression} diff --git a/tests/testthat/test-chi_suppress_results.R b/tests/testthat/test-chi_suppress_results.R new file mode 100644 index 0000000..3d1e234 --- /dev/null +++ b/tests/testthat/test-chi_suppress_results.R @@ -0,0 +1,224 @@ +library('data.table') +library('testthat') + +# create test data ---- +set.seed(98104) +dt <- suppressWarnings(data.table::data.table(chi_year = 2022, + indicator = "home ownership", + team = sample(as.vector(outer(letters, 1:15, paste0)), rep = T), + color = c("red", "blue", "yellow", "green"), + numerator = sample(1:200, 1000, rep = TRUE))) +dt[, denominator := sample(500:1000, 1000, rep = TRUE)] +dt[, result := numerator / denominator] +dt[, se := sqrt(result/100)] # not a real formula! 
+dt[, lower_bound := result - (1.96 * se)] +dt[, upper_bound := result + (1.96 * se)] +dt[, rse := 100*se / result] +setorder(dt, indicator, team, color, numerator) +dt[, counter := 1:.N, c("indicator", "team", "color")] +dt <- dt[counter == 1][, counter := NULL] + +# test defaults ---- +dt1 <- chi_suppress_results(dt) +test_that('Check that defaults work as expected',{ + expect_equal(nrow(dt1[suppression=="^"]), nrow(dt[numerator %in% 1:9 | denominator %in% 1:9])) + expect_equal(nrow(dt1[suppression=="^"]), nrow(dt1[is.na(result)])) + expect_equal(nrow(dt1[suppression=="^"]), nrow(dt1[is.na(se)])) + expect_equal(nrow(dt1[suppression=="^"]), nrow(dt1[is.na(lower_bound)])) + expect_equal(nrow(dt1[suppression=="^"]), nrow(dt1[is.na(rse)])) + expect_equal(nrow(dt1[caution=="!"]), nrow(dt[!(numerator %in% 1:9 | denominator %in% 1:9) & (rse >=30 | numerator == 0)])) +}) + +# test suppression range ---- +dt2 <- chi_suppress_results(dt, suppress_range = c(0,10), secondary = FALSE) +test_that('Check that the suppression_range argument works',{ + expect_equal(nrow(dt2[suppression=="^"]), nrow(dt[numerator <= 10 | denominator <= 10])) + expect_equal(nrow(dt2[suppression=="^"]), nrow(dt2[is.na(result)])) + expect_equal(nrow(dt2[suppression=="^"]), nrow(dt2[is.na(se)])) + expect_equal(nrow(dt2[suppression=="^"]), nrow(dt2[is.na(lower_bound)])) + expect_equal(nrow(dt2[suppression=="^"]), nrow(dt2[is.na(rse)])) + expect_equal(nrow(dt2[caution=="!"]), nrow(dt[!(numerator %in% 0:10 | denominator %in% 0:10) & (rse >=30)])) +}) + +# test secondary suppression ---- +dt3 <- chi_suppress_results(dt, suppress_range = c(0,10), + secondary = TRUE, + secondary_ids = c("indicator", "team")) +#ugly manual method to apply secondary suppression for comparison +sec.suppress3 <- copy(dt2) # build off results from initial / primary suppression +sec.suppress3[, max.grp.rows := .N, .(indicator, team)] # num of rows per set of secondary_ids +sec.suppress3[, group := .GRP, by = .(indicator, team)] # 
create group id for each set of secondary_ids
+supp.ids <- unique(sec.suppress3[suppression=="^"]$group) # get group ids where there was initial suppression
+sec.suppress3[, suppressed.group := F]
+sec.suppress3[group %in% supp.ids, suppressed.group := T] # identify groups with initial suppression in table
+sec.suppress3[group %in% supp.ids & is.na(suppression), unsuppressed := .N, .(indicator, team)] # rows unsuppressed per group
+suppressWarnings(sec.suppress3[, unsuppressed := max(unsuppressed, na.rm = T), .(indicator, team)]) # fill in NA for rows unsuppressed
+sec.suppress3[is.na(suppression) & unsuppressed == max.grp.rows - 1, secondary.suppression := T] # identify groups that need secondary suppression (groups with exactly one suppressed row)
+setorder(sec.suppress3, group, numerator, na.last = T) # sort from smallest to largest numerator by group
+sec.suppress3[secondary.suppression == T, order := 1:.N, group] # identify 1st row (smallest numerator) of each group needing secondary suppression
+sec.suppress3[order==1, suppression := "^"] # mark the specific rows to have secondary suppression
+sec.suppress3[suppression == "^", c("numerator", "denominator", "result", "se", "lower_bound", "upper_bound", "rse", "caution") := NA]
+sec.suppress3[, c("max.grp.rows", "group", "suppressed.group", "unsuppressed", "secondary.suppression", "order") := NULL]
+
+test_that('Check that secondary suppression works',{
+  expect_equal(nrow(dt3[suppression=="^"]), nrow(sec.suppress3[suppression=="^"]))
+  expect_equal(nrow(dt3[suppression=="^"]), nrow(dt3[is.na(result)]))
+  expect_equal(nrow(dt3[suppression=="^"]), nrow(dt3[is.na(se)]))
+  expect_equal(nrow(dt3[suppression=="^"]), nrow(dt3[is.na(lower_bound)]))
+  expect_equal(nrow(dt3[suppression=="^"]), nrow(dt3[is.na(rse)]))
+})
+
+# test NA handling in numerator and denominator ----
+dt_na <- copy(dt)
+dt_na[1:10, numerator := NA]
+dt_na[11:20, denominator := NA]
+dt_na_result <- chi_suppress_results(dt_na)
+test_that('Check NA 
handling in numerator and denominator', {
+  # Check whether NAs are properly handled and not incorrectly flagged
+  expect_equal(sum(is.na(dt_na_result[1:20]$suppression)), 20)
+})
+
+# test missing numerator column ----
+dt_no_num <- copy(dt)[, numerator := NULL]
+test_that('Check error when numerator column is missing', {
+  expect_error(chi_suppress_results(dt_no_num), "Required column 'numerator' is missing")
+})
+
+# test secondary suppression group ----
+dt_single <- data.table(
+  chi_year = 2022,
+  indicator = c("A", "A", "B"),
+  team = c("team1", "team2", "team1"),
+  numerator = c(5, 15, 20),
+  denominator = c(100, 100, 100)
+)
+dt_single[, result := numerator / denominator]
+dt_single[, rse := 20]
+
+dt_single_result <- suppressWarnings(chi_suppress_results(dt_single,
+                                                          secondary = TRUE,
+                                                          secondary_ids = c("indicator")))
+
+test_that('Check secondary suppression with single row group', {
+  # Group A should be entirely suppressed and
+  # Group B should have one unsuppressed row
+  expect_equal(nrow(dt_single_result[indicator == "A" & suppression=='^']), 2)
+  expect_equal(nrow(dt_single_result[indicator == "B" & is.na(suppression)]), 1)
+})
+
+# test secondary suppression with secondary_exclude ----
+dt4 <- chi_suppress_results(dt, suppress_range = c(0,10),
+                            secondary = TRUE,
+                            secondary_ids = c("indicator", "team"),
+                            secondary_exclude = !team %in% c('a10', 'a11'))
+
+#ugly manual method to apply secondary suppression for testing
+exclusion4 <- copy(dt2)[team %in% c('a10', 'a11')] # partition off part excluded from secondary suppression
+sec.suppress4 <- copy(dt2)[!team %in% c('a10', 'a11')] # build off results from initial / primary suppression
+sec.suppress4[, max.grp.rows := .N, .(indicator, team)] # num of rows per set of secondary_ids
+sec.suppress4[, group := .GRP, by = .(indicator, team)] # create group id for each set of secondary_ids
+supp.ids <- unique(sec.suppress4[suppression=="^"]$group) # get group ids where there was initial suppression 
+sec.suppress4[, suppressed.group := F] +sec.suppress4[group %in% supp.ids, suppressed.group := T] # identify groups with initial suppression in table +sec.suppress4[group %in% supp.ids & is.na(suppression), unsuppressed := .N, .(indicator, team)] # rows unsuppressed per group +suppressWarnings(sec.suppress4[, unsuppressed := max(unsuppressed, na.rm = T), .(indicator, team)]) # fill in NA for rows unsuppressed +sec.suppress4[is.na(suppression) & unsuppressed == max.grp.rows - 1, secondary.suppression := T] # identify groups that need secondary suppression (groups with exactly one suppressed row) +setorder(sec.suppress4, group, numerator, na.last = T) # sort from smallest to largest numerator by group +sec.suppress4[secondary.suppression == T, order := 1:.N, group] # identify 1st row (smallest numerator) of each group needing secondary suppression +sec.suppress4[order==1, suppression := "^"] # mark the specific rows to have secondary suppression +sec.suppress4[suppression == "^", c("numerator", "denominator", "result", "se", "lower_bound", "upper_bound", "rse", "caution") := NA] +sec.suppress4[, c("max.grp.rows", "group", "suppressed.group", "unsuppressed", "secondary.suppression", "order") := NULL] +sec.suppress4 <- rbind(sec.suppress4, exclusion4) + +test_that('Check that secondary suppression with exclusion works',{ + expect_equal(nrow(dt4[suppression=="^"]), nrow(sec.suppress4[suppression=="^"])) + expect_equal(nrow(dt4[suppression=="^"]), nrow(dt4[is.na(result)])) + expect_equal(nrow(dt4[suppression=="^"]), nrow(dt4[is.na(se)])) + expect_equal(nrow(dt4[suppression=="^"]), nrow(dt4[is.na(lower_bound)])) + expect_equal(nrow(dt4[suppression=="^"]), nrow(dt4[is.na(rse)])) +}) + +# test flag_only ---- +dt5 <- chi_suppress_results(dt, flag_only = TRUE) +test_that('Check that flag_only works',{ + expect_equal(nrow(dt5[suppression=="^"]), nrow(dt[numerator <= 9 | denominator <= 9])) + expect_equal(0, nrow(dt5[is.na(result)])) + expect_equal(0, nrow(dt5[is.na(se)])) + 
expect_equal(0, nrow(dt5[is.na(lower_bound)])) + expect_equal(0, nrow(dt5[is.na(rse)])) + expect_equal(nrow(dt5[caution=="!"]), nrow(dt[rse >=30 | numerator == 0])) +}) + +# test secondary_exclude when character and unquoted expression ---- +test_that('Check that the same results are returned whether or not quoted',{ + expect_warning(dt6 <- chi_suppress_results(dt, secondary_exclude = "team %like% '^a|^b|^c|^d'")) + dt7 <- chi_suppress_results(dt, secondary_exclude = team %like% '^a|^b|^c|^d') + expect_identical(dt6, dt7) +}) + +# test custom column names ---- +dt_custom <- copy(dt) +setnames(dt_custom, + old = c("numerator", "denominator", "result", "rse"), + new = c("num", "denom", "value", "rel_error")) + +test_that('Check that custom column names work correctly', { + dt_custom_result <- chi_suppress_results( + dt_custom, + numerator_col = "num", + denominator_col = "denom", + rse_col = "rel_error", + columns_to_suppress = c("value", "se", "lower_bound", "upper_bound", "num", "denom") + ) + + expect_equal(nrow(dt_custom_result[suppression=="^"]), + nrow(dt_custom[num <= 9 | denom <= 9])) + expect_equal(nrow(dt_custom_result[caution=="!"]), + nrow(dt_custom[rel_error >= 30 | num == 0])) +}) + +# test missing columns handling ---- +dt_missing <- copy(dt)[, rse := NULL] + +test_that("Check that missing rse column is handled properly", { + warnings <- character() + dt_missing_result <- withCallingHandlers( # needed because will produce two warnings + chi_suppress_results(dt_missing), + warning = function(w) { + warnings <<- c(warnings, conditionMessage(w)) + invokeRestart("muffleWarning") # prevents printing the warning + } + ) + + expect_true(any(grepl("Column 'rse', the value of `rse_col`, is missing", warnings))) + expect_true(any(grepl("columns specified in 'columns_to_suppress' are missing", warnings))) + expect_false("caution" %in% names(dt_missing_result)) +}) + +# test non-existent columns in columns_to_suppress ---- +test_that('Check handling of 
non-existent columns in columns_to_suppress', { + expect_warning( + dt_cols <- chi_suppress_results( + dt, + columns_to_suppress = c("result", "non_existent_column", "another_missing") + ), + "The following columns specified in 'columns_to_suppress' are missing from the dataset" + ) + + # Should only suppress existing columns + expect_true(all(is.na(dt_cols[suppression=="^"]$result))) +}) + +# test existing suppression column ---- +dt_exist <- copy(dt) +dt_exist[, suppression := "old"] + +test_that('Check that existing suppression column is overwritten with warning', { + expect_warning( + dt_exist_result <- chi_suppress_results(dt_exist), + "Existing 'suppression' column will be overwritten" + ) + + # Should have overwritten the old suppression column + expect_false(nrow(dt_exist_result[suppression == "old"]) > 0) +})