From c41afbdef9c64a96565f5c9403a577e1ceef7349 Mon Sep 17 00:00:00 2001 From: Danny Colombara Date: Fri, 7 Mar 2025 14:47:54 -0800 Subject: [PATCH 01/11] Re-engineered chi_count_by_age - new validations - built in more tweaks for race3 headaches - dropped dependency on tidyr::crossing - progress meter - better documentation / comments --- NAMESPACE | 4 + R/globals.R | 5 +- R/proto_chi_count_by_age.R | 440 +++++++++++++++++++++++++++---------- man/chi_count_by_age.Rd | 40 +++- 4 files changed, 367 insertions(+), 122 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index e0ff381..698bbab 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -26,6 +26,7 @@ importFrom(data.table,"%between%") importFrom(data.table,":=") importFrom(data.table,.GRP) importFrom(data.table,.SD) +importFrom(data.table,CJ) importFrom(data.table,`:=`) importFrom(data.table,between) importFrom(data.table,copy) @@ -47,6 +48,9 @@ importFrom(future.apply,future_lapply) importFrom(glue,glue) importFrom(glue,glue_sql) importFrom(odbc,odbc) +importFrom(progressr,handlers) +importFrom(progressr,progressor) +importFrom(progressr,with_progress) importFrom(rads,calc) importFrom(rads,round2) importFrom(rads,string_clean) diff --git a/R/globals.R b/R/globals.R index d646335..81642ca 100644 --- a/R/globals.R +++ b/R/globals.R @@ -17,6 +17,7 @@ utils::globalVariables(c( "chi_age", "chi_geo_kc", "chi_year", + "combo_idx", "comparison_with_kc_sig", "count", "crosstabs", @@ -50,6 +51,7 @@ utils::globalVariables(c( "lower_bound", "notable", "numerator", + "overall", "pattern", "pop", "pov200grp", @@ -91,5 +93,6 @@ utils::globalVariables(c( "_kingcounty.x", "_kingcounty.y", "_wastate.x", - "_wastate.y" + "_wastate.y", + ".I" )) diff --git a/R/proto_chi_count_by_age.R b/R/proto_chi_count_by_age.R index 67c1bcb..377a668 100644 --- a/R/proto_chi_count_by_age.R +++ b/R/proto_chi_count_by_age.R @@ -1,132 +1,344 @@ -#' Generate Age-Specific Counts for CHI Data +#' Generate Age-Specific Counts for Community Health Indicators #' #' @description -#' Creates a detailed breakdown of counts by age for CHI data analysis, most -#' often for age standardization.Processes data according to provided -#' instructions and handles demographic groupings. +#' Creates a detailed breakdown of counts by age for CHI data analysis. Primarily +#' used for age standardization and rate calculations when combined with +#' population estimates. Processes data according to provided instructions and +#' handles demographic groupings with special treatment for race and ethnicity +#' variables. #' #' @param ph.data Input data frame or data table containing CHI data -#' @param ph.instructions Data frame or data table containing calculation instructions -#' @param source_date Date of data source +#' @param ph.instructions Data frame or data table containing calculation specifications with columns: +#' \itemize{ +#' \item indicator_key: Name of the health metric to calculate +#' \item tab: Visualization tab type (kingcounty, demgroups, crosstabs) +#' \item cat1_varname, cat2_varname: Variable names for stratification +#' \item cat1, cat2: Human-friendly labels for these variables +#' \item start, end: Year range for the calculation +#' } +#' @param source_date Date of data source, added to output metadata #' -#' @return A data.table containing age-specific counts with standard CHI groupings -#' @importFrom data.table setDT rbindlist setnames := setorder data.table +#' @return A data.table containing age-specific counts with standard CHI groupings: +#' \itemize{ +#' \item indicator_key: Health metric identifier +#' \item year: Year range of data (e.g., "2019-2021" or single year) +#' \item tab: Visualization tab type +#' \item cat1, cat1_varname, cat1_group: Primary stratification variable details +#' \item cat2, cat2_varname, cat2_group: Secondary stratification variable details (if applicable) +#' \item chi_age: Age value (0-100) +#' \item count: Number of cases in that demographic-age group +#' \item source_date: Date of data source (if provided) +#' } +#' +#' @seealso +#' \code{\link{chi_generate_tro_shell}} which creates ph.instructions used by +#' \code{chi_count_by_age} +#' +#' \code{\link{chi_generate_instructions_pop}} which uses the output of the output +#' of \code{chi_count_by_age} +#' +#' @importFrom data.table setDT rbindlist setnames := setorder data.table CJ #' @importFrom rads calc #' @importFrom future.apply future_lapply -#' @importFrom tidyr crossing +#' @importFrom progressr handlers progressor with_progress #' @export #' chi_count_by_age <- function(ph.data = NULL, ph.instructions = NULL, - source_date = NULL){ - # Create 'Overall' if needed for crosstabs ---- - if(!'overall' %in% names(ph.data)){ - ph.data$overall <- with(ph.data, ifelse(chi_geo_kc == 'King County', 'Overall', NA_character_)) - } - - # Check to make sure all variables needed exist in the data ---- - neededbyvars <- setdiff(unique(c(ph.instructions$cat1_varname, ph.instructions$cat2_varname)), c(NA)) - if("race3" %in% neededbyvars & !"race3_hispanic" %in% neededbyvars){neededbyvars <- c(neededbyvars, 'race3_hispanic')} # By definition, Hispanic cannot be contained within race3 - - neededvars <- setdiff(unique(c(ph.instructions$indicator_key, neededbyvars)), c(NA)) - - missingvars <- setdiff(neededvars, names(ph.data)) - if(length(missingvars) > 0 ){ - stop(paste0("\n\U2620 ph.data is missing the following columns that are specified in ph.instructions: ", paste0(missingvars, collapse = ', '), ". ", - "\nIf `race3_hispanic` is listed, that is because, by definition, `race3` cannot have a Hispanic ethnicity in the same variable. So, two ", - "\nvariables (`race3` & `race3_hispanic`) will be processed and in the output, it will be called `race3`")) - } else{message("\U0001f642 All specified variables exist in ph.data")} - - # Check to make sure all byvariables have the CHI specified encoding ---- - stdbyvars <- rads.data::misc_chi_byvars[varname %in% setdiff(unique(c(ph.instructions$cat1_varname, ph.instructions$cat2_varname)), c(NA))][, list(varname, group, keepme, reference = 1)] - stdbyvars[group %in% c("Hispanic", 'Non-Hispanic') & varname == 'race3', varname := 'race3_hispanic'] # necessary because race3 & Hispanic must be two distinct variables in raw data - phbyvars <- rbindlist(lapply( - X=as.list(neededbyvars), - FUN = function(X){data.table(varname = X, group = setdiff(unique(ph.data[[X]]), NA), ph.data = 1)})) - compbyvars <- merge(stdbyvars, phbyvars, by = c('varname', 'group'), all = T) - if(nrow(compbyvars[is.na(reference)| is.na(ph.data)]) > 0){ - print(compbyvars[is.na(reference)| is.na(ph.data)]) - stop("\n\U2620 the table above shows the varname/group combinations that do not align between the reference table and your ph.data.") - } else {message("\U0001f642 All specified cat1_group and cat2_group values align with the reference standard.")} - - # Cycle through a function that generates counts by age ---- - message("\U023F3 Be patient! The function is generating counts for each row of ph.instructions.") - tempCHIcount <- rbindlist(future_lapply( - X = as.list(seq(1, nrow(ph.instructions), 1)), - FUN = function(X){ - message(paste0("Calculating estimates for ph.instructions row ", X, " of ", nrow(ph.instructions), "...")) - - # create constants for calc---- - tempbv1 <- setdiff(ph.instructions[X][['cat1_varname']], c()) - tempbv2 <- setdiff(ph.instructions[X][['cat2_varname']], c()) - if(length(tempbv2) == 0){tempbv2 = NA} - tempbv <- setdiff(c(tempbv1, tempbv2), c(NA)) - tempbv <- c(tempbv, "chi_age") - - # create variables of interest used in calc function below - tempbv <- tempbv - tempend <- ph.instructions[X][['end']] - tempstart <- ph.instructions[X][['start']] - - # use calc---- - if(any(grepl('wastate', tempbv))){ - tempcount <- rads::calc(ph.data = ph.data, - what = ph.instructions[X][['indicator_key']], - where = chi_year >= tempstart & chi_year <= tempend, - by = tempbv, - metrics = c('numerator')) + source_date = NULL) { + + # Basic validation of inputs ---- + if (is.null(ph.data) || is.null(ph.instructions)) { + stop("Both ph.data and ph.instructions parameters are required") + } + + if (!is.data.frame(ph.data)) stop("\n\U1F6D1 ph.data must be a data.frame or data.table") + if (!is.data.frame(ph.instructions)) stop("\n\U1F6D1 ph.instructions must be a data.frame or data.table") + if (!is.null(source_date) & !inherits(source_date, "Date")) stop("\n\U1F6D1 source_date must be of type Date, if it is provided") + + # Convert inputs to data.table if they're not already + ph.data <- setDT(copy(ph.data)) + ph.instructions <- setDT(copy(ph.instructions)) + + # Create 'Overall' category if needed for crosstabs ---- + if (!"overall" %in% names(ph.data)) { + ph.data[, overall := ifelse(chi_geo_kc == "King County", "Overall", NA_character_)] + } + + # Check for required variables in the data ---- + # Extract all variable names needed from instructions + needed_byvars <- setdiff( + unique(c(ph.instructions$cat1_varname, ph.instructions$cat2_varname)), + c(NA) + ) + + # Special case: race3 requires race3_hispanic by definition + # race3 in the Tableau viz is presented as a single variables, but in reality + # it is made up of a race indicator and a hispanic ethnicity indicator. So, + # when instructions request race3, they need race3_hispanic as well to calculate + # the final race3 values (which is race, with Hispanic as ethnicity) + if ("race3" %in% needed_byvars & !"race3_hispanic" %in% needed_byvars) { + needed_byvars <- c(needed_byvars, "race3_hispanic") + } + + # Combine byvar names with indicator keys to get all required variables + needed_vars <- setdiff( + unique(c(ph.instructions$indicator_key, needed_byvars)), + c(NA) + ) + + # Check if any required variables are missing from the data + missing_vars <- setdiff(needed_vars, names(ph.data)) + if (length(missing_vars) > 0) { + stop(paste0( + "\n\U2620 ph.data is missing the following columns that are specified in ph.instructions: ", + paste0(missing_vars, collapse = ", "), ". ", + "\nIf `race3_hispanic` is listed, that is because, by definition, `race3` cannot have a Hispanic ethnicity in the same variable. ", + "\nTwo variables (`race3` & `race3_hispanic`) will be processed and in the output, it will be called `race3`" + )) } else { - tempcount <- rads::calc(ph.data = ph.data, - what = ph.instructions[X][['indicator_key']], - where = chi_year >= tempstart & chi_year <= tempend & chi_geo_kc == 'King County', - by = tempbv, - metrics = c('numerator')) + message("\U0001f642 All specified variables exist in ph.data") } - # tidy---- - tempcount[, cat1 := ph.instructions[X][['cat1']]] - setnames(tempcount, ph.instructions[X][['cat1_varname']], 'cat1_group') - tempcount[, cat1_varname := ph.instructions[X][['cat1_varname']]] - tempcount[, cat2 := ph.instructions[X][['cat2']]] - if(!is.na(tempbv2) & tempbv1 != tempbv2){ - setnames(tempcount, ph.instructions[X][['cat2_varname']], 'cat2_group')} else{ - tempcount[, cat2_group := NA] } - tempcount[, cat2_varname := ph.instructions[X][['cat2_varname']]] - - tempcount <- tempcount[!is.na(cat1_group)] - tempcount <- tempcount[!(is.na(cat2_group) & !is.na(cat2))] - tempcount <- tempcount[!is.na(chi_age)] - tempcount <- tempcount[, list(cat1, cat1_varname, cat1_group, cat2, cat2_varname, cat2_group, chi_age, count = numerator)] - - # create reference table with every combination of cat1 x cat2 and age---- - cat1table <- data.table(cat1 = ph.instructions[X][['cat1']], - cat1_varname = tempbv1, - cat1_group = sort(setdiff(as.character(unique(ph.data[[tempbv1]])), NA)) ) - cat2table <- suppressWarnings(data.table(cat2 = ph.instructions[X][['cat2']], - cat2_varname = tempbv2, - cat2_group = sort(setdiff(as.character(unique(ph.data[[tempbv2]])), NA))) ) - cattable <- tidyr::crossing(cat1table, cat2table) - cattable <- setDT(tidyr::crossing(cattable, data.table(chi_age = 0:100))) - - # merge the counts onto the reference table to get every combo of age x cat1 x cat2---- - tempcount <- merge(cattable, tempcount, by = c('cat1', 'cat1_varname', 'cat1_group', 'cat2', 'cat2_varname', 'cat2_group', 'chi_age'), all = T) - tempcount[is.na(count), count := 0] # when count is NA, it is actually zero and zero is needed for calculating age adjusted rates - - # add on remaining essential identifiers---- - tempcount[, indicator_key := ph.instructions[X][['indicator_key']]] - tempcount[, tab := ph.instructions[X][['tab']]] - tempcount[ph.instructions[X][['end']] != ph.instructions[X][['start']], - year := paste0(ph.instructions[X][['start']], "-", ph.instructions[X][['end']])] - tempcount[ph.instructions[X][['end']] == ph.instructions[X][['start']], - year := ph.instructions[X][['end']]] - - # order output---- - tempcount <- tempcount[, list(indicator_key, year, tab, cat1, cat1_varname, cat1_group, cat2, cat2_varname, cat2_group, chi_age, count)] - setorder(tempcount, cat1_group, cat2_group, chi_age) + # Verify that categorical variables follow CHI encoding standards ---- + # Get standardized byvar values from reference data + std_byvars <- rads.data::misc_chi_byvars + std_byvars <- std_byvars[varname %in% setdiff(unique(c(ph.instructions$cat1_varname, ph.instructions$cat2_varname)), c(NA))] + std_byvars <- std_byvars[, list(varname, group, keepme, reference = 1)] + + # Handle race3 and Hispanic special case + std_byvars[group %in% c("Hispanic", "Non-Hispanic") & varname == "race3", + varname := "race3_hispanic"] + + # Extract unique values for each byvar in the actual data + data_byvars <- rbindlist(lapply( + X = as.list(needed_byvars), + FUN = function(x) { + data.table( + varname = x, + group = setdiff(unique(ph.data[[x]]), NA), + ph.data = 1 + ) + } + )) + + # Merge actual values to reference standards + byvar_comparison <- merge(std_byvars, + data_byvars, + by = c("varname", "group"), + all = TRUE) + + # Check for any mismatches between data and reference standards + if (nrow(byvar_comparison[is.na(reference) | is.na(ph.data)]) > 0) { + print(byvar_comparison[is.na(reference) | is.na(ph.data)]) + stop("\n\U2620 The table above shows the varname/group combinations that do not align between the reference table and your ph.data.") + } else { + message("\U0001f642 All specified cat1_group and cat2_group values align with the reference standard.") + } + + # Generate counts for each row in instructions ---- + message("\U023F3 Be patient! The function is generating counts for each row of ph.instructions.") + + progressr::handlers(handler_progress()) + with_progress({ + p <- progressor(nrow(ph.instructions)) + count_results <- rbindlist(future_lapply( + X = as.list(seq_len(nrow(ph.instructions))), + FUN = function(row_idx) { + p(paste0("Processing row ", row_idx, " of ", nrow(ph.instructions) )) + # Set up calculations ---- + # Extract parameters for this calculation + current_instruction <- ph.instructions[row_idx] + primary_byvar <- current_instruction[["cat1_varname"]] + secondary_byvar <- current_instruction[["cat2_varname"]] + if (is.null(secondary_byvar) || length(secondary_byvar) == 0) { + secondary_byvar <- NA + } + # Combine all byvars including age + all_byvars <- setdiff(c(primary_byvar, secondary_byvar), c(NA)) + all_byvars <- unique(c(all_byvars, "chi_age")) + + # Calculate counts using rads::calc ---- + if (any(grepl("wastate", all_byvars))) { + # Washington state calculations + age_counts <- rads::calc( + ph.data = ph.data[chi_year >= current_instruction[["start"]] & chi_year <= current_instruction[["end"]]], + what = current_instruction[["indicator_key"]], + by = all_byvars, + metrics = c("numerator") + ) + } else { + # King County calculations + age_counts <- rads::calc( + ph.data = ph.data[chi_year >= current_instruction[["start"]] & chi_year <= current_instruction[["end"]] & chi_geo_kc == "King County"], + what = current_instruction[["indicator_key"]], + by = all_byvars, + metrics = c("numerator") + ) + } + + # Add cat1/cat2 information to results ---- + # Add cat1# info + age_counts[, cat1 := current_instruction[["cat1"]]] + setnames(age_counts, primary_byvar, "cat1_group") + age_counts[, cat1_varname := primary_byvar] + + # Add cat2# info + age_counts[, cat2 := current_instruction[["cat2"]]] + if (!is.na(secondary_byvar) & primary_byvar != secondary_byvar) { + setnames(age_counts, secondary_byvar, "cat2_group") + } else { + age_counts[, cat2_group := NA] + } + age_counts[, cat2_varname := secondary_byvar] + + # Filter out invalid combinations + age_counts <- age_counts[!is.na(cat1_group)] + age_counts <- age_counts[!(is.na(cat2_group) & !is.na(cat2))] + age_counts <- age_counts[!is.na(chi_age)] + + # Keep only necessary columns + age_counts <- age_counts[, list( + cat1, cat1_varname, cat1_group, + cat2, cat2_varname, cat2_group, + chi_age, count = numerator + )] + + # Create complete reference table for all combinations ---- + # When there is no data for a specific combination (e.g., NHPI 85+ in + # particular geography), the row will be missing. We want the row to be + # noted but with the count of 0, not NA. + + # cat1 summary table + cat1_table <- data.table( + cat1 = current_instruction[["cat1"]], + cat1_varname = primary_byvar, + cat1_group = sort(setdiff(as.character(unique(ph.data[[primary_byvar]])), NA)) + ) + + # cat2 summary table + if (!is.na(secondary_byvar)) { + cat2_table <- data.table( + cat2 = current_instruction[["cat2"]], + cat2_varname = secondary_byvar, + cat2_group = sort(setdiff(as.character(unique(ph.data[[secondary_byvar]])), NA)) + ) + } else { + cat2_table <- data.table( + cat2 = current_instruction[["cat2"]], + cat2_varname = secondary_byvar, + cat2_group = NA_character_ + ) + } + + # Create all logical combinations of cat1_table rows & cat2_table rows + # This approach preserves the correct relationship between variable + # names and their values. For example, if cat1_varname is 'chi_sex' then + # cat1_group must only contain values like 'Male' or 'Female', not values + # from other variables like 'Asian' or 'NHPI'. + # + # A simpler approach (like CJ with all columns directly) would create + # invalid combinations such as cat1_varname='chi_sex' with cat1_group='Hispanic', + # which is nonsensical. + # + # The nested lapply approach ensures that each category variable is only + # paired with its own valid values while still creating all valid combinations + # needed for the complete age-specific count table. + + all_combinations <- NULL + if (nrow(cat2_table) > 0) { + # For each row in cat1_table, combine with each row in cat2_table + all_combinations <- rbindlist(lapply(1:nrow(cat1_table), function(i) { + rbindlist(lapply(1:nrow(cat2_table), function(j) { + data.table( + cat1 = cat1_table[i]$cat1, + cat1_varname = cat1_table[i]$cat1_varname, + cat1_group = cat1_table[i]$cat1_group, + cat2 = cat2_table[j]$cat2, + cat2_varname = cat2_table[j]$cat2_varname, + cat2_group = cat2_table[j]$cat2_group + ) + })) + })) + } else { + all_combinations <- cat1_table + all_combinations[, `:=`( + cat2 = current_instruction[["cat2"]], + cat2_varname = secondary_byvar, + cat2_group = NA_character_ + )] + } + + # Now add all ages to create the final cartesian product + all_combinations_with_age <- CJ( + combo_idx = 1:nrow(all_combinations), + chi_age = 0:100, + unique = TRUE + ) + + all_combinations_with_age <- merge( + all_combinations_with_age, + all_combinations[, combo_idx := .I], + by = "combo_idx" + ) + + all_combinations_with_age[, combo_idx := NULL] + + + # Merge counts onto the reference table with all possible combinations ---- + complete_counts <- merge( + all_combinations_with_age, + age_counts, + by = c("cat1", "cat1_varname", "cat1_group", "cat2", "cat2_varname", "cat2_group", "chi_age"), + all.x = TRUE + ) + + # Replace NA counts with 0 (needed for age-adjusted rates) + complete_counts[is.na(count), count := 0] + + # Add remaining identifiers + complete_counts[, indicator_key := current_instruction[["indicator_key"]]] + complete_counts[, tab := current_instruction[["tab"]]] + + # Format year range appropriately + if (current_instruction[["end"]] != current_instruction[["start"]]) { + complete_counts[, year := paste0(current_instruction[["start"]], "-", current_instruction[["end"]])] + } else { + complete_counts[, year := as.character(current_instruction[["start"]])] + } + + # Order and select final output columns + complete_counts <- complete_counts[, list( + indicator_key, year, tab, + cat1, cat1_varname, cat1_group, + cat2, cat2_varname, cat2_group, + chi_age, count + )] + + setorder(complete_counts, cat1_group, cat2_group, chi_age) + return(complete_counts) + } # close function within future_lapply + ), # close future_lapply + use.names = TRUE) # close rbindlist + }) # close with_progress + + # Add source_date if provided + if (!is.null(source_date)) { + count_results[, source_date := source_date] + } else { + count_results[, source_date := as.Date(NA_character_)] } - ), use.names = TRUE) - # Return ---- - return(tempCHIcount) + # Modify race3 / race3_hispanic ---- + # as noted above, these are two distinct variables that are presented as a single + # variable in Tableau viz so the variable names need to be harmonized + count_results[cat1_varname == 'race3_hispanic', cat1_varname := 'race3'] + count_results[cat2_varname == 'race3_hispanic', cat2_varname := 'race3'] + drop_race3_groups <- rads.data::misc_chi_byvars[varname == 'race3' & keepme == 'No']$group + count_results <- count_results[!(cat1_varname == 'race3' & cat1_group %in% drop_race3_groups)] + count_results <- count_results[is.na(cat2) | !(cat2_varname == 'race3' & cat2_group %in% drop_race3_groups)] + + # Return the final results ---- + return(count_results) } diff --git a/man/chi_count_by_age.Rd b/man/chi_count_by_age.Rd index d5af4ac..d5645c5 100644 --- a/man/chi_count_by_age.Rd +++ b/man/chi_count_by_age.Rd @@ -2,22 +2,48 @@ % Please edit documentation in R/proto_chi_count_by_age.R \name{chi_count_by_age} \alias{chi_count_by_age} -\title{Generate Age-Specific Counts for CHI Data} +\title{Generate Age-Specific Counts for Community Health Indicators} \usage{ chi_count_by_age(ph.data = NULL, ph.instructions = NULL, source_date = NULL) } \arguments{ \item{ph.data}{Input data frame or data table containing CHI data} -\item{ph.instructions}{Data frame or data table containing calculation instructions} +\item{ph.instructions}{Data frame or data table containing calculation specifications with columns: +\itemize{ + \item indicator_key: Name of the health metric to calculate + \item tab: Visualization tab type (kingcounty, demgroups, crosstabs) + \item cat1_varname, cat2_varname: Variable names for stratification + \item cat1, cat2: Human-friendly labels for these variables + \item start, end: Year range for the calculation +}} -\item{source_date}{Date of data source} +\item{source_date}{Date of data source, added to output metadata} } \value{ -A data.table containing age-specific counts with standard CHI groupings +A data.table containing age-specific counts with standard CHI groupings: + \itemize{ + \item indicator_key: Health metric identifier + \item year: Year range of data (e.g., "2019-2021" or single year) + \item tab: Visualization tab type + \item cat1, cat1_varname, cat1_group: Primary stratification variable details + \item cat2, cat2_varname, cat2_group: Secondary stratification variable details (if applicable) + \item chi_age: Age value (0-100) + \item count: Number of cases in that demographic-age group + \item source_date: Date of data source (if provided) + } } \description{ -Creates a detailed breakdown of counts by age for CHI data analysis, most -often for age standardization.Processes data according to provided -instructions and handles demographic groupings. +Creates a detailed breakdown of counts by age for CHI data analysis. Primarily +used for age standardization and rate calculations when combined with +population estimates. Processes data according to provided instructions and +handles demographic groupings with special treatment for race and ethnicity +variables. +} +\seealso{ +\code{\link{chi_generate_tro_shell}} which creates ph.instructions used by +\code{chi_count_by_age} + +\code{\link{chi_generate_instructions_pop}} which uses the output of the output +of \code{chi_count_by_age} } From 53b4c5640fd39f06e34b4434a1fc28db386f763a Mon Sep 17 00:00:00 2001 From: Danny Colombara Date: Fri, 7 Mar 2025 14:48:49 -0800 Subject: [PATCH 02/11] Renamed proto_chi_count_by_age > chi_count_by_age --- R/{proto_chi_count_by_age.R => chi_count_by_age.R} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename R/{proto_chi_count_by_age.R => chi_count_by_age.R} (100%) diff --git a/R/proto_chi_count_by_age.R b/R/chi_count_by_age.R similarity index 100% rename from R/proto_chi_count_by_age.R rename to R/chi_count_by_age.R From da04629d4657a3948ef1a2be4cb96c7e8b3b723b Mon Sep 17 00:00:00 2001 From: Danny Colombara Date: Fri, 7 Mar 2025 16:10:26 -0800 Subject: [PATCH 03/11] improved chi_generate_instructions_pop - improved roxygen2 header - simplified / tidier code, but same fundamental logic / structure --- R/proto_chi_generate_instructions_pop.R | 191 +++++++++++++++++------- 1 file changed, 135 insertions(+), 56 deletions(-) diff --git a/R/proto_chi_generate_instructions_pop.R b/R/proto_chi_generate_instructions_pop.R index e679bc7..4883416 100644 --- a/R/proto_chi_generate_instructions_pop.R +++ b/R/proto_chi_generate_instructions_pop.R @@ -1,67 +1,146 @@ #' Generate Population Instructions for CHI Analysis #' #' @description -#' Creates a instructions for rads::get_population() based on count data -#' specifications. Handles various geographic types and demographic groupings. +#' Creates instructions for \link{\code{CHI_get_proper_pop}} based on a table of count +#' data. These instructions configure appropriate demographic groupings, +#' geographic types, and time periods for retrieving population denominators used +#' in CHI rate calculations. #' -#' @param mycount.data Input data.table containing count data specifications -#' @param povgeo Geographic level for poverty analysis (NA or 'zip') +#' @param mycount.data Input data.table produced by \link{\code{chi_count_by_age}}, +#' containing the following columns: +#' \itemize{ +#' \item indicator_key: indicator_key used by CHI +#' \item year: Year range (e.g., "2019-2021" or single year) +#' \item tab: Visualization tab type +#' \item cat1, cat1_varname, cat1_group: Primary stratification variables +#' \item cat2, cat2_varname, cat2_group: Secondary stratification variables +#' \item chi_age: Single year age +#' \item count: Count of events (births, death, hospitalizations, etc. ) +#' } +#' @param Geographic level for poverty analysis ('blk' or 'zip') +#' +#' @return A data.table containing population processing instructions with columns: +#' \itemize{ +#' \item year: Original year range from input +#' \item cat1, cat1_varname: Primary stratification details +#' \item cat2, cat2_varname: Secondary stratification details +#' \item tab: Visualization tab type +#' \item start, stop: Start and end years parsed from the year range +#' \item race_type: Race categorization type ('race', 'race_eth', or 'race_aic') +#' \item geo_type: Geographic level for analysis ('kc', 'hra', 'region', 'blk', 'zip', 'wa') +#' \item group_by1, group_by2: Demographic grouping specifications +#' } +#' +#' @seealso +#' \code{\link{chi_count_by_age}} which generates the count data used as input +#' to this function +#' +#' \code{\link{chi_get_proper_pop}} which uses the output of this function #' -#' @return A data.table containing population processing instructions #' @importFrom data.table copy `:=` setorder tstrsplit #' @importFrom tools toTitleCase #' @export #' +chi_generate_instructions_pop <- function(mycount.data, + povgeo = c('blk', 'zip')) { + + # Validation of inputs ---- + if (is.null(mycount.data)) { + stop("\n\U1F6D1 mycount.data parameter is required") + } + + if (!is.data.frame(mycount.data)) { + stop("\n\U1F6D1 mycount.data must be a data.frame or data.table") + } else {mycount.data <- setDT(copy(mycount.data))} + + povgeo <- match.arg(povgeo) + + # Initial data preparation ---- + # Create a template with only the necessary columns to avoid duplicates + pop.template <- copy(mycount.data) + pop.template <- unique(copy(pop.template)[, list(year, cat1, cat1_varname, cat2, cat2_varname, tab)]) + + # Process year ranges ---- + # Split year ranges (e.g., "2019-2021") into start and stop years + pop.template[, c("start", "stop") := tstrsplit(year, split = '-')] + # For single years, set the stop year equal to the start year + pop.template[is.na(stop), stop := start] + + # Set default demographic settings ---- + # Default race type includes ethnicity (Hispanic as race) + pop.template[, race_type := 'race_eth'] + + # Handle maternal data prefixes ---- + # Remove "Birthing person's" prefix to standardize maternal data categories + pop.template[grepl("birthing person", cat1, ignore.case = TRUE), + cat1 := tools::toTitleCase(gsub("Birthing person's ", "", cat1))] + pop.template[grepl("birthing person", cat2, ignore.case = TRUE), + cat2 := tools::toTitleCase(gsub("Birthing person's ", "", cat2))] + + # Process geographic types and demographic groupings ---- + # Define OMB AIC (alone or in combination) + omb_aic <- c("chi_race_aic_aian", "chi_race_aic_asian", "chi_race_aic_black", + "chi_race_aic_his", "chi_race_aic_nhpi", "chi_race_aic_wht") + + # Process both primary (cat1) and secondary (cat2) stratification variables + for(catnum in c("1", "2")) { + temp.cat <- paste0("cat", catnum) + temp.varname <- paste0(temp.cat, "_varname") + temp.groupby <- paste0("group_by", catnum) + + # Set geographic type based on category + pop.template[get(temp.cat) == "Cities/neighborhoods", geo_type := "hra"] + + # Set race_type and group_by based on race/ethnicity variable + pop.template[get(temp.varname) == "race3", c("race_type", temp.groupby) := 'race'] + pop.template[get(temp.varname) == "race4", c("race_type", temp.groupby) := 'race_eth'] + pop.template[get(temp.varname) %in% omb_aic, c("race_type", temp.groupby) := 'race_aic'] + + # Filter out non-standard AIC race/ethnicity categories that don't have population data + pop.template <- pop.template[!(grepl('_aic_', get(temp.varname)) & + !get(temp.varname) %in% omb_aic)] + + # Set demographic grouping based on category label + pop.template[get(temp.cat) == "Ethnicity", c("race_type", temp.groupby) := 'race_eth'] + pop.template[get(temp.cat) == "Gender", (temp.groupby) := 'genders'] + pop.template[get(temp.cat) %in% c("Race", "Race/ethnicity") & get(temp.varname) == 'race4', + (temp.groupby) := 'race_eth'] + pop.template[(get(temp.cat) == "Race" & get(temp.varname) == 'race3'), + (temp.groupby) := 'race'] + + # Set geographic type based on regions + pop.template[get(temp.cat) == "Regions" & (is.na(geo_type) | geo_type != 'hra'), + geo_type := "region"] + pop.template[get(temp.cat) == "Big cities", geo_type := "hra"] + pop.template[get(temp.cat) == "Washington State", geo_type := "wa"] + } + + # Handle special geographic cases ---- + # Set geographic type for poverty analysis to block group level by default + pop.template[grepl("poverty$", cat1, ignore.case = TRUE) | + grepl("poverty$", cat2, ignore.case = TRUE), + geo_type := "blk"] + + # Override with zip code level for poverty analysis if specified in povgeo parameter + if(!is.na(povgeo) && povgeo == 'zip') { + pop.template[grepl("poverty$", cat1, ignore.case = TRUE) | + grepl("poverty$", cat2, ignore.case = TRUE), + geo_type := "zip"] + } + + # Special case for combined regions and cities/neighborhoods analysis + pop.template[(cat1 == "Regions" & cat2 == "Cities/neighborhoods") | + (cat2 == "Regions" & cat1 == "Cities/neighborhoods"), + geo_type := "blk"] + + # Set missing or default values ---- + # Replace NA cat2 with "NA" string for consistent processing + pop.template[is.na(cat2), cat2 := "NA"] + + # Set default geographic type as King County when not specified + pop.template[is.na(geo_type), geo_type := 'kc'] -chi_generate_instructions_pop <- function(mycount.data, povgeo = NA){ - pop.template <- copy(mycount.data) - pop.template <- unique(copy(pop.template)[, list(year, cat1, cat1_varname, cat2, cat2_varname, tab)]) - pop.template[, c("start", "stop") := tstrsplit(year, split = '-') ] - pop.template[is.na(stop), stop := start] # need to have an end date even if it is just one year - - pop.template[, race_type := 'race_eth'] # by default has race and OMB 97 with Hispanic as race - - # Drop prefix when using maternal data because do not want to create multiple alternative codings below ---- - pop.template[grepl("birthing person", cat1, ignore.case = T), cat1 := tools::toTitleCase(gsub("Birthing person's ", "", cat1))] - pop.template[grepl("birthing person", cat2, ignore.case = T), cat2 := tools::toTitleCase(gsub("Birthing person's ", "", cat2))] - - # Create geo_type & group_by arguments ---- - omb_aic <- c("chi_race_aic_aian", "chi_race_aic_asian", "chi_race_aic_black", "chi_race_aic_his", "chi_race_aic_nhpi", "chi_race_aic_wht") - - for(catnum in c("1", "2")){ - temp.cat <- paste0("cat", catnum) - pop.template[get(temp.cat) == "Cities/neighborhoods", geo_type := "hra"] - pop.template[get(paste0(temp.cat, "_varname")) == "race3", c("race_type", paste0("group_by", catnum)) := 'race'] - pop.template[get(paste0(temp.cat, "_varname")) == "race4", c("race_type", paste0("group_by", catnum)) := 'race_eth'] - pop.template[get(paste0(temp.cat, "_varname")) %in% omb_aic, c("race_type", paste0("group_by", catnum)) := 'race_aic'] - - # the only AIC race/eth with pop data are the standard OMB categories - pop.template <- pop.template[!(grepl('_aic_', get(paste0(temp.cat, "_varname"))) & !get(paste0(temp.cat, "_varname")) %in% omb_aic)] - - pop.template[get(temp.cat) == "Ethnicity", c("race_type", paste0("group_by", catnum)) := 'race_eth'] - pop.template[get(temp.cat) == "Gender", paste0("group_by", catnum) := 'genders'] - pop.template[get(temp.cat) %in% c("Race", "Race/ethnicity") & get(paste0(temp.cat, "_varname")) == 'race4', - paste0("group_by", catnum) := 'race_eth'] - pop.template[(get(temp.cat) == "Race" & get(paste0(temp.cat, "_varname")) == 'race3') , - paste0("group_by", catnum) := 'race'] - pop.template[get(temp.cat) == "Regions" & (is.na(geo_type) | geo_type != 'hra'), `:=` (geo_type = "region")] - pop.template[get(temp.cat) == "Big cities", `:=` (geo_type = "hra")] - pop.template[get(temp.cat) == "Washington State", `:=` (geo_type = "wa")] - } - - pop.template[grepl("poverty$", cat1, ignore.case = T) | grepl("poverty$", cat2, ignore.case = T), geo_type := "blk"] - if(povgeo == 'zip'){ - pop.template[grepl("poverty$", cat1, ignore.case = T) | grepl("poverty$", cat2, ignore.case = T), geo_type := "zip"] - } - pop.template[(cat1 == "Regions" & cat2 == "Cities/neighborhoods") | - (cat2 == "Regions" & cat1 == "Cities/neighborhoods"), - geo_type := "blk"] - - # the only AIC race/eth with population data are the OMB standard categories - - pop.template[is.na(cat2), cat2 := "NA"] # temporarily set NA to "NA" to facilitate processing with function - - pop.template[is.na(geo_type), geo_type := 'kc'] # when not specified, it is for KC - - pop.template <- unique(pop.template) # because want to minimize the calls to get_popualation to improve speed + # Return final results ---- + # Remove duplicate rows to minimize calls to get_population for efficiency + return(unique(pop.template)) } From 34d7401bf6f8dafd755a935b6fce3e0239bd4506 Mon Sep 17 00:00:00 2001 From: Danny Colombara Date: Fri, 7 Mar 2025 16:11:27 -0800 Subject: [PATCH 04/11] rename proto_chi_generate_instructions_pop - file is now called chi_generate_instructions_pop --- ...enerate_instructions_pop.R => chi_generate_instructions_pop.R} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename R/{proto_chi_generate_instructions_pop.R => chi_generate_instructions_pop.R} (100%) diff --git a/R/proto_chi_generate_instructions_pop.R b/R/chi_generate_instructions_pop.R similarity index 100% rename from R/proto_chi_generate_instructions_pop.R rename to R/chi_generate_instructions_pop.R From 78691b4fc4f87ba2b55069a2d9bd338c24da66e8 Mon Sep 17 00:00:00 2001 From: Danny Colombara Date: Mon, 10 Mar 2025 17:40:56 -0700 Subject: [PATCH 05/11] update / revise chi_get_proper_pop * Improved documentation with detailed Roxygen comments * Added input validation with helpful error messages * Clearer section comments * Replaced tidyr::crossing with data.table * progressr progress tracking * Standardized variable naming * Improved work with categorical variables and crosswalks --- NAMESPACE | 1 + R/chi_generate_instructions_pop.R | 6 +- R/globals.R | 1 + R/proto_chi_get_proper_pop.R | 788 ++++++++++++++++++--------- man/chi_count_by_age.Rd | 2 +- man/chi_generate_instructions_pop.Rd | 42 +- man/chi_get_proper_pop.Rd | 56 ++ 7 files changed, 623 insertions(+), 273 deletions(-) create mode 100644 man/chi_get_proper_pop.Rd diff --git a/NAMESPACE b/NAMESPACE index 698bbab..de2cc8a 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -9,6 +9,7 @@ export(chi_generate_instructions_pop) export(chi_generate_metadata) export(chi_generate_tro_shell) export(chi_get_cols) +export(chi_get_proper_pop) export(chi_get_yaml) export(chi_qa_tro) export(chi_update_sql) diff --git a/R/chi_generate_instructions_pop.R b/R/chi_generate_instructions_pop.R index 4883416..92d74db 100644 --- a/R/chi_generate_instructions_pop.R +++ b/R/chi_generate_instructions_pop.R @@ -1,12 +1,12 @@ #' Generate Population Instructions for CHI Analysis #' #' @description -#' Creates instructions for \link{\code{CHI_get_proper_pop}} based on a table of count +#' Creates instructions for \code{\link{chi_get_proper_pop}} based on a table of count #' data. These instructions configure appropriate demographic groupings, #' geographic types, and time periods for retrieving population denominators used #' in CHI rate calculations. #' -#' @param mycount.data Input data.table produced by \link{\code{chi_count_by_age}}, +#' @param mycount.data Input data.table produced by \code{\link{chi_count_by_age}}, #' containing the following columns: #' \itemize{ #' \item indicator_key: indicator_key used by CHI @@ -17,7 +17,7 @@ #' \item chi_age: Single year age #' \item count: Count of events (births, death, hospitalizations, etc. ) #' } -#' @param Geographic level for poverty analysis ('blk' or 'zip') +#' @param povgeo Geographic level for poverty analysis ('blk' or 'zip') #' #' @return A data.table containing population processing instructions with columns: #' \itemize{ diff --git a/R/globals.R b/R/globals.R index 81642ca..fb2743b 100644 --- a/R/globals.R +++ b/R/globals.R @@ -49,6 +49,7 @@ utils::globalVariables(c( "latest_yearx", "level", "lower_bound", + "mykey", "notable", "numerator", "overall", diff --git a/R/proto_chi_get_proper_pop.R b/R/proto_chi_get_proper_pop.R index dff8d76..2d1fa91 100644 --- a/R/proto_chi_get_proper_pop.R +++ b/R/proto_chi_get_proper_pop.R @@ -1,277 +1,541 @@ -# CHI_get_proper_pop() - function to get population for a single row specified by the output of CHI_generate_instructions_pop() ---- -chi_get_proper_pop <- function(pop.template = NULL, pop.genders = NULL, pop.ages = NULL){ - # check for valid values of pop.genders ---- - if(is.null(pop.genders)){gendery = c("f", "m") - }else{if(!tolower(pop.genders) %in% c('f', 'female', 'm', 'male')){ - stop("\n\U0001f47f if pop.genders is specified it must have one of the following values: 'F', 'f', 'Female', 'female', 'M', 'm', 'Male', or 'male'") - } else {gendery = pop.genders}} - - # check for valid values of pop.ages ---- - if(is.null(pop.ages)){agesy = c(0:100) - }else{if(!is.integer(pop.ages)){ - stop("\n\U0001f47f if pop.ages is specified it must be vector of integers, e.g., c(0:65)") - } else {agesy = pop.ages}} - - # create function to generate the population table corresponding to each row of the pop.template---- - CHI_get_proper_pop_engine <- function(X, pop.template = NULL){ - # Status updates ---- - print(paste0("Process ID ", Sys.getpid(), ": Getting population ", X, " out of ", nrow(pop.template))) - - # Drop prefix when using maternal data because do not want to create multiple alternative codings below ---- - pop.template[grepl("birthing person", cat1, ignore.case = T), cat1 := tools::toTitleCase(gsub("Birthing person's ", "", cat1))] - pop.template[grepl("birthing person", cat2, ignore.case = T), cat2 := tools::toTitleCase(gsub("Birthing person's ", "", cat2))] - - # create the group_by argument ---- - groupy <- unique(c(c("ages", "geo_id"), setdiff(c(pop.template[X, group_by1], pop.template[X, group_by2]), c(NA)))) - - # use rads::get_population ---- - if(is.na(pop.template[X, geo_type])){ - tempy <- rads::get_population(group_by = groupy, - race_type = pop.template[X, race_type], - years = pop.template[X, start]:pop.template[X, stop], - genders = gendery, - ages = agesy, - round = F) - } - if(!is.na(pop.template[X, geo_type])){ - tempy <- rads::get_population(group_by = groupy, - geo_type = pop.template[X, geo_type], - race_type = pop.template[X, race_type], - years = pop.template[X, start]:pop.template[X, stop], - genders = gendery, - ages = agesy, - round = F) - } - - # tidy the population data ---- - for(catnum in c("cat1", "cat2")){ - # misc ---- - tempy[, paste0(catnum) := pop.template[X, get(catnum)]] - tempy[, paste0(catnum, "_varname") := pop.template[X, get(paste0(catnum, "_varname"))]] - - tempy[get(catnum) == "King County", paste0(catnum, "_group") := "King County"] - - tempy[get(catnum) == "Washington State", paste0(catnum, "_group") := "Washington State"] - - suppressWarnings(tempy[get(catnum) == "NA" | is.na(get(catnum)), - c(catnum, paste0(catnum, "_group"), paste0(catnum, "_varname")) := "NA"]) # just a random fill value for NA, which will be changed to true NA later - - tempy[get(catnum) %in% c("Cities/neighborhoods", "Regions") & pop.template[X, geo_type] != 'blk', - paste0(catnum, "_group") := geo_id] - - tempy[get(catnum) %in% c("Gender"), paste0(catnum, "_group") := gender] - - tempy[get(catnum) %in% c("Overall"), paste0(catnum, "_group") := "Overall"] - - - # race/eth ---- - tempy[get(catnum) == "Ethnicity" | get(paste0(catnum, "_varname")) %in% c('race4'), paste0(catnum, "_group") := race_eth] - tempy[get(catnum) == 'Race' & get(paste0(catnum, "_varname")) %in% c('race3'), paste0(catnum, "_group") := race] - tempy[get(paste0(catnum, "_group")) == "Multiple race", paste0(catnum, "_group") := "Multiple"] - tempy <- tempy[get(catnum) != "Ethnicity" | (get(catnum) == "Ethnicity" & get(paste0(catnum, "_group")) == 'Hispanic'), ] - - # race_aic ---- - if(pop.template[X, race_type] == 'race_aic'){ - tempy <- tempy[!(grepl('_aic_', get(paste0(catnum, "_varname"))) & - !((get(paste0(catnum, "_varname")) == 'chi_race_aic_aian' & race_aic == 'AIAN') | - (get(paste0(catnum, "_varname")) == 'chi_race_aic_asian' & race_aic == 'Asian') | - (get(paste0(catnum, "_varname")) == 'chi_race_aic_black' & race_aic == 'Black')| - (get(paste0(catnum, "_varname")) == 'chi_race_aic_his' & race_aic == 'Hispanic') | - (get(paste0(catnum, "_varname")) == 'chi_race_aic_nhpi' & race_aic == 'NHPI') | - (get(paste0(catnum, "_varname")) == 'chi_race_aic_wht' & race_aic == 'White')) - )] - tempy[grep('_aic', get(paste0(catnum, "_varname"))), paste0(catnum, "_group") := race_aic] - } - - # HRAS ---- - if(tempy[1, geo_type] == 'blk' & tempy[1, get(catnum)] == 'Cities/neighborhoods'){ - temp.xwalk <- rads.data::spatial_block20_to_hra20_to_region20[, list(geo_id = GEOID20, hra20_name)] - tempy <- merge(tempy, temp.xwalk, by = "geo_id", all.x = T, all.y = F) - tempy[, paste0(catnum, "_group") := hra20_name] +#' Get Population Denominators for CHI Analysis +#' +#' @description +#' Retrieves population estimates based on instructions generated by +#' \code{\link{chi_generate_instructions_pop}}. This function processes demographic +#' categories, geographic levels, and time periods to create appropriate +#' population denominators for use in CHI rate calculations. +#' +#' @param pop.template A data.table produced by \code{\link{chi_generate_instructions_pop}}, +#' containing instructions for population data retrieval with the following columns: +#' \itemize{ +#' \item year: Year range (e.g., "2019-2021" or single year) +#' \item cat1, cat1_varname: Primary stratification variables +#' \item cat2, cat2_varname: Secondary stratification variables +#' \item tab: Visualization tab type +#' \item start, stop: Start and end years parsed from the year range +#' \item race_type: Race categorization type ('race', 'race_eth', or 'race_aic') +#' \item geo_type: Geographic level for analysis ('kc', 'hra', 'region', 'blk', 'zip', 'wa') +#' \item group_by1, group_by2: Race/eth grouping specifications ('race_eth', 'race') +#' } +#' @param pop.genders Optional character vector specifying which genders to include. +#' Valid values are 'f', 'female', 'm', 'male'. If NULL (default), both genders are included. +#' @param pop.ages Optional integer vector specifying which ages to include. +#' If NULL (default), ages 0-100 are included. +#' +#' @return A data.table containing population counts with columns: +#' \itemize{ +#' \item chi_age: Single year age +#' \item year: Year +#' \item cat1, cat1_varname, cat1_group: Primary stratification variables +#' \item cat2, cat2_varname, cat2_group: Secondary stratification variables +#' \item tab: Visualization tab type +#' \item pop: Population count +#' } +#' +#' @details +#' The function performs multiple steps to generate proper population denominators: +#' 1. Validates input parameters +#' 2. Creates population tables for each row of the template using rads::get_population +#' 3. Handles special cases for various geographic aggregations and crosswalks +#' 4. Returns a comprehensive, tidy dataset with population counts +#' +#' @seealso +#' \code{\link{chi_generate_instructions_pop}} which generates the instructions used as input +#' to this function +#' +#' @importFrom data.table setDT rbindlist `:=` +#' @importFrom future.apply future_lapply +#' @importFrom tools toTitleCase +#' @importFrom progressr handlers progressor with_progress +#' @export +#' +chi_get_proper_pop <- function(pop.template = NULL, pop.genders = NULL, pop.ages = NULL) { + # Validation of arguments ---- + # pop.template + if (is.null(pop.template)) { + stop("\n\U1F6D1 pop.template parameter is required") } - # Regions ---- - if(tempy[1, geo_type] == 'blk' & tempy[1, get(catnum)] == 'Regions'){ - temp.xwalk <- rads.data::spatial_block20_to_hra20_to_region20[, list(geo_id = GEOID20, region_name)] - tempy <- merge(tempy, temp.xwalk, by = 'geo_id', all.x = T, all.y = F) - - tempy[, paste0(catnum, "_group") := region_name] + if (!is.data.frame(pop.template)) { + stop("\n\U1F6D1 pop.template must be a data.frame or data.table") + } else { + pop.template <- setDT(copy(pop.template)) } - if(tempy[1, geo_type] == 'hra' & tempy[1, get(catnum)] == 'Regions'){ - temp.xwalk <- rads.data::spatial_hra20_to_region20[, list(geo_id = hra20_name, region_name)] - tempy <- merge(tempy, temp.xwalk, by = 'geo_id', all.x = T, all.y = F) - - tempy[, paste0(catnum, "_group") := region_name] - } + # Check for required columns + required_columns <- c("year", "cat1", "cat1_varname", "cat2", "cat2_varname", + "start", "stop", "race_type", "geo_type", "tab") + missing_columns <- required_columns[!required_columns %in% names(pop.template)] - if(tempy[1, geo_type] == 'zip' & tempy[1, get(catnum)] == 'Regions'){ - zip_2_region <- rads.data::spatial_zip_to_hra20_pop - zip_2_region <- merge(zip_2_region, - rads.data::spatial_hra20_to_region20[, list(hra20_name, region = region_name)], - by = 'hra20_name', - all = T) - zip_2_region <- zip_2_region[, list(s2t_fraction = sum(s2t_fraction)), # collapse fractions down to region level - list(geo_id = as.character(source_id), region)] - - tempy <- merge(tempy, zip_2_region, by = "geo_id", all.x = T, all.y = F, allow.cartesian = T) - tempy[, pop := pop * s2t_fraction] # calculate weighted pop - tempy[, paste0(catnum, "_group") := region] + if (length(missing_columns) > 0) { + stop("\n\U1F6D1 pop.template is missing required columns: ", + paste(missing_columns, collapse = ", ")) } - # Big Cities ---- - if(tempy[1, get(catnum)] == 'Big cities'){ - if(tempy[1, geo_type] == 'blk'){ - blk20_hra20 <- rads.data::spatial_block20_to_hra20_to_region20[, list(geo_id = GEOID20, hra20_name)] - tempy <- merge(tempy, blk20_hra20, by = "geo_id", all.x = T, all.y = F) - hra20_bigcity <- rads.data::spatial_hra20_to_bigcities[, list(hra20_name, bigcity)] - tempy <- merge(tempy, hra20_bigcity, by = 'hra20_name', all.x = T, all.y = F) + # pop.genders + if (is.null(pop.genders)) { + gender_values <- c("f", "m") + } else { + if (!tolower(pop.genders) %in% c('f', 'female', 'm', 'male')) { + stop("\n\U0001f47f if pop.genders is specified, it is limited to one of the following values: 'F', 'f', 'Female', 'female', 'M', 'm', 'Male', or 'male'") + } else { + gender_values <- pop.genders } - if(tempy[1, geo_type] == 'hra'){ - hra20_bigcity <- rads.data::spatial_hra20_to_bigcities[, list(hra20_name, bigcity)] - tempy <- merge(tempy, hra20_bigcity, by.x = 'geo_id', by.y = 'hra20_name', all.x = T, all.y = F) - } - tempy[, paste0(catnum, "_group") := bigcity] - } - - # age6 ---- - tempy[get(paste0(catnum, "_varname")) == "age6" & age %in% 0:17, paste0(catnum, "_group") := "<18"] - tempy[get(paste0(catnum, "_varname")) == "age6" & age %in% 18:24, paste0(catnum, "_group") := "18-24"] - tempy[get(paste0(catnum, "_varname")) == "age6" & age %in% 25:44, paste0(catnum, "_group") := "25-44"] - tempy[get(paste0(catnum, "_varname")) == "age6" & age %in% 45:64, paste0(catnum, "_group") := "45-64"] - tempy[get(paste0(catnum, "_varname")) == "age6" & age %in% 65:74, paste0(catnum, "_group") := "65-74"] - tempy[get(paste0(catnum, "_varname")) == "age6" & age >= 75, paste0(catnum, "_group") := "75+"] - - # mage5 ---- - tempy[get(paste0(catnum, "_varname")) == "mage5" & age %in% 10:17, paste0(catnum, "_group") := "10-17"] - tempy[get(paste0(catnum, "_varname")) == "mage5" & age %in% 18:24, paste0(catnum, "_group") := "18-24"] - tempy[get(paste0(catnum, "_varname")) == "mage5" & age %in% 25:34, paste0(catnum, "_group") := "25-34"] - tempy[get(paste0(catnum, "_varname")) == "mage5" & age %in% 35:44, paste0(catnum, "_group") := "35-44"] - tempy[get(paste0(catnum, "_varname")) == "mage5" & age >=45, paste0(catnum, "_group") := "45+"] - - # yage4 ---- - tempy[get(paste0(catnum, "_varname")) == "yage4" & age %in% 0:4, paste0(catnum, "_group") := "0-4"] - tempy[get(paste0(catnum, "_varname")) == "yage4" & age %in% 5:9, paste0(catnum, "_group") := "5-9"] - tempy[get(paste0(catnum, "_varname")) == "yage4" & age %in% 10:14, paste0(catnum, "_group") := "10-14"] - tempy[get(paste0(catnum, "_varname")) == "yage4" & age %in% 15:17, paste0(catnum, "_group") := "15-17"] - - # pov200grp ---- - if(tempy[1, geo_type] == 'blk' & grepl("poverty$", tempy[1, get(catnum)], ignore.case = T)){ - tempy[, geo_tract2020 := substr(geo_id, 1, 11)] # have blocks (15 char), so keep first 11 for tracts - tempy <- merge(tempy, - rads.data::misc_poverty_groups[geo_type=='Tract'][, list(geo_tract2020 = geo_id, pov200grp)], - by = "geo_tract2020", - all.x = T, - all.y = F) - tempy[, paste0(catnum, "_group") := pov200grp] } - if( tempy[1, geo_type] == 'zip' & grepl("poverty$", tempy[1, get(catnum)], ignore.case = T)){ - tempy <- merge(tempy, - rads.data::misc_poverty_groups[geo_type=='ZCTA'][, list(geo_id, pov200grp)], - by = 'geo_id', - all.x = T, - all.y = F) - tempy[, paste0(catnum, "_group") := pov200grp] + # pop.ages + if (is.null(pop.ages)) { + age_values <- c(0:100) + } else { + if (!is.integer(pop.ages)) { + stop("\n\U0001f47f if pop.ages is specified it must be vector of integers, e.g., c(0:65)") + } else { + age_values <- pop.ages + } } - } - - # Drop if is.na(cat1_group) - tempy <- tempy[!is.na(cat1_group)] - - # drop if is.na(cat2_group) - tempy <- tempy[!(cat2 != 'NA' & (cat2_group == 'NA') | is.na(cat2_group))] # did not yet switch back to true NA at this point - - # collapse to one row per demographic combination and keep minimum needed columns ---- - tempy <- tempy[, list(pop = sum(pop)), list(chi_age = age, year, cat1, cat1_varname, cat1_group, cat2, cat2_varname, cat2_group)] - - # ensure each demographic has rows for all relevant ages & only relevant ages ---- - if(tempy[1]$cat1 == "Age"){ - if(tempy[1]$cat1_varname == 'age6'){ - tempage <- data.table(cat1 = "Age", cat1_varname = "age6", chi_age = 0:100) - tempage[, cat1_group := cut(chi_age, - breaks = c(-1, 17, 24, 44, 64, 74, 120), - labels = c("<18", "18-24", "25-44", "45-64", "65-74", "75+"))]} - - if(tempy[1]$cat1_varname == 'mage5'){ - tempage <- data.table(cat1 = "Age", cat1_varname = "mage5", chi_age = 10:100) - tempage[, cat1_group := cut(chi_age, - breaks = c(9, 17, 24, 34, 44, 120), - labels = c('10-17', '18-24', '25-34', '35-44', '45+'))]} - - if(tempy[1]$cat1_varname == 'yage4'){ - tempage <- data.table(cat1 = "Age", cat1_varname = "yage4", chi_age = 0:17) - tempage[, cat1_group := cut(chi_age, - breaks = c(-1, 4, 9, 14, 17), - labels = c('0-4', '5-9', '10-14', '15-17'))]} - - temp.demog <- setDT(tidyr::crossing(unique(tempy[, list(year = as.character(year), cat2, cat2_varname, cat2_group)]), - tempage)) - } - - if(tempy[1]$cat2 == "Age"){ - if(tempy[1]$cat2_varname == 'age6'){ - tempage <- data.table(cat2 = "Age", cat2_varname = "age6", chi_age = 0:100) - tempage[, cat2_group := cut(chi_age, - breaks = c(-1, 17, 24, 44, 64, 74, 120), - labels = c("<18", "18-24", "25-44", "45-64", "65-74", "75+"))]} - - if(tempy[1]$cat2_varname == 'mage5'){ - tempage <- data.table(cat2 = "Age", cat2_varname = "mage5", chi_age = 10:100) - tempage[, cat2_group := cut(chi_age, - breaks = c(9, 17, 24, 34, 44, 120), - labels = c('10-17', '18-24', '25-34', '35-44', '45+'))]} - - if(tempy[1]$cat2_varname == 'yage4'){ - tempage <- data.table(cat2 = "Age", cat2_varname = "yage4", chi_age = 0:17) - tempage[, cat2_group := cut(chi_age, - breaks = c(-1, 4, 9, 14, 17), - labels = c('0-4', '5-9', '10-14', '15-17'))]} - - temp.demog <- setDT(tidyr::crossing(unique(tempy[, list(year = as.character(year), cat1, cat1_varname, cat1_group)]), - tempage)) - } + # Define core population extraction function ---- + get_population_for_template_row <- function(row_index, pop.template = NULL) { + # Status update with progress indicator ---- + print(paste0("Process ID ", Sys.getpid(), ": Getting population ", row_index, " out of ", nrow(pop.template))) + + # Extract current template row ---- + current_row <- pop.template[row_index, ] + + # Standardize category names ---- + # Remove birthing person prefixes to standardize maternal data categories + pop.template[grepl("birthing person", cat1, ignore.case = TRUE), + cat1 := tools::toTitleCase(gsub("Birthing person's ", "", cat1))] + pop.template[grepl("birthing person", cat2, ignore.case = TRUE), + cat2 := tools::toTitleCase(gsub("Birthing person's ", "", cat2))] + + # Build grouping parameters for pop query ---- + grouping_vars <- unique(c( + c("ages", "geo_id"), + setdiff(c(current_row$group_by1, current_row$group_by2), c(NA)) + )) + + # Use rads::get_population() ---- + if (is.na(current_row$geo_type)) { + population_data <- rads::get_population( + group_by = grouping_vars, + race_type = current_row$race_type, + years = current_row$start:current_row$stop, + genders = gender_values, + ages = age_values, + round = FALSE + ) + } else { + population_data <- rads::get_population( + group_by = grouping_vars, + geo_type = current_row$geo_type, + race_type = current_row$race_type, + years = current_row$start:current_row$stop, + genders = gender_values, + ages = age_values, + round = FALSE + ) + } - if(!"Age" %in% unique(c(tempy$cat1, tempy$cat2))){ - # all combinations of cat1 x cat2 - temp.demog <- setDT(tidyr::crossing( - unique(tempy[, list(cat1, cat1_varname, cat1_group)]), - unique(tempy[, list(cat2, cat2_varname, cat2_group)]) - )) - # all combination of cat table with year and age - temp.demog <- setDT(tidyr::crossing( - temp.demog, - data.table(year = as.character(pop.template[X, ]$year), chi_age = 0:100) - )) + # Process demographic categories ---- + # Apply category grouping logic for both primary (cat1) and secondary (cat2) categories + for (catnum in c("1", "2")) { + ## Add category information from template to result ---- + temp.cat <- paste0("cat", catnum) + temp.varname <- paste0("cat", catnum, '_varname') + temp.group <- paste0("cat", catnum, '_group') + temp.groupby <- paste0("group_by", catnum) + + # had to use set function because regular := syntax caused errors b/c used temp.cat differently on both sides of := + data.table::set(population_data, + j = temp.cat, + value = current_row[[temp.cat]]) + + data.table::set(population_data, + j = temp.varname, + value = current_row[[temp.varname]]) + + data.table::set(population_data, + j = temp.group, + value = current_row[[temp.groupby]]) + + ## Process geographic categories ---- + # King County + population_data[get(temp.cat) == "King County", + c(temp.group) := "King County"] + + # Washington State + population_data[get(temp.cat) == "Washington State", + c(temp.group) := "Washington State"] + + # Handle NA values + suppressWarnings( + population_data[get(temp.cat) == "NA" | is.na(get(temp.cat)), + c(temp.cat, temp.group, temp.varname) := "NA"] + ) + + # Cities/neighborhoods and Regions + population_data[get(temp.cat) %in% c("Cities/neighborhoods", "Regions") & + current_row$geo_type != 'blk', + c(temp.group) := geo_id] + + ## Process gender ---- + population_data[get(temp.cat) %in% c("Gender"), c(temp.group) := gender] + + ## Process 'Overall' ---- + population_data[get(temp.cat) %in% c("Overall"), c(temp.group) := "Overall"] + + ## Process race/ethnicity categories ---- + population_data[get(temp.cat) == "Ethnicity" | get(temp.varname) %in% c('race4'), + c(temp.group) := race_eth] + + population_data[get(temp.cat) == 'Race' & get(temp.varname) %in% c('race3'), + c(temp.group) := race] + + population_data <- population_data[get(temp.cat) != "Ethnicity" | (get(temp.cat) == "Ethnicity" & get(temp.group) == 'Hispanic'), ] + + population_data[get(temp.group) == "Multiple race", + c(temp.group) := "Multiple"] + + ## Process race_aic (alone or in combination) categories ---- + if (current_row$race_type == 'race_aic') { + # Filter to keep only relevant race_aic combinations + population_data <- population_data[ + !(grepl('_aic_', get(temp.varname)) & + !((get(temp.varname) == 'chi_race_aic_aian' & race_aic == 'AIAN') | + (get(temp.varname) == 'chi_race_aic_asian' & race_aic == 'Asian') | + (get(temp.varname) == 'chi_race_aic_black' & race_aic == 'Black') | + (get(temp.varname) == 'chi_race_aic_his' & race_aic == 'Hispanic') | + (get(temp.varname) == 'chi_race_aic_nhpi' & race_aic == 'NHPI') | + (get(temp.varname) == 'chi_race_aic_wht' & race_aic == 'White')) + ) + ] + + # Assign race_aic value to group + population_data[grep('_aic', get(temp.varname)), + c(temp.group) := race_aic] + } + + ## Process HRAs ---- + if (population_data[1, geo_type] == 'blk' & population_data[1, get(temp.cat)] == 'Cities/neighborhoods') { + + hra_crosswalk <- rads.data::spatial_block20_to_hra20_to_region20[, list(geo_id = GEOID20, hra20_name)] + + population_data <- merge(population_data, + hra_crosswalk, + by = "geo_id", + all.x = TRUE, + all.y = FALSE) + + population_data[, c(temp.group) := hra20_name] + } + + ## Process Regions ---- + # Block to Region crosswalk + if (population_data[1, geo_type] == 'blk' & population_data[1, get(temp.cat)] == 'Regions') { + + region_crosswalk <- rads.data::spatial_block20_to_hra20_to_region20[, list(geo_id = GEOID20, region_name)] + + population_data <- merge(population_data, + region_crosswalk, + by = 'geo_id', + all.x = TRUE, + all.y = FALSE) + + population_data[, c(temp.group) := region_name] + } + + # HRA to Region crosswalk + if (population_data[1, geo_type] == 'hra' & population_data[1, get(temp.cat)] == 'Regions') { + + region_crosswalk <- rads.data::spatial_hra20_to_region20[, list(geo_id = hra20_name, region_name)] + + population_data <- merge(population_data, + region_crosswalk, + by = 'geo_id', + all.x = TRUE, + all.y = FALSE) + + population_data[, c(temp.group) := region_name] + } + + # ZIP to Region crosswalk with population weighting + if (population_data[1, geo_type] == 'zip' & population_data[1, get(temp.cat)] == 'Regions') { + + # Create ZIP to region crosswalk with population weights + zip_region_crosswalk <- rads.data::spatial_zip_to_hra20_pop + + zip_region_crosswalk <- merge(zip_region_crosswalk, + rads.data::spatial_hra20_to_region20[, list(hra20_name, region = region_name)], + by = 'hra20_name', + all = TRUE) + + # Aggregate fractional populations to region level + zip_region_crosswalk <- zip_region_crosswalk[,list(s2t_fraction = sum(s2t_fraction)), + list(geo_id = as.character(source_id), region)] + + # Apply population weighting by region + population_data <- merge(population_data, + zip_region_crosswalk, + by = "geo_id", + all.x = TRUE, + all.y = FALSE, + allow.cartesian = TRUE) + population_data[, pop := pop * s2t_fraction] # Apply weight to population + population_data[, c(temp.group) := region] + } + + ## Process Big Cities ---- + if (population_data[1, get(temp.cat)] == 'Big cities') { + # Block to big city crosswalk + if (population_data[1, geo_type] == 'blk') { + + # Two-step crosswalk: block to HRA to big city + block_to_hra <- rads.data::spatial_block20_to_hra20_to_region20[, list(geo_id = GEOID20, hra20_name)] + hra_to_bigcity <- rads.data::spatial_hra20_to_bigcities[, list(hra20_name, bigcity)] + + block_to_bigcity <- merge(hra_to_bigcity, + block_to_hra, + by = 'hra20_name', + all.x = T, + all.y = F)[, hra20_name := NULL] + + population_data <- merge(population_data, + block_to_bigcity, + by = "geo_id", + all.x = TRUE, + all.y = FALSE) + } + + # HRA to big city crosswalk + if (population_data[1, geo_type] == 'hra') { + hra_to_bigcity <- rads.data::spatial_hra20_to_bigcities[, list(hra20_name, bigcity)] + population_data <- merge(population_data, + hra_to_bigcity, + by.x = 'geo_id', + by.y = 'hra20_name', + all.x = TRUE, + all.y = FALSE) + } + + # Assign big city name to group + population_data[, c(temp.group) := bigcity] + } + + ## Process age groupings ---- + # Age 6 groups: <18, 18-24, 25-44, 45-64, 65-74, 75+ + population_data[get(temp.varname) == "age6" & age %in% 0:17, + c(temp.group) := "<18"] + population_data[get(temp.varname) == "age6" & age %in% 18:24, + c(temp.group) := "18-24"] + population_data[get(temp.varname) == "age6" & age %in% 25:44, + c(temp.group) := "25-44"] + population_data[get(temp.varname) == "age6" & age %in% 45:64, + c(temp.group) := "45-64"] + population_data[get(temp.varname) == "age6" & age %in% 65:74, + c(temp.group) := "65-74"] + population_data[get(temp.varname) == "age6" & age >= 75, + c(temp.group) := "75+"] + + # Maternal age 5 groups: 10-17, 18-24, 25-34, 35-44, 45+ + population_data[get(temp.varname) == "mage5" & age %in% 10:17, + c(temp.group) := "10-17"] + population_data[get(temp.varname) == "mage5" & age %in% 18:24, + c(temp.group) := "18-24"] + population_data[get(temp.varname) == "mage5" & age %in% 25:34, + c(temp.group) := "25-34"] + population_data[get(temp.varname) == "mage5" & age %in% 35:44, + c(temp.group) := "35-44"] + population_data[get(temp.varname) == "mage5" & age >= 45, + c(temp.group) := "45+"] + + # Youth age 4 groups: 0-4, 5-9, 10-14, 15-17 + population_data[get(temp.varname) == "yage4" & age %in% 0:4, + c(temp.group) := "0-4"] + population_data[get(temp.varname) == "yage4" & age %in% 5:9, + c(temp.group) := "5-9"] + population_data[get(temp.varname) == "yage4" & age %in% 10:14, + c(temp.group) := "10-14"] + population_data[get(temp.varname) == "yage4" & age %in% 15:17, + c(temp.group) := "15-17"] + + ## Process poverty groupings ---- + # Block level poverty + if (population_data[1, geo_type] == 'blk' & + grepl("poverty$", population_data[1, get(temp.cat)], ignore.case = TRUE)) { + # Extract tract ID from block ID (first 11 characters) + population_data[, geo_tract2020 := substr(geo_id, 1, 11)] + + # Join poverty group data + population_data <- merge( + population_data, + rads.data::misc_poverty_groups[geo_type == 'Tract'][, list(geo_tract2020 = geo_id, pov200grp)], + by = "geo_tract2020", + all.x = TRUE, + all.y = FALSE + ) + + # Assign poverty group + population_data[, c(temp.group) := pov200grp] + } + + # ZIP level poverty + if (population_data[1, geo_type] == 'zip' & + grepl("poverty$", population_data[1, get(temp.cat)], ignore.case = TRUE)) { + # Join poverty group data + population_data <- merge( + population_data, + rads.data::misc_poverty_groups[geo_type == 'ZCTA'][, list(geo_id, pov200grp)], + by = 'geo_id', + all.x = TRUE, + all.y = FALSE + ) + + # Assign poverty group + population_data[, c(temp.group) := pov200grp] + } + + } # close looping over cat1/cat2 + + # Filter and clean results ---- + # Remove rows with missing primary category group + population_data <- population_data[!is.na(cat1_group)] + + # Remove rows with missing secondary category group (when category exists) + population_data <- population_data[!is.na(cat2) | cat2 == 'NA' | (!is.na(cat2_group) & cat2_group != 'NA')] + + # Aggregate population data ---- + # Collapse to one row per demographic combination with sum of population + population_data <- population_data[, list(pop = sum(pop)), + list(chi_age = age, + year, + cat1, cat1_varname, cat1_group, + cat2, cat2_varname, cat2_group)] + + # Generate complete demographic combinations ---- + # Function to handle age group processing and demographic merging + process_age_category <- function(population_data, cat_num) { + # Define prefix and complementary prefix + cat_prefix <- paste0("cat", cat_num) + other_cat_prefix <- if(cat_num == 1) "cat2" else "cat1" # need to select the cat that does not have the age var + + # Get the variable name for this category + cat_varname <- population_data[1][[paste0(cat_prefix, "_varname")]] + + # Create appropriate age groups based on cat_varname + if (cat_varname == 'age6') { + age_groups <- data.table( + chi_age = 0:100 + ) + age_groups[, paste0(cat_prefix, "_group") := cut( + chi_age, + breaks = c(-1, 17, 24, 44, 64, 74, 120), + labels = c("<18", "18-24", "25-44", "45-64", "65-74", "75+") + )] + } else if (cat_varname == 'mage5') { + age_groups <- data.table( + chi_age = 10:100 + ) + age_groups[, paste0(cat_prefix, "_group") := cut( + chi_age, + breaks = c(9, 17, 24, 34, 44, 120), + labels = c('10-17', '18-24', '25-34', '35-44', '45+') + )] + } else if (cat_varname == 'yage4') { + age_groups <- data.table( + chi_age = 0:17 + ) + age_groups[, paste0(cat_prefix, "_group") := cut( + chi_age, + breaks = c(-1, 4, 9, 14, 17), + labels = c('0-4', '5-9', '10-14', '15-17') + )] + } + + # Add category info + age_groups[, (cat_prefix) := "Age"] + age_groups[, paste0(cat_prefix, "_varname") := cat_varname] + + # Combine demographic dimensions with age groups + cols_to_select <- c("year", other_cat_prefix, + paste0(other_cat_prefix, "_varname"), + paste0(other_cat_prefix, "_group")) + + unique_pop_data <- unique(population_data[, cols_to_select, with = FALSE][, key := 1]) + + complete_demographics <- unique_pop_data[age_groups[, key := 1], on = "key", allow.cartesian = TRUE] + complete_demographics[, key := NULL] + + return(complete_demographics) + } + + # Use function to handle age group processing + if (population_data[1]$cat1 == "Age") { + complete_demographics <- process_age_category(population_data, 1) + } + + if (population_data[1]$cat2 == "Age") { + complete_demographics <- process_age_category(population_data, 2) + } + + if (population_data[1]$cat1 != "Age" & population_data[1]$cat2 != "Age"){ + # Get unique cat1 groups + cat1_groups <- unique(population_data[, list(cat1, cat1_varname, cat1_group, mykey = 1)]) + + # Get unique cat2 groups + cat2_groups <- unique(population_data[, list(cat2, cat2_varname, cat2_group, mykey = 1)]) + + # All combos of cat1 and cat2 groups + complete_demographics <- cat1_groups[cat2_groups, on = "mykey", allow.cartesian = TRUE] + + # Create year and age combos + year_age <- data.table(year = as.character(current_row$year), chi_age = 0:100, mykey = 1) + + # Get combos for each year/age cat1/cat2 combo + complete_demographics <- complete_demographics[year_age, on = "mykey", allow.cartesian = TRUE] + + # Drop key + complete_demographics[, mykey := NULL] + } + + + + # Merge population data with complete demographics grid ---- + population_data <- suppressWarnings(merge(population_data, complete_demographics, all = TRUE)) + + population_data[is.na(pop), pop := 0] # Fill missing population values with zero + + # Add tab column and finalize ---- + population_data[, tab := current_row$tab] + + # Convert placeholder "NA" strings back to true NA values ---- + population_data[cat2 == "NA", c("cat2", "cat2_varname", "cat2_group") := NA] + + # Return completed population dataset ---- + return(population_data) } - tempy <- suppressWarnings(merge(tempy, temp.demog, all = T)) - tempy[is.na(pop), pop := 0] - - # create combinations of cat1_group and cat2_group that have no population and set == 0 ---- - # no need with current get_population function, but keep as a placeholder / reminder - - # add tab column ---- - tempy[, tab := pop.template[X, tab]] - - # tidy when is.na(cat2) ---- - tempy[cat2 == "NA", c("cat2", "cat2_varname", "cat2_group") := NA] - - # return object ---- - return(tempy) - } - - # use lapply to cycle over each rows and create one big final dataset ---- - tempy.allpop <- rbindlist( - future_lapply(X = as.list(seq(1, nrow(pop.template))), - FUN = function(X){ - set.seed(98104) # another attempt to set a seed - CHI_get_proper_pop_engine(X, pop.template = pop.template)}, - future.seed = 98104) - ) - - tempy.allpop <- unique(tempy.allpop) - - # return object ---- - return(tempy.allpop) + # Process all template rows in parallel ---- + # Combine results from all template rows using future_lapply for parallel processing + progressr::handlers(handler_progress()) + progressr::with_progress({ + p <- progressr::progressor(nrow(pop.template)) + all_population_data <- rbindlist( + future_lapply( + X = as.list(seq(1, nrow(pop.template))), + FUN = function(row_index) { + p(paste0("Processing row ", row_index, " of ", nrow(pop.template) )) + set.seed(98104) # Set consistent seed for reproducibility + get_population_for_template_row(row_index, pop.template = pop.template) + }, + future.seed = 98104 # Set seed for parallel processes + ) + ) + }) # closet progressr::with_progress() + + # Remove duplicate rows in final dataset + all_population_data <- unique(all_population_data) + + # Return final population dataset ---- + return(all_population_data) } diff --git a/man/chi_count_by_age.Rd b/man/chi_count_by_age.Rd index d5645c5..109c7a5 100644 --- a/man/chi_count_by_age.Rd +++ b/man/chi_count_by_age.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/proto_chi_count_by_age.R +% Please edit documentation in R/chi_count_by_age.R \name{chi_count_by_age} \alias{chi_count_by_age} \title{Generate Age-Specific Counts for Community Health Indicators} diff --git a/man/chi_generate_instructions_pop.Rd b/man/chi_generate_instructions_pop.Rd index 90c6afa..1485bf6 100644 --- a/man/chi_generate_instructions_pop.Rd +++ b/man/chi_generate_instructions_pop.Rd @@ -1,20 +1,48 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/proto_chi_generate_instructions_pop.R +% Please edit documentation in R/chi_generate_instructions_pop.R \name{chi_generate_instructions_pop} \alias{chi_generate_instructions_pop} \title{Generate Population Instructions for CHI Analysis} \usage{ -chi_generate_instructions_pop(mycount.data, povgeo = NA) +chi_generate_instructions_pop(mycount.data, povgeo = c("blk", "zip")) } \arguments{ -\item{mycount.data}{Input data.table containing count data specifications} +\item{mycount.data}{Input data.table produced by \code{\link{chi_count_by_age}}, +containing the following columns: + \itemize{ + \item indicator_key: indicator_key used by CHI + \item year: Year range (e.g., "2019-2021" or single year) + \item tab: Visualization tab type + \item cat1, cat1_varname, cat1_group: Primary stratification variables + \item cat2, cat2_varname, cat2_group: Secondary stratification variables + \item chi_age: Single year age + \item count: Count of events (births, death, hospitalizations, etc. ) + }} -\item{povgeo}{Geographic level for poverty analysis (NA or 'zip')} +\item{povgeo}{Geographic level for poverty analysis ('blk' or 'zip')} } \value{ -A data.table containing population processing instructions +A data.table containing population processing instructions with columns: + \itemize{ + \item year: Original year range from input + \item cat1, cat1_varname: Primary stratification details + \item cat2, cat2_varname: Secondary stratification details + \item tab: Visualization tab type + \item start, stop: Start and end years parsed from the year range + \item race_type: Race categorization type ('race', 'race_eth', or 'race_aic') + \item geo_type: Geographic level for analysis ('kc', 'hra', 'region', 'blk', 'zip', 'wa') + \item group_by1, group_by2: Demographic grouping specifications + } } \description{ -Creates a instructions for rads::get_population() based on count data -specifications. Handles various geographic types and demographic groupings. +Creates instructions for \code{\link{chi_get_proper_pop}} based on a table of count +data. These instructions configure appropriate demographic groupings, +geographic types, and time periods for retrieving population denominators used +in CHI rate calculations. +} +\seealso{ +\code{\link{chi_count_by_age}} which generates the count data used as input +to this function + +\code{\link{chi_get_proper_pop}} which uses the output of this function } diff --git a/man/chi_get_proper_pop.Rd b/man/chi_get_proper_pop.Rd new file mode 100644 index 0000000..1a7a292 --- /dev/null +++ b/man/chi_get_proper_pop.Rd @@ -0,0 +1,56 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/proto_chi_get_proper_pop.R +\name{chi_get_proper_pop} +\alias{chi_get_proper_pop} +\title{Get Population Denominators for CHI Analysis} +\usage{ +chi_get_proper_pop(pop.template = NULL, pop.genders = NULL, pop.ages = NULL) +} +\arguments{ +\item{pop.template}{A data.table produced by \code{\link{chi_generate_instructions_pop}}, +containing instructions for population data retrieval with the following columns: + \itemize{ + \item year: Year range (e.g., "2019-2021" or single year) + \item cat1, cat1_varname: Primary stratification variables + \item cat2, cat2_varname: Secondary stratification variables + \item tab: Visualization tab type + \item start, stop: Start and end years parsed from the year range + \item race_type: Race categorization type ('race', 'race_eth', or 'race_aic') + \item geo_type: Geographic level for analysis ('kc', 'hra', 'region', 'blk', 'zip', 'wa') + \item group_by1, group_by2: Race/eth grouping specifications ('race_eth', 'race') + }} + +\item{pop.genders}{Optional character vector specifying which genders to include. +Valid values are 'f', 'female', 'm', 'male'. If NULL (default), both genders are included.} + +\item{pop.ages}{Optional integer vector specifying which ages to include. +If NULL (default), ages 0-100 are included.} +} +\value{ +A data.table containing population counts with columns: + \itemize{ + \item chi_age: Single year age + \item year: Year + \item cat1, cat1_varname, cat1_group: Primary stratification variables + \item cat2, cat2_varname, cat2_group: Secondary stratification variables + \item tab: Visualization tab type + \item pop: Population count + } +} +\description{ +Retrieves population estimates based on instructions generated by +\code{\link{chi_generate_instructions_pop}}. This function processes demographic +categories, geographic levels, and time periods to create appropriate +population denominators for use in CHI rate calculations. +} +\details{ +The function performs multiple steps to generate proper population denominators: +1. Validates input parameters +2. Creates population tables for each row of the template using rads::get_population +3. Handles special cases for various geographic aggregations and crosswalks +4. Returns a comprehensive, tidy dataset with population counts +} +\seealso{ +\code{\link{chi_generate_instructions_pop}} which generates the instructions used as input +to this function +} From 02db414e6b81e86e7fa6a7220af68a20d94363a1 Mon Sep 17 00:00:00 2001 From: Danny Colombara Date: Tue, 11 Mar 2025 09:31:07 -0700 Subject: [PATCH 06/11] FIXED error for dropping missing cat2_group - previous logic for dropping when cat2_group was NA / 'NA' and cat1 group NOT NA / 'NA' was flawed --- R/proto_chi_get_proper_pop.R | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/R/proto_chi_get_proper_pop.R b/R/proto_chi_get_proper_pop.R index 2d1fa91..ad94e13 100644 --- a/R/proto_chi_get_proper_pop.R +++ b/R/proto_chi_get_proper_pop.R @@ -402,7 +402,8 @@ chi_get_proper_pop <- function(pop.template = NULL, pop.genders = NULL, pop.ages population_data <- population_data[!is.na(cat1_group)] # Remove rows with missing secondary category group (when category exists) - population_data <- population_data[!is.na(cat2) | cat2 == 'NA' | (!is.na(cat2_group) & cat2_group != 'NA')] + population_data <- population_data[is.na(cat2) | cat2 == 'NA' | + (!is.na(cat2) & cat2 != 'NA' & !is.na(cat2_group) & cat2_group != 'NA'),] # Aggregate population data ---- # Collapse to one row per demographic combination with sum of population From f395df3b775b8744f32949469e536508bf235361 Mon Sep 17 00:00:00 2001 From: Danny Colombara Date: Tue, 11 Mar 2025 09:32:26 -0700 Subject: [PATCH 07/11] proto_chi_get_proper_pop > chi_get_proper_pop - just rename because now fully functional --- R/{proto_chi_get_proper_pop.R => chi_get_proper_pop.R} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename R/{proto_chi_get_proper_pop.R => chi_get_proper_pop.R} (100%) diff --git a/R/proto_chi_get_proper_pop.R b/R/chi_get_proper_pop.R similarity index 100% rename from R/proto_chi_get_proper_pop.R rename to R/chi_get_proper_pop.R From 39aafe138b0f5fa264c92fb710f3b0b35b3476c8 Mon Sep 17 00:00:00 2001 From: Danny Colombara Date: Tue, 11 Mar 2025 14:05:28 -0700 Subject: [PATCH 08/11] Improved proto_chi_drop_illogical_ages header --- R/globals.R | 2 + R/proto_chi_drop_illogical_ages.R | 111 +++++++++++++++++++++++++----- man/chi_drop_illogical_ages.Rd | 47 ++++++++++--- 3 files changed, 136 insertions(+), 24 deletions(-) diff --git a/R/globals.R b/R/globals.R index fb2743b..72a30e3 100644 --- a/R/globals.R +++ b/R/globals.R @@ -49,6 +49,8 @@ utils::globalVariables(c( "latest_yearx", "level", "lower_bound", + "min_age", + "max_age", "mykey", "notable", "numerator", diff --git a/R/proto_chi_drop_illogical_ages.R b/R/proto_chi_drop_illogical_ages.R index 84a10cf..fdaa403 100644 --- a/R/proto_chi_drop_illogical_ages.R +++ b/R/proto_chi_drop_illogical_ages.R @@ -1,27 +1,106 @@ -#' Drop Illogical Age Combinations from CHI Data +#' Filter Out Age Values That Don't Match Their Corresponding Age Group Categories #' #' @description -#' Removes age combinations that don't make logical sense based on category -#' groupings. For example, removes cases where the age falls outside the range -#' specified by age group categories. +#' This function filters a data.table to remove rows with inconsistent age values. +#' It compares the single year age value (specified by \code{agevar}) against age +#' ranges defined in \code{cat1_group} and \code{cat2_group} columns, keeping only +#' those rows where: +#' \itemize{ +#' \item The categorical variable is not age-related, OR +#' \item The age value falls within the range specified by the corresponding age group +#' } #' -#' @param ph.data Input data.table to process -#' @param agevar Name of the age variable (defaults to 'chi_age') +#' Age groups are expected to be in formats like "10-17", "<5", or "45+", which +#' the function automatically parses into numeric ranges. +#' +#' @param ph.data A data.table or data.frame containing category and age data to be filtered. +#' @param agevar Character string specifying the name of the age variable column. +#' +#' Default: \code{agevar = 'chi_age'} +#' +#' @return A filtered data.table with only logically consistent age values +#' +#' @details +#' The function interprets special formats in age group strings: +#' \itemize{ +#' \item "<1" is treated as age "0-0" (age zero) +#' \item " 0) { + stop(paste("The following required columns are missing:", + paste(missing_cols, collapse = ", "))) + } -chi_drop_illogical_ages <- function(ph.data, agevar = 'chi_age'){ - ph.data = copy(ph.data) - for(CatNum in c("cat1", "cat2")){ - ph.data[, paste0(CatNum, '_group_temp') := fcase(get(paste0(CatNum, "_group")) == '<1', '0-0', # <1 is special! - get(CatNum) %in% c("Age", "Birthing person's age"), gsub("<", "0-", gsub("\\+", "-120", get(paste0(CatNum, '_group')))))] - ph.data[, AgeMin := gsub("-.*", "", get(paste0(CatNum, '_group_temp')))] - ph.data[, AgeMax := gsub(".*-", "", get(paste0(CatNum, '_group_temp')))] - ph.data <- ph.data[!get(CatNum) %in% c("Age", "Birthing person's age") | between(get(agevar), AgeMin, AgeMax)] - ph.data[, c("AgeMin", paste0(CatNum, '_group_temp'), "AgeMax") := NULL] + # Ensure agevar column contains numeric values + if (!is.numeric(ph.data[[agevar]])) { + warning(paste("'", agevar, "' column is not numeric. Attempting to convert...", sep = "")) + tryCatch({ + ph.data[, (agevar) := as.numeric(get(agevar))] + }, error = function(e) { + stop(paste("'", agevar, "' could not be converted to numeric. Error: ", e$message, sep = "")) + }) } + + # Loop for cat1 and cat2 + for (catnum in c("cat1", "cat2")) { + # Get column names for this category + catnum_group <- paste0(catnum, "_group") + temp_catnum_group <- paste0(catnum_group, "_temp") + + # Create a standardized version of the age group + ph.data[, (temp_catnum_group) := data.table::fcase( + get(catnum_group) == '<1', '0-0', + + grepl("<", get(catnum_group)), gsub("<", "0-", get(catnum_group)), + + grepl("\\+", get(catnum_group)), gsub("\\+", "-120", get(catnum_group)), + + grepl('-', get(catnum_group)), get(catnum_group) + )] + + # Extract min and max age + ph.data[, "min_age" := as.numeric(gsub("-.*", "", get(temp_catnum_group)))] + ph.data[, "max_age" := as.numeric(gsub(".*-", "", get(temp_catnum_group)))] + + # Keep rows where either: + # 1. The cat is not age-related, OR + # 2. The cat is age-related AND age value is within the min-max range + ph.data <- ph.data[!grepl(' age$|^age$', get(catnum), ignore.case = T) | + data.table::between(get(agevar), min_age, max_age)] + + # Clean up temporary columns + ph.data[, c(temp_catnum_group, "min_age", "max_age") := NULL] + } + return(ph.data) } diff --git a/man/chi_drop_illogical_ages.Rd b/man/chi_drop_illogical_ages.Rd index 4d83d44..f4b906d 100644 --- a/man/chi_drop_illogical_ages.Rd +++ b/man/chi_drop_illogical_ages.Rd @@ -1,21 +1,52 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/proto_chi_drop_illogical_ages.R +% Please edit documentation in R/chi_drop_illogical_ages.R \name{chi_drop_illogical_ages} \alias{chi_drop_illogical_ages} -\title{Drop Illogical Age Combinations from CHI Data} +\title{Filter Out Age Values That Don't Match Their Corresponding Age Group Categories} \usage{ chi_drop_illogical_ages(ph.data, agevar = "chi_age") } \arguments{ -\item{ph.data}{Input data.table to process} +\item{ph.data}{A data.table or data.frame containing category and age data to be filtered.} -\item{agevar}{Name of the age variable (defaults to 'chi_age')} +\item{agevar}{Character string specifying the name of the age variable column. + +Default: \code{agevar = 'chi_age'}} } \value{ -A data.table with illogical age combinations removed +A filtered data.table with only logically consistent age values } \description{ -Removes age combinations that don't make logical sense based on category -groupings. For example, removes cases where the age falls outside the range -specified by age group categories. +This function filters a data.table to remove rows with inconsistent age values. +It compares the single year age value (specified by \code{agevar}) against age +ranges defined in \code{cat1_group} and \code{cat2_group} columns, keeping only +those rows where: +\itemize{ + \item The categorical variable is not age-related, OR + \item The age value falls within the range specified by the corresponding age group +} + +Age groups are expected to be in formats like "10-17", "<5", or "45+", which +the function automatically parses into numeric ranges. +} +\details{ +The function interprets special formats in age group strings: +\itemize{ + \item "<1" is treated as age "0-0" (age zero) + \item " Date: Tue, 11 Mar 2025 14:07:16 -0700 Subject: [PATCH 09/11] fix chi_get_proper_pop - changed references to 'key' column to 'mykey' because `key` is reserved (or maybe should be reserved) by data.table --- R/chi_get_proper_pop.R | 6 +++--- man/chi_get_proper_pop.Rd | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/R/chi_get_proper_pop.R b/R/chi_get_proper_pop.R index ad94e13..0fb0e15 100644 --- a/R/chi_get_proper_pop.R +++ b/R/chi_get_proper_pop.R @@ -462,10 +462,10 @@ chi_get_proper_pop <- function(pop.template = NULL, pop.genders = NULL, pop.ages paste0(other_cat_prefix, "_varname"), paste0(other_cat_prefix, "_group")) - unique_pop_data <- unique(population_data[, cols_to_select, with = FALSE][, key := 1]) + unique_pop_data <- unique(population_data[, cols_to_select, with = FALSE][, mykey := 1]) - complete_demographics <- unique_pop_data[age_groups[, key := 1], on = "key", allow.cartesian = TRUE] - complete_demographics[, key := NULL] + complete_demographics <- unique_pop_data[age_groups[, mykey := 1], on = "mykey", allow.cartesian = TRUE] + complete_demographics[, mykey := NULL] return(complete_demographics) } diff --git a/man/chi_get_proper_pop.Rd b/man/chi_get_proper_pop.Rd index 1a7a292..ab46819 100644 --- a/man/chi_get_proper_pop.Rd +++ b/man/chi_get_proper_pop.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/proto_chi_get_proper_pop.R +% Please edit documentation in R/chi_get_proper_pop.R \name{chi_get_proper_pop} \alias{chi_get_proper_pop} \title{Get Population Denominators for CHI Analysis} From 2b6133e69e85d533a204aae6d19950888aa8cbd1 Mon Sep 17 00:00:00 2001 From: Danny Colombara Date: Tue, 11 Mar 2025 14:08:46 -0700 Subject: [PATCH 10/11] rename proto_chi_drop_illogical_ages.R - drop proto prefix because fully documented & updated --- R/{proto_chi_drop_illogical_ages.R => chi_drop_illogical_ages.R} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename R/{proto_chi_drop_illogical_ages.R => chi_drop_illogical_ages.R} (100%) diff --git a/R/proto_chi_drop_illogical_ages.R b/R/chi_drop_illogical_ages.R similarity index 100% rename from R/proto_chi_drop_illogical_ages.R rename to R/chi_drop_illogical_ages.R From 06092232c1cb9da5ce4069198307e0d93b13fcd5 Mon Sep 17 00:00:00 2001 From: Danny Colombara Date: Fri, 14 Mar 2025 12:36:25 -0700 Subject: [PATCH 11/11] harmonization of cat1/2 loops across code - now use `catnum`, `catgroup`, and `catvarname` in a standard way across functions --- R/chi_drop_illogical_ages.R | 20 ++-- R/chi_generate_instructions_pop.R | 33 +++--- R/chi_get_proper_pop.R | 161 +++++++++++++++--------------- 3 files changed, 106 insertions(+), 108 deletions(-) diff --git a/R/chi_drop_illogical_ages.R b/R/chi_drop_illogical_ages.R index fdaa403..8452f1a 100644 --- a/R/chi_drop_illogical_ages.R +++ b/R/chi_drop_illogical_ages.R @@ -74,23 +74,23 @@ chi_drop_illogical_ages <- function(ph.data, agevar = 'chi_age') { # Loop for cat1 and cat2 for (catnum in c("cat1", "cat2")) { # Get column names for this category - catnum_group <- paste0(catnum, "_group") - temp_catnum_group <- paste0(catnum_group, "_temp") + catgroup <- paste0(catnum, "_group") + temp_catgroup <- paste0(catgroup, "_temp") # Create a standardized version of the age group - ph.data[, (temp_catnum_group) := data.table::fcase( - get(catnum_group) == '<1', '0-0', + ph.data[, (temp_catgroup) := data.table::fcase( + get(catgroup) == '<1', '0-0', - grepl("<", get(catnum_group)), gsub("<", "0-", get(catnum_group)), + grepl("<", get(catgroup)), gsub("<", "0-", get(catgroup)), - grepl("\\+", get(catnum_group)), gsub("\\+", "-120", get(catnum_group)), + grepl("\\+", get(catgroup)), gsub("\\+", "-120", get(catgroup)), - grepl('-', get(catnum_group)), get(catnum_group) + grepl('-', get(catgroup)), get(catgroup) )] # Extract min and max age - ph.data[, "min_age" := as.numeric(gsub("-.*", "", get(temp_catnum_group)))] - ph.data[, "max_age" := as.numeric(gsub(".*-", "", get(temp_catnum_group)))] + ph.data[, "min_age" := as.numeric(gsub("-.*", "", get(temp_catgroup)))] + ph.data[, "max_age" := as.numeric(gsub(".*-", "", get(temp_catgroup)))] # Keep rows where either: # 1. The cat is not age-related, OR @@ -99,7 +99,7 @@ chi_drop_illogical_ages <- function(ph.data, agevar = 'chi_age') { data.table::between(get(agevar), min_age, max_age)] # Clean up temporary columns - ph.data[, c(temp_catnum_group, "min_age", "max_age") := NULL] + ph.data[, c(temp_catgroup, "min_age", "max_age") := NULL] } return(ph.data) diff --git a/R/chi_generate_instructions_pop.R b/R/chi_generate_instructions_pop.R index 92d74db..53d4914 100644 --- a/R/chi_generate_instructions_pop.R +++ b/R/chi_generate_instructions_pop.R @@ -83,36 +83,35 @@ chi_generate_instructions_pop <- function(mycount.data, "chi_race_aic_his", "chi_race_aic_nhpi", "chi_race_aic_wht") # Process both primary (cat1) and secondary (cat2) stratification variables - for(catnum in c("1", "2")) { - temp.cat <- paste0("cat", catnum) - temp.varname <- paste0(temp.cat, "_varname") - temp.groupby <- paste0("group_by", catnum) + for(catnum in c("cat1", "cat2")) { + catvarname <- paste0(catnum, "_varname") + temp.groupby <- paste0("group_by", gsub('cat', '', catnum)) # Set geographic type based on category - pop.template[get(temp.cat) == "Cities/neighborhoods", geo_type := "hra"] + pop.template[get(catnum) == "Cities/neighborhoods", geo_type := "hra"] # Set race_type and group_by based on race/ethnicity variable - pop.template[get(temp.varname) == "race3", c("race_type", temp.groupby) := 'race'] - pop.template[get(temp.varname) == "race4", c("race_type", temp.groupby) := 'race_eth'] - pop.template[get(temp.varname) %in% omb_aic, c("race_type", temp.groupby) := 'race_aic'] + pop.template[get(catvarname) == "race3", c("race_type", temp.groupby) := 'race'] + pop.template[get(catvarname) == "race4", c("race_type", temp.groupby) := 'race_eth'] + pop.template[get(catvarname) %in% omb_aic, c("race_type", temp.groupby) := 'race_aic'] # Filter out non-standard AIC race/ethnicity categories that don't have population data - pop.template <- pop.template[!(grepl('_aic_', get(temp.varname)) & - !get(temp.varname) %in% omb_aic)] + pop.template <- pop.template[!(grepl('_aic_', get(catvarname)) & + !get(catvarname) %in% omb_aic)] # Set demographic grouping based on category label - pop.template[get(temp.cat) == "Ethnicity", c("race_type", temp.groupby) := 'race_eth'] - pop.template[get(temp.cat) == "Gender", (temp.groupby) := 'genders'] - pop.template[get(temp.cat) %in% c("Race", "Race/ethnicity") & get(temp.varname) == 'race4', + pop.template[get(catnum) == "Ethnicity", c("race_type", temp.groupby) := 'race_eth'] + pop.template[get(catnum) == "Gender", (temp.groupby) := 'genders'] + pop.template[get(catnum) %in% c("Race", "Race/ethnicity") & get(catvarname) == 'race4', (temp.groupby) := 'race_eth'] - pop.template[(get(temp.cat) == "Race" & get(temp.varname) == 'race3'), + pop.template[(get(catnum) == "Race" & get(catvarname) == 'race3'), (temp.groupby) := 'race'] # Set geographic type based on regions - pop.template[get(temp.cat) == "Regions" & (is.na(geo_type) | geo_type != 'hra'), + pop.template[get(catnum) == "Regions" & (is.na(geo_type) | geo_type != 'hra'), geo_type := "region"] - pop.template[get(temp.cat) == "Big cities", geo_type := "hra"] - pop.template[get(temp.cat) == "Washington State", geo_type := "wa"] + pop.template[get(catnum) == "Big cities", geo_type := "hra"] + pop.template[get(catnum) == "Washington State", geo_type := "wa"] } # Handle special geographic cases ---- diff --git a/R/chi_get_proper_pop.R b/R/chi_get_proper_pop.R index 0fb0e15..2f673f9 100644 --- a/R/chi_get_proper_pop.R +++ b/R/chi_get_proper_pop.R @@ -139,85 +139,84 @@ chi_get_proper_pop <- function(pop.template = NULL, pop.genders = NULL, pop.ages # Process demographic categories ---- # Apply category grouping logic for both primary (cat1) and secondary (cat2) categories - for (catnum in c("1", "2")) { + for (catnum in c("cat1", "cat2")) { ## Add category information from template to result ---- - temp.cat <- paste0("cat", catnum) - temp.varname <- paste0("cat", catnum, '_varname') - temp.group <- paste0("cat", catnum, '_group') - temp.groupby <- paste0("group_by", catnum) + catvarname <- paste0(catnum, '_varname') + catgroup <- paste0(catnum, '_group') + temp.groupby <- paste0("group_by", gsub('cat', '', catnum)) - # had to use set function because regular := syntax caused errors b/c used temp.cat differently on both sides of := + # had to use set function because regular := syntax caused errors b/c used catnum differently on both sides of := data.table::set(population_data, - j = temp.cat, - value = current_row[[temp.cat]]) + j = catnum, + value = current_row[[catnum]]) data.table::set(population_data, - j = temp.varname, - value = current_row[[temp.varname]]) + j = catvarname, + value = current_row[[catvarname]]) data.table::set(population_data, - j = temp.group, + j = catgroup, value = current_row[[temp.groupby]]) ## Process geographic categories ---- # King County - population_data[get(temp.cat) == "King County", - c(temp.group) := "King County"] + population_data[get(catnum) == "King County", + c(catgroup) := "King County"] # Washington State - population_data[get(temp.cat) == "Washington State", - c(temp.group) := "Washington State"] + population_data[get(catnum) == "Washington State", + c(catgroup) := "Washington State"] # Handle NA values suppressWarnings( - population_data[get(temp.cat) == "NA" | is.na(get(temp.cat)), - c(temp.cat, temp.group, temp.varname) := "NA"] + population_data[get(catnum) == "NA" | is.na(get(catnum)), + c(catnum, catgroup, catvarname) := "NA"] ) # Cities/neighborhoods and Regions - population_data[get(temp.cat) %in% c("Cities/neighborhoods", "Regions") & + population_data[get(catnum) %in% c("Cities/neighborhoods", "Regions") & current_row$geo_type != 'blk', - c(temp.group) := geo_id] + c(catgroup) := geo_id] ## Process gender ---- - population_data[get(temp.cat) %in% c("Gender"), c(temp.group) := gender] + population_data[get(catnum) %in% c("Gender"), c(catgroup) := gender] ## Process 'Overall' ---- - population_data[get(temp.cat) %in% c("Overall"), c(temp.group) := "Overall"] + population_data[get(catnum) %in% c("Overall"), c(catgroup) := "Overall"] ## Process race/ethnicity categories ---- - population_data[get(temp.cat) == "Ethnicity" | get(temp.varname) %in% c('race4'), - c(temp.group) := race_eth] + population_data[get(catnum) == "Ethnicity" | get(catvarname) %in% c('race4'), + c(catgroup) := race_eth] - population_data[get(temp.cat) == 'Race' & get(temp.varname) %in% c('race3'), - c(temp.group) := race] + population_data[get(catnum) == 'Race' & get(catvarname) %in% c('race3'), + c(catgroup) := race] - population_data <- population_data[get(temp.cat) != "Ethnicity" | (get(temp.cat) == "Ethnicity" & get(temp.group) == 'Hispanic'), ] + population_data <- population_data[get(catnum) != "Ethnicity" | (get(catnum) == "Ethnicity" & get(catgroup) == 'Hispanic'), ] - population_data[get(temp.group) == "Multiple race", - c(temp.group) := "Multiple"] + population_data[get(catgroup) == "Multiple race", + c(catgroup) := "Multiple"] ## Process race_aic (alone or in combination) categories ---- if (current_row$race_type == 'race_aic') { # Filter to keep only relevant race_aic combinations population_data <- population_data[ - !(grepl('_aic_', get(temp.varname)) & - !((get(temp.varname) == 'chi_race_aic_aian' & race_aic == 'AIAN') | - (get(temp.varname) == 'chi_race_aic_asian' & race_aic == 'Asian') | - (get(temp.varname) == 'chi_race_aic_black' & race_aic == 'Black') | - (get(temp.varname) == 'chi_race_aic_his' & race_aic == 'Hispanic') | - (get(temp.varname) == 'chi_race_aic_nhpi' & race_aic == 'NHPI') | - (get(temp.varname) == 'chi_race_aic_wht' & race_aic == 'White')) + !(grepl('_aic_', get(catvarname)) & + !((get(catvarname) == 'chi_race_aic_aian' & race_aic == 'AIAN') | + (get(catvarname) == 'chi_race_aic_asian' & race_aic == 'Asian') | + (get(catvarname) == 'chi_race_aic_black' & race_aic == 'Black') | + (get(catvarname) == 'chi_race_aic_his' & race_aic == 'Hispanic') | + (get(catvarname) == 'chi_race_aic_nhpi' & race_aic == 'NHPI') | + (get(catvarname) == 'chi_race_aic_wht' & race_aic == 'White')) ) ] # Assign race_aic value to group - population_data[grep('_aic', get(temp.varname)), - c(temp.group) := race_aic] + population_data[grep('_aic', get(catvarname)), + c(catgroup) := race_aic] } ## Process HRAs ---- - if (population_data[1, geo_type] == 'blk' & population_data[1, get(temp.cat)] == 'Cities/neighborhoods') { + if (population_data[1, geo_type] == 'blk' & population_data[1, get(catnum)] == 'Cities/neighborhoods') { hra_crosswalk <- rads.data::spatial_block20_to_hra20_to_region20[, list(geo_id = GEOID20, hra20_name)] @@ -227,12 +226,12 @@ chi_get_proper_pop <- function(pop.template = NULL, pop.genders = NULL, pop.ages all.x = TRUE, all.y = FALSE) - population_data[, c(temp.group) := hra20_name] + population_data[, c(catgroup) := hra20_name] } ## Process Regions ---- # Block to Region crosswalk - if (population_data[1, geo_type] == 'blk' & population_data[1, get(temp.cat)] == 'Regions') { + if (population_data[1, geo_type] == 'blk' & population_data[1, get(catnum)] == 'Regions') { region_crosswalk <- rads.data::spatial_block20_to_hra20_to_region20[, list(geo_id = GEOID20, region_name)] @@ -242,11 +241,11 @@ chi_get_proper_pop <- function(pop.template = NULL, pop.genders = NULL, pop.ages all.x = TRUE, all.y = FALSE) - population_data[, c(temp.group) := region_name] + population_data[, c(catgroup) := region_name] } # HRA to Region crosswalk - if (population_data[1, geo_type] == 'hra' & population_data[1, get(temp.cat)] == 'Regions') { + if (population_data[1, geo_type] == 'hra' & population_data[1, get(catnum)] == 'Regions') { region_crosswalk <- rads.data::spatial_hra20_to_region20[, list(geo_id = hra20_name, region_name)] @@ -256,11 +255,11 @@ chi_get_proper_pop <- function(pop.template = NULL, pop.genders = NULL, pop.ages all.x = TRUE, all.y = FALSE) - population_data[, c(temp.group) := region_name] + population_data[, c(catgroup) := region_name] } # ZIP to Region crosswalk with population weighting - if (population_data[1, geo_type] == 'zip' & population_data[1, get(temp.cat)] == 'Regions') { + if (population_data[1, geo_type] == 'zip' & population_data[1, get(catnum)] == 'Regions') { # Create ZIP to region crosswalk with population weights zip_region_crosswalk <- rads.data::spatial_zip_to_hra20_pop @@ -282,11 +281,11 @@ chi_get_proper_pop <- function(pop.template = NULL, pop.genders = NULL, pop.ages all.y = FALSE, allow.cartesian = TRUE) population_data[, pop := pop * s2t_fraction] # Apply weight to population - population_data[, c(temp.group) := region] + population_data[, c(catgroup) := region] } ## Process Big Cities ---- - if (population_data[1, get(temp.cat)] == 'Big cities') { + if (population_data[1, get(catnum)] == 'Big cities') { # Block to big city crosswalk if (population_data[1, geo_type] == 'blk') { @@ -319,50 +318,50 @@ chi_get_proper_pop <- function(pop.template = NULL, pop.genders = NULL, pop.ages } # Assign big city name to group - population_data[, c(temp.group) := bigcity] + population_data[, c(catgroup) := bigcity] } ## Process age groupings ---- # Age 6 groups: <18, 18-24, 25-44, 45-64, 65-74, 75+ - population_data[get(temp.varname) == "age6" & age %in% 0:17, - c(temp.group) := "<18"] - population_data[get(temp.varname) == "age6" & age %in% 18:24, - c(temp.group) := "18-24"] - population_data[get(temp.varname) == "age6" & age %in% 25:44, - c(temp.group) := "25-44"] - population_data[get(temp.varname) == "age6" & age %in% 45:64, - c(temp.group) := "45-64"] - population_data[get(temp.varname) == "age6" & age %in% 65:74, - c(temp.group) := "65-74"] - population_data[get(temp.varname) == "age6" & age >= 75, - c(temp.group) := "75+"] + population_data[get(catvarname) == "age6" & age %in% 0:17, + c(catgroup) := "<18"] + population_data[get(catvarname) == "age6" & age %in% 18:24, + c(catgroup) := "18-24"] + population_data[get(catvarname) == "age6" & age %in% 25:44, + c(catgroup) := "25-44"] + population_data[get(catvarname) == "age6" & age %in% 45:64, + c(catgroup) := "45-64"] + population_data[get(catvarname) == "age6" & age %in% 65:74, + c(catgroup) := "65-74"] + population_data[get(catvarname) == "age6" & age >= 75, + c(catgroup) := "75+"] # Maternal age 5 groups: 10-17, 18-24, 25-34, 35-44, 45+ - population_data[get(temp.varname) == "mage5" & age %in% 10:17, - c(temp.group) := "10-17"] - population_data[get(temp.varname) == "mage5" & age %in% 18:24, - c(temp.group) := "18-24"] - population_data[get(temp.varname) == "mage5" & age %in% 25:34, - c(temp.group) := "25-34"] - population_data[get(temp.varname) == "mage5" & age %in% 35:44, - c(temp.group) := "35-44"] - population_data[get(temp.varname) == "mage5" & age >= 45, - c(temp.group) := "45+"] + population_data[get(catvarname) == "mage5" & age %in% 10:17, + c(catgroup) := "10-17"] + population_data[get(catvarname) == "mage5" & age %in% 18:24, + c(catgroup) := "18-24"] + population_data[get(catvarname) == "mage5" & age %in% 25:34, + c(catgroup) := "25-34"] + population_data[get(catvarname) == "mage5" & age %in% 35:44, + c(catgroup) := "35-44"] + population_data[get(catvarname) == "mage5" & age >= 45, + c(catgroup) := "45+"] # Youth age 4 groups: 0-4, 5-9, 10-14, 15-17 - population_data[get(temp.varname) == "yage4" & age %in% 0:4, - c(temp.group) := "0-4"] - population_data[get(temp.varname) == "yage4" & age %in% 5:9, - c(temp.group) := "5-9"] - population_data[get(temp.varname) == "yage4" & age %in% 10:14, - c(temp.group) := "10-14"] - population_data[get(temp.varname) == "yage4" & age %in% 15:17, - c(temp.group) := "15-17"] + population_data[get(catvarname) == "yage4" & age %in% 0:4, + c(catgroup) := "0-4"] + population_data[get(catvarname) == "yage4" & age %in% 5:9, + c(catgroup) := "5-9"] + population_data[get(catvarname) == "yage4" & age %in% 10:14, + c(catgroup) := "10-14"] + population_data[get(catvarname) == "yage4" & age %in% 15:17, + c(catgroup) := "15-17"] ## Process poverty groupings ---- # Block level poverty if (population_data[1, geo_type] == 'blk' & - grepl("poverty$", population_data[1, get(temp.cat)], ignore.case = TRUE)) { + grepl("poverty$", population_data[1, get(catnum)], ignore.case = TRUE)) { # Extract tract ID from block ID (first 11 characters) population_data[, geo_tract2020 := substr(geo_id, 1, 11)] @@ -376,12 +375,12 @@ chi_get_proper_pop <- function(pop.template = NULL, pop.genders = NULL, pop.ages ) # Assign poverty group - population_data[, c(temp.group) := pov200grp] + population_data[, c(catgroup) := pov200grp] } # ZIP level poverty if (population_data[1, geo_type] == 'zip' & - grepl("poverty$", population_data[1, get(temp.cat)], ignore.case = TRUE)) { + grepl("poverty$", population_data[1, get(catnum)], ignore.case = TRUE)) { # Join poverty group data population_data <- merge( population_data, @@ -392,7 +391,7 @@ chi_get_proper_pop <- function(pop.template = NULL, pop.genders = NULL, pop.ages ) # Assign poverty group - population_data[, c(temp.group) := pov200grp] + population_data[, c(catgroup) := pov200grp] } } # close looping over cat1/cat2