diff --git a/DESCRIPTION b/DESCRIPTION index 3462dea..48554fc 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -36,6 +36,7 @@ Remotes: github::PHSKC-APDE/rads, github::PHSKC-APDE/dtsurvey Suggests: httr, + keyring, knitr, progress, rmarkdown, diff --git a/NAMESPACE b/NAMESPACE index e0ff381..6248b5c 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,6 +1,8 @@ # Generated by roxygen2: do not edit by hand export(chi_calc) +export(chi_chars_ccs) +export(chi_chars_injury) export(chi_compare_estimates) export(chi_count_by_age) export(chi_drop_illogical_ages) @@ -9,13 +11,17 @@ export(chi_generate_instructions_pop) export(chi_generate_metadata) export(chi_generate_tro_shell) export(chi_get_cols) +export(chi_get_proper_pop) export(chi_get_yaml) +export(chi_keep_proper_ages) export(chi_qa_tro) export(chi_update_sql) +import(data.table) import(dtsurvey) import(future) import(future.apply) import(progressr) +import(rads) importFrom(DBI,Id) importFrom(DBI,dbConnect) importFrom(DBI,dbDisconnect) @@ -26,7 +32,9 @@ importFrom(data.table,"%between%") importFrom(data.table,":=") importFrom(data.table,.GRP) importFrom(data.table,.SD) +importFrom(data.table,CJ) importFrom(data.table,`:=`) +importFrom(data.table,as.data.table) importFrom(data.table,between) importFrom(data.table,copy) importFrom(data.table,data.table) @@ -47,6 +55,9 @@ importFrom(future.apply,future_lapply) importFrom(glue,glue) importFrom(glue,glue_sql) importFrom(odbc,odbc) +importFrom(progressr,handlers) +importFrom(progressr,progressor) +importFrom(progressr,with_progress) importFrom(rads,calc) importFrom(rads,round2) importFrom(rads,string_clean) @@ -57,6 +68,7 @@ importFrom(stats,na.omit) importFrom(stats,qnorm) importFrom(tidyr,crossing) importFrom(tools,toTitleCase) +importFrom(utils,capture.output) importFrom(utils,tail) importFrom(yaml,read_yaml) importFrom(yaml,yaml.load) diff --git a/R/chi_calc.R b/R/chi_calc.R index 052ad13..7d0fb0d 100644 --- a/R/chi_calc.R +++ b/R/chi_calc.R @@ -196,10 +196,11 @@ chi_calc <- function(ph.data = NULL, tempbv <- unique(na.omit(c(tempbv1, tempbv2))) tempend <- current_row$end tempstart <- current_row$start + temptab <- current_row$tab # use calc()---- if(rate == FALSE){ # standard proportion analysis - if(any(grepl('wastate', tempbv))){ + if(temptab == '_wastate'){ tempest <- rads::calc(ph.data = ph.data[chi_year >= tempstart & chi_year <= tempend], what = current_row$indicator_key, by = tempbv, @@ -214,7 +215,7 @@ chi_calc <- function(ph.data = NULL, } } if(rate == TRUE){ - if(any(grepl('wastate', tempbv))){ + if(temptab == '_wastate'){ tempest <- rads::calc(ph.data = ph.data[chi_year >= tempstart & chi_year <= tempend], what = current_row$indicator_key, by = tempbv, diff --git a/R/chi_chars_ccs.R b/R/chi_chars_ccs.R new file mode 100644 index 0000000..f6290a7 --- /dev/null +++ b/R/chi_chars_ccs.R @@ -0,0 +1,433 @@ +# chi_chars_ccs ---- +#' Generate CHI CHARS counts by CCS category +#' +#' @description +#' Generate hospitalization counts from Comprehensive Hospital Abstract Reporting +#' System (CHARS) data based on Clinical Classification Software (CCS) categories +#' for CHI. This function processes instructions for specific indicators and +#' summarizes CHARS data accordingly. It automatically handles the ICD-9 to ICD-10 +#' transition that occurred in 2016. +#' +#' @param ph.indicator A character string of length 1. The indicator key to process, +#' which must exist in the chars.defs data table. +#' @param ph.data A data.table containing the CHARS data to be processed. +#' @param myinstructions A data.table containing processing instructions for each indicator. +#' Default is the output from chi_generate_tro_shell(). +#' @param chars.defs A data.table containing definitions for each indicator. It +#' should have the following columns: `indicator_name`, `indicator_key`, `intent`, +#' `mechanism`, `superlevel`, `broad`, `midlevel`, `detailed`, `age_start`, and +#' `age_end`. +#' +#' @return A data.table containing the summarized CHARS hospitalization data for the +#' specified indicator, stratified by the requested demographic variables. +#' +#' @details +#' The function processes multiple instructions for the same indicator sequentially. +#' +#' The function automatically detects whether data spans the ICD-9 to ICD-10 transition in 2016 +#' and processes each part with the appropriate ICD version. Results are then combined seamlessly. +#' +#' @examples +#' \dontrun{ +#' # Example of how to run with future_lapply for memory efficiency +#' library(future) +#' library(future.apply) +#' +#' plan(multisession, workers = future::availableCores() - 1) +#' +#' countsCCS <- rbindlist(future_lapply(VectorOfIndicators, function(indicator) { +#' chi_chars_ccs( +#' ph.indicator = indicator, +#' ph.data = chars, +#' myinstructions = myinstructions, +#' chars.defs = chars.defs) +#' }, future.seed = TRUE)) +#' +#' plan(sequential) +#' +#' } +#' +#' @seealso +#' \code{\link[rads]{get_data_chars}}, which provides creates ph.data +#' +#' \code{\link[rads]{chars_icd_ccs_count}}, which is the engine used by this function +#' +#' \code{\link{chi_generate_tro_shell}}, which creates myinstructions +#' +#' @import data.table +#' @import rads +#' @importFrom utils capture.output +#' @export +chi_chars_ccs <- function(ph.indicator = NA, + ph.data = NULL, + myinstructions = NULL, + chars.defs = NULL) { + # Input validation ----- + if (is.na(ph.indicator)) { + stop("\n\U1F6D1 ph.indicator must be provided.") + } + + for (arg_name in c("ph.data", "myinstructions", "chars.defs")) { + arg_value <- get(arg_name) + if (is.null(arg_value)) { + stop(paste0("\n\U1F6D1 ", arg_name, " must be specified.")) + } + if (is.data.frame(arg_value)) { + setDT(arg_value) + } else if (!is.data.frame(arg_value)) { + stop(paste0("\n\U1F6D1 ", arg_name, " must be a data.table or data.frame.")) + } + } + + # Ensure indicator exists + instructions <- myinstructions[indicator_key == ph.indicator] + + if (nrow(instructions) == 0) { + stop(paste0("\n\U1F6D1 ph.indicator '", ph.indicator, "' not found in myinstructions.")) + } + + if (!ph.indicator %in% chars.defs$indicator_key) { + stop(paste0("\n\U1F6D1 ph.indicator '", ph.indicator, "' not found in chars.defs.")) + } + + + # Ensure cat1_varname and cat2_varname exist in ph.data + unique_cat_vars <- unique(c( + instructions$cat1_varname[!is.na(instructions$cat1_varname)], + instructions$cat2_varname[!is.na(instructions$cat2_varname)] + )) + + missing_cols <- unique_cat_vars[!unique_cat_vars %in% names(ph.data)] + if (length(missing_cols) > 0) { + stop(paste0("\n\U1F47F The following columns referenced in instructions don't exist in ph.data: ", + paste(missing_cols, collapse = ", "))) + } + + # Extract indicator parameters + indicator_def <- chars.defs[indicator_key == ph.indicator] + age_start <- indicator_def$age_start + age_end <- indicator_def$age_end + + # Verify that this is not an injury indicator (those should use chi_chars_injury) + has_mechanism <- "mechanism" %in% names(indicator_def) && !is.na(indicator_def$mechanism) + has_intent <- "intent" %in% names(indicator_def) && !is.na(indicator_def$intent) + + if (has_mechanism || has_intent) { + stop(paste0("\n\U1F6D1 Indicator '", ph.indicator, "' appears to be an injury indicator. ", + "You must using chi_chars_injury() instead.")) + } + + # Check for required parameters + if (!all(c("superlevel", "broad", "midlevel", 'detailed') %in% names(indicator_def))) { + stop("\n\U1F6D1 chars.defs must contain columns 'superlevel', 'broad', 'midlevel', and 'detailed' for CCS indicators.") + } + + # Import CM reference table ---- + message("Loading ICD-CM reference table (this will happen only once per ph.indicator)") + if (any(instructions$start < 2016, na.rm = TRUE)){ICDCM9_table <- rads::chars_icd_ccs(icdcm_version = 9)} + ICDCM10_table <- rads::chars_icd_ccs(icdcm_version = 10) + + # Helper function to process ICD data (will be passed to lapply) ---- + process_icd_data <- function(icdcm_version, + current_start, + current_end, + current_row, + age_start, + age_end, + ph_data, + ICDCM_table, + indicator_def, + indicator_key) { + # Determine geographical filter based on tab ---- + is_wastate <- current_row$tab == "_wastate" + + # Filter data for appropriate years and geography ---- + if (is_wastate) { + data_subset <- data.table::copy(ph.data[wastate == 'Washington State' & + chi_year >= current_start & + chi_year <= current_end & + chi_age >= age_start & + chi_age <= age_end]) + } else { + data_subset <- data.table::copy(ph.data[chi_geo_kc == 'King County' & + chi_year >= current_start & + chi_year <= current_end & + chi_age >= age_start & + chi_age <= age_end]) + } + + if (nrow(data_subset) == 0) { + return(NULL) + } + + # Add year information ---- + data_subset[, year_range := paste0(current_start, "-", current_end)] + + # Get CCS table based on ICD version ---- + ICDCM_table<- if (icdcm_version == 9) ICDCM9_table else ICDCM10_table + + # Generate arguments for chars_icd_ccs_count ---- + superlevel_arg <- if (!is.na(indicator_def$superlevel)) indicator_def$superlevel else NULL + broad_arg <- if (!is.na(indicator_def$broad)) indicator_def$broad else NULL + midlevel_arg <- if (!is.na(indicator_def$midlevel)) indicator_def$midlevel else NULL + detailed_arg <- if (!is.na(indicator_def$detailed)) indicator_def$detailed else NULL + + # Generate group_by argument ---- + if (is.na(current_row$cat2_varname)) { + group_by_arg <- current_row$cat1_varname + } else { + group_by_arg <- c(current_row$cat1_varname, current_row$cat2_varname) + } + group_by_arg <- c('chi_age', unique(group_by_arg)) + + # Process data with rads::chars_icd_ccs_count---- + result <- rads::chars_icd_ccs_count( + ph.data = data_subset, + icdcm_version = icdcm_version, + CMtable= ICDCM_table, + icdcm = NULL, + superlevel = superlevel_arg, + broad = broad_arg, + midlevel = midlevel_arg, + detailed = detailed_arg, + icdcol = 'diag1', + group_by = group_by_arg, + kingco = FALSE # Already filtered data above, so always FALSE + ) + + # Format the results ---- + if (nrow(result) > 0) { + # Add metadata columns + result[, icd_version := icdcm_version] + result[, original_year_range := paste0(current_row$start, "-", current_row$end)] + result[, tab := current_row$tab] + result[, cat1 := current_row$cat1] + result[, cat2 := current_row$cat2] + result[, cat1_varname := gsub("_hispanic", "", current_row$cat1_varname)] + result[, cat2_varname := gsub("_hispanic", "", current_row$cat2_varname)] + + # Rename category columns + data.table::setnames(result, current_row$cat1_varname, "cat1_group") + + if (!is.na(current_row$cat2_varname)) { + data.table::setnames(result, current_row$cat2_varname, "cat2_group", skip_absent = TRUE) + } + + # Filter and format + result <- result[!is.na(cat1_group)] + result[cat2_varname == 'chi_geo_kc', cat2_group := 'King County'] + result <- result[is.na(cat2_varname) | (!is.na(cat2_varname) & !is.na(cat2_group))] + result[, ph.indicator := ph.indicator] + + } + + return(result) + } + + # Process sequentially ---- + message('\U023F3 Processing calculations... this may take a while') + + # Process each row + results_list <- lapply( + X = seq_len(nrow(instructions)), + FUN = function(i) { + # Progress meter ---- + current_row <- instructions[i] + row_info <- paste( + ph.indicator, + current_row$tab, + current_row$cat1, + current_row$cat1_varname, + current_row$cat2, + current_row$cat2_varname, + current_row$end, + current_row$start, + sep = "|" + ) + message(paste0("myinstructions row ", i, "/", nrow(instructions), ": \n - ", row_info)) + + # Determine if we need ICD-9, ICD-10, or both ---- + current_start <- as.integer(instructions[i, start]) + current_end <- as.integer(instructions[i, end]) + need_icd9 <- current_start < 2016 # Includes any year before 2016 + need_icd10 <- current_end >= 2016 # Includes any year from 2016 onwards + + # Results for this instruction row ---- + row_results <- list() + + # Process with ICD-9 if needed ---- + if (need_icd9) { + message(paste0(" - Processing ICD-9 portion (", current_start, "-", min(current_end, 2015), ")")) + row_results$icd9 <- process_icd_data( + icdcm_version = 9, + current_start = current_start, + current_end = min(current_end, 2015), + current_row = instructions[i,], + age_start = age_start, + age_end = age_end, + ph_data = ph.data, + ICDCM_table= ICDCM9_table, + indicator_def = indicator_def, + indicator_key = ph.indicator + ) + } + + # Process with ICD-10 if needed ---- + if (need_icd10) { + message(paste0(" - Processing ICD-10 portion (", max(current_start, 2016), "-", current_end, ")")) + row_results$icd10 <- process_icd_data( + icdcm_version = 10, + current_start = max(current_start, 2016), + current_end = current_end, + current_row = instructions[i,], + age_start = age_start, + age_end = age_end, + ph_data = ph.data, + ICDCM_table= ICDCM10_table, + indicator_def = indicator_def, + indicator_key = ph.indicator + ) + } + + # If we have both ICD-9 and ICD-10 results, merge them ---- + # When data spans both ICD-9 and ICD-10 periods, we need to merge the results + # This handles the ICD transition in 2016 by combining hospitalization counts + # while preserving all demographic stratifications + if (!is.null(row_results$icd9) && !is.null(row_results$icd10) && + nrow(row_results$icd9) > 0 && nrow(row_results$icd10) > 0) { + + # Identify common columns for merging + merge_cols <- intersect( + names(row_results$icd9)[!names(row_results$icd9) %in% c("hospitalizations", "icd_version", "year_range")], + names(row_results$icd10)[!names(row_results$icd10) %in% c("hospitalizations", "icd_version", "year_range")] + ) + + # Merge data + combined <- merge( + row_results$icd9, + row_results$icd10, + by = merge_cols, + suffixes = c("_icd9", "_icd10"), + all = TRUE + ) + + # Sum hospitalizations (handling missing values) + combined[, hospitalizations := sum( + ifelse(is.na(hospitalizations_icd9), 0, hospitalizations_icd9), + ifelse(is.na(hospitalizations_icd10), 0, hospitalizations_icd10) + ), by = merge_cols] + + # Set year to the full original range + combined[, year := original_year_range] + + return(combined) + } else if (!is.null(row_results$icd9) && nrow(row_results$icd9) > 0) { + # Only ICD-9 results available + row_results$icd9[, year := original_year_range] + return(row_results$icd9) + } else if (!is.null(row_results$icd10) && nrow(row_results$icd10) > 0) { + # Only ICD-10 results available + row_results$icd10[, year := original_year_range] + return(row_results$icd10) + } else { + # No results available + return(NULL) + } + } + ) + + # Filter out NULL results + results_list <- results_list[!sapply(results_list, is.null)] + + # Combine results if any exist + if (length(results_list) > 0) { + result <- data.table::rbindlist(results_list, fill = TRUE, use.names = TRUE) + } else { + warning("\u26A0\ufe0f No results found for indicator: ", ph.indicator) + result <- data.table::data.table() + } + + # Remove temporary columns if they exist + cols_to_remove <- intersect(names(result), + c('icd_version_icd9', 'icd_version_icd10', 'hospitalizations_icd9', + 'hospitalizations_icd10', 'icd_version', 'original_year_range')) + if (length(cols_to_remove) > 0) { + result[, (cols_to_remove) := NULL] + } + + # Ensure we have complete data (for all ages withing the specified range) ---- + # This creates a template with all possible combinations to avoid gaps when needing to age standardize the results + template <- unique(result[, list(tab, + cat1, cat1_varname, cat1_group, + cat2, cat2_varname, cat2_group, + indicator_key = ph.indicator, year)]) + + template <- template[, list(chi_age = seq(age_start, age_end)), by = names(template)] + + result <- merge(template, + result[, chi_age := as.numeric(chi_age)], + by = intersect(names(template), names(result)), + all = TRUE) + + result[is.na(hospitalizations), hospitalizations := 0] + + result <- result[, list(indicator_key, year, chi_age = as.integer(chi_age), hospitalizations, tab, cat1, cat1_varname, cat1_group, cat2, cat2_varname, cat2_group)] + + setorder(result, tab, year, cat1, cat1_varname, cat1_group, cat2, cat2_varname, cat2_group, chi_age, hospitalizations) + + # Identify instructions that caused all data to be filtered out ---- + # this helps diagnose data quality issues, either in myinstructions or in ph.data + if (nrow(result) > 0) { + # Get combinations actually used in results + result_combos <- unique(result[, list(indicator_key, tab, cat1, cat1_varname, cat2, cat2_varname, + start = as.numeric(substr(year, 1, 4)), + end = as.numeric(substr(year, nchar(year) - 3, nchar(year))) )]) + + # Handle the special case for race3_hispanic + result_combos[cat1_varname == 'race3' & cat1 == 'Ethnicity', cat1_varname := 'race3_hispanic'] + + # Use fsetdiff to find instructions that didn't produce results + unused_instructions <- fsetdiff( + setcolorder(instructions, names(result_combos)), + result_combos + ) + + # Sort for warning display + setorder(unused_instructions, indicator_key, tab, cat1, cat1_varname, cat2, cat2_varname, start, end) + } else { + unused_instructions <- copy(instructions) + } + + # Generate warnings and attach to result + if (nrow(unused_instructions) > 0) { + # Capture the formatted output directly + empty_table <- paste( + utils::capture.output( + print(unused_instructions, + row.names = FALSE, + class = FALSE, + printClassOfColumns = FALSE) + ), + collapse = "\n" + ) + + warning(paste0( + "\n\u26A0\ufe0f No data found for the following ", nrow(unused_instructions), + " instruction(s) for indicator '", ph.indicator, "':\n", + empty_table + )) + + } + + # Drop irrelevant ages ---- + result <- chi_keep_proper_ages(result) + + # Tidy year (e.g., '2025-2025' becomes just '2025') ---- + result[, year := gsub("^(\\d{4})-(\\1)$", "\\1", year)] # \\1 is a back reference to what was found before hyphen + + # Order columns ---- + setcolorder(result, c("indicator_key", "year", "chi_age", "hospitalizations", "tab", "cat1", "cat1_varname", "cat1_group", "cat2", "cat2_varname", "cat2_group")) + + # Return data.table ---- + return(result) +} diff --git a/R/chi_chars_injury.R b/R/chi_chars_injury.R new file mode 100644 index 0000000..a136d0d --- /dev/null +++ b/R/chi_chars_injury.R @@ -0,0 +1,352 @@ +# chi_chars_injury ---- +#' Generate CHI CHARS counts for injury indicators +#' +#' @description +#' Generate hospitalization counts from Comprehensive Hospital Abstract Reporting +#' System (CHARS) data for injury indicators including falls, poisoning, +#' self-harm, and other injury mechanisms for CHI. This function processes +#' instructions for specific indicators and summarizes CHARS data accordingly. +#' +#' @param ph.indicator A character string of length 1. The indicator key to process, +#' which must exist in the chars.defs data table. +#' @param ph.data A data.table containing the CHARS data to be processed. +#' @param myinstructions A data.table containing processing instructions for each indicator. +#' Default is the output from \code{\link{chi_generate_tro_shell}}. +#' @param chars.defs A data.table containing definitions for each indicator. It +#' should have the following columns: `indicator_name`, `indicator_key`, `intent`, +#' `mechanism`, `age_start`, and `age_end`. +#' @param def A character string indicating which injury definition to use. +#' Default is \code{def = 'narrow'}. +#' +#' @return A data.table containing the summarized CHARS injury hospitalization data for the +#' specified indicator, stratified by the requested demographic variables. +#' +#' @details +#' This function processes instructions for a specific indicator sequentially. +#' +#' Note that injury data is only available for 2012 and later years. Unlike chi_chars_ccs(), +#' this function doesn't need to handle the ICD-9 to ICD-10 transition explicitly as the +#' mechanism and intent columns have been standardized across coding systems. +#' +#' @examples +#' \dontrun{ +#' # Example of how to run with future_lapply for memory efficiency +#' library(future) +#' library(future.apply) +#' +#' plan(multisession, workers = future::availableCores() - 1) +#' +#' countsINJURY <- rbindlist(future_lapply(VectorOfIndicators, function(indicator) { +#' chi_chars_injury( +#' ph.indicator = indicator, +#' ph.data = chars, +#' myinstructions = myinstructions, +#' chars.defs = chars.defs, +#' def = 'narrow') +#' }, future.seed = TRUE)) +#' +#' plan(sequential) +#' +#' } +#' +#' @seealso +#' \code{\link[rads]{get_data_chars}}, which provides creates ph.data +#' +#' \code{\link[rads]{chars_injury_matrix_count}}, which is the engine used by this function +#' +#' \code{\link{chi_generate_tro_shell}}, which creates myinstructions +#' +#' @import data.table +#' @import rads +#' @importFrom utils capture.output +#' @export +chi_chars_injury <- function(ph.indicator = NA, + ph.data = NULL, + myinstructions = NULL, + chars.defs = NULL, + def = 'narrow') { + # Input validation ----- + if (is.na(ph.indicator)) { + stop("\n\U1F6D1 ph.indicator must be provided.") + } + + for (arg_name in c("ph.data", "myinstructions", "chars.defs")) { + arg_value <- get(arg_name) + if (is.null(arg_value)) { + stop(paste0("\n\U1F6D1 ", arg_name, " must be specified.")) + } + if (is.data.frame(arg_value)) { + setDT(arg_value) + } else if (!is.data.frame(arg_value)) { + stop(paste0("\n\U1F6D1 ", arg_name, " must be a data.table or data.frame.")) + } + } + + # Ensure indicator exists + instructions <- myinstructions[indicator_key == ph.indicator] + + if (nrow(instructions) == 0) { + stop(paste0("\n\U1F6D1 ph.indicator '", ph.indicator, "' not found in myinstructions.")) + } + + if (!ph.indicator %in% chars.defs$indicator_key) { + stop(paste0("\n\U1F6D1 ph.indicator '", ph.indicator, "' not found in chars.defs.")) + } + + + # Ensure cat1_varname and cat2_varname exist in ph.data + unique_cat_vars <- unique(c( + instructions$cat1_varname[!is.na(instructions$cat1_varname)], + instructions$cat2_varname[!is.na(instructions$cat2_varname)] + )) + + missing_cols <- unique_cat_vars[!unique_cat_vars %in% names(ph.data)] + if (length(missing_cols) > 0) { + stop(paste0("\n\U1F47F The following columns referenced in instructions don't exist in ph.data: ", + paste(missing_cols, collapse = ", "))) + } + + # Extract indicator parameters + indicator_def <- chars.defs[indicator_key == ph.indicator] + age_start <- indicator_def$age_start + age_end <- indicator_def$age_end + if (!is.numeric(age_start) || !is.numeric(age_end)) { + stop("\n\U1F6D1 age_start and age_end must be numeric.") + } + + # Verify that this is an injury indicator (those should use chi_chars_injury) + has_mechanism <- "mechanism" %in% names(indicator_def) && !is.na(indicator_def$mechanism) + has_intent <- "intent" %in% names(indicator_def) && !is.na(indicator_def$intent) + + if (!has_mechanism && !has_intent) { + stop(paste0("\n\U1F6D1 Indicator '", ph.indicator, "' does not appear to be an injury indicator. ", + "You must use chi_chars_ccs() instead.")) + } + + # Check for required parameters + if (!all(c("intent", "mechanism") %in% names(indicator_def))) { + stop("\n\U1F6D1 chars.defs must contain columns 'intent' and 'mechanism' for injury indicators.") + } + + # Validate injury-specific parameters + if (!is.character(def) || length(def) != 1 || !def %in% c('narrow', 'broad')) { + stop("\n\U1F6D1 'def' must be either 'narrow' or 'broad'.") + } + + # Process sequentially ---- + message('\U023F3 Processing calculations... this may take a while') + + # Process each row + results_list <- lapply( + X = seq_len(nrow(instructions)), + FUN = function(i) { + # Progress meter + current_row <- instructions[i] + row_info <- paste( + ph.indicator, + current_row$tab, + current_row$cat1, + current_row$cat1_varname, + current_row$cat2, + current_row$cat2_varname, + current_row$end, + current_row$start, + sep = "|" + ) + message(paste0("myinstructions row ", i, "/", nrow(instructions), ": \n - ", row_info)) + + # Get current instruction parameters + current_start <- as.integer(current_row$start) + current_end <- as.integer(current_row$end) + + # For injury data, check if the year range is valid (only available from 2012+) + if (current_start < 2012) { + message(paste0(" - \u26A0\ufe0f Warning: Injury data only available from 2012. Adjusting start year from ", + current_start, " to 2012.")) + current_start <- 2012 + } + + # If start year is now after end year, skip this row + if (current_start > current_end) { + message(paste0(" - \u26A0\ufe0f Skipping: After year adjustment, start year ", current_start, + " is after end year ", current_end)) + return(NULL) + } + + # Determine geographical filter based on tab + is_wastate <- current_row$tab == "_wastate" + + # Filter data for appropriate years, geography, and age + if (is_wastate) { + data_subset <- data.table::copy(ph.data[wastate == 'Washington State' & + chi_year >= current_start & + chi_year <= current_end & + chi_age >= age_start & + chi_age <= age_end]) + } else { + data_subset <- data.table::copy(ph.data[chi_geo_kc == 'King County' & + chi_year >= current_start & + chi_year <= current_end & + chi_age >= age_start & + chi_age <= age_end]) + } + + # Return NULL if no data after filtering + if (nrow(data_subset) == 0) { + return(NULL) + } + + # Add year information + data_subset[, year_range := paste0(current_start, "-", current_end)] + + # Generate group_by argument + if (is.na(current_row$cat2_varname)) { + group_by_arg <- current_row$cat1_varname + } else { + group_by_arg <- c(current_row$cat1_varname, current_row$cat2_varname) + } + group_by_arg <- c('chi_age', unique(group_by_arg)) + + # Process data with rads::chars_injury_matrix_count + result <- rads::chars_injury_matrix_count( + ph.data = data_subset, + intent = indicator_def$intent, + mechanism = indicator_def$mechanism, + group_by = group_by_arg, + def = def, + kingco = FALSE # Already filtered data above + ) + + # Return NULL if no results + if (nrow(result) == 0) { + return(NULL) + } + + # For ICD10cm, poisoning is split into drug & non-drug, collapse them if needed + if (!is.null(indicator_def$mechanism) && !is.na(indicator_def$mechanism) && + indicator_def$mechanism == 'poisoning') { + result <- result[, list(mechanism = 'poisoning', hospitalizations = sum(hospitalizations)), + by = setdiff(names(result), c('mechanism', 'hospitalizations'))] + } + + # Remove mechanism and intent columns + if ("mechanism" %in% names(result)) { + result[, mechanism := NULL] + } + if ("intent" %in% names(result)) { + result[, intent := NULL] + } + + # Format the results + # Add metadata columns + result[, year := paste0(current_start, "-", current_end)] + result[, tab := current_row$tab] + result[, cat1 := current_row$cat1] + result[, cat2 := current_row$cat2] + result[, cat1_varname := gsub("_hispanic", "", current_row$cat1_varname)] + result[, cat2_varname := gsub("_hispanic", "", current_row$cat2_varname)] + + # Rename category columns + data.table::setnames(result, current_row$cat1_varname, "cat1_group") + + if (!is.na(current_row$cat2_varname)) { + data.table::setnames(result, current_row$cat2_varname, "cat2_group", skip_absent = TRUE) + } + + # Filter and format + result <- result[!is.na(cat1_group)] # Drop when group not identified + result[cat2_varname == 'chi_geo_kc', cat2_group := 'King County'] + result <- result[is.na(cat2_varname) | (!is.na(cat2_varname) & !is.na(cat2_group))] + result[, ph.indicator := ph.indicator] + + return(result) + } + ) + + # Filter out NULL results + results_list <- results_list[!sapply(results_list, is.null)] + + # Combine results if any exist + if (length(results_list) > 0) { + result <- data.table::rbindlist(results_list, fill = TRUE, use.names = TRUE) + } else { + warning("\u26A0\ufe0f No results found for indicator: ", ph.indicator) + result <- data.table::data.table() + } + + # Ensure we have complete data (for all ages within the specified range) ---- + # This creates a template with all possible combinations to avoid gaps when needing to age standardize the results + template <- unique(result[, list(tab, + cat1, cat1_varname, cat1_group, + cat2, cat2_varname, cat2_group, + indicator_key = ph.indicator, year)]) + + template <- template[, list(chi_age = seq(age_start, age_end)), by = names(template)] + + result <- merge(template, + result[, chi_age := as.numeric(chi_age)], + by = intersect(names(template), names(result)), + all = TRUE) + + result[is.na(hospitalizations), hospitalizations := 0] + + result <- result[, list(indicator_key, year, chi_age = as.integer(chi_age), hospitalizations, tab, cat1, cat1_varname, cat1_group, cat2, cat2_varname, cat2_group)] + + setorder(result, tab, year, cat1, cat1_varname, cat1_group, cat2, cat2_varname, cat2_group, chi_age, hospitalizations) + + # Identify instructions that caused all data to be filtered out ---- + # this helps diagnose data quality issues, either in myinstructions or in ph.data + if (nrow(result) > 0) { + # Get combinations actually used in results + result_combos <- unique(result[, list(indicator_key, tab, cat1, cat1_varname, cat2, cat2_varname, + start = as.numeric(substr(year, 1, 4)), + end = as.numeric(substr(year, nchar(year) - 3, nchar(year))) )]) + + # Handle the special case for race3_hispanic (format it like the instructions) + result_combos[cat1_varname == 'race3' & cat1 == 'Ethnicity', cat1_varname := 'race3_hispanic'] + result_combos[cat2_varname == 'race3' & cat2 == 'Ethnicity', cat2_varname := 'race3_hispanic'] + + # Use fsetdiff to find instructions that didn't produce results + unused_instructions <- fsetdiff( + setcolorder(instructions[, start := pmax(2012, start)], names(result_combos)), + result_combos + ) + + # Sort for warning display + setorder(unused_instructions, indicator_key, tab, cat1, cat1_varname, cat2, cat2_varname, start, end) + } else { + unused_instructions <- copy(instructions) + } + + # Generate warnings and attach to result + if (nrow(unused_instructions) > 0) { + # Capture the formatted output directly + empty_table <- paste( + utils::capture.output( + print(unused_instructions, + row.names = FALSE, + class = FALSE, + printClassOfColumns = FALSE) + ), + collapse = "\n" + ) + + warning(paste0( + "\n\u26A0\ufe0f No data found for the following ", nrow(unused_instructions), + " instruction(s) for indicator '", ph.indicator, "':\n", + empty_table + )) + } + + # Drop irrelevant ages ---- + result <- chi_keep_proper_ages(result) + + # Tidy year (e.g., '2025-2025' becomes just '2025') ---- + result[, year := gsub("^(\\d{4})-(\\1)$", "\\1", year)] # \\1 is a back reference to what was found before hyphen + + # Order columns ---- + setcolorder(result, c("indicator_key", "year", "chi_age", "hospitalizations", "tab", "cat1", "cat1_varname", "cat1_group", "cat2", "cat2_varname", "cat2_group")) + + # Return data.table ---- + return(result) +} diff --git a/R/chi_count_by_age.R b/R/chi_count_by_age.R new file mode 100644 index 0000000..377a668 --- /dev/null +++ b/R/chi_count_by_age.R @@ -0,0 +1,344 @@ +#' Generate Age-Specific Counts for Community Health Indicators +#' +#' @description +#' Creates a detailed breakdown of counts by age for CHI data analysis. Primarily +#' used for age standardization and rate calculations when combined with +#' population estimates. Processes data according to provided instructions and +#' handles demographic groupings with special treatment for race and ethnicity +#' variables. +#' +#' @param ph.data Input data frame or data table containing CHI data +#' @param ph.instructions Data frame or data table containing calculation specifications with columns: +#' \itemize{ +#' \item indicator_key: Name of the health metric to calculate +#' \item tab: Visualization tab type (kingcounty, demgroups, crosstabs) +#' \item cat1_varname, cat2_varname: Variable names for stratification +#' \item cat1, cat2: Human-friendly labels for these variables +#' \item start, end: Year range for the calculation +#' } +#' @param source_date Date of data source, added to output metadata +#' +#' @return A data.table containing age-specific counts with standard CHI groupings: +#' \itemize{ +#' \item indicator_key: Health metric identifier +#' \item year: Year range of data (e.g., "2019-2021" or single year) +#' \item tab: Visualization tab type +#' \item cat1, cat1_varname, cat1_group: Primary stratification variable details +#' \item cat2, cat2_varname, cat2_group: Secondary stratification variable details (if applicable) +#' \item chi_age: Age value (0-100) +#' \item count: Number of cases in that demographic-age group +#' \item source_date: Date of data source (if provided) +#' } +#' +#' @seealso +#' \code{\link{chi_generate_tro_shell}} which creates ph.instructions used by +#' \code{chi_count_by_age} +#' +#' \code{\link{chi_generate_instructions_pop}} which uses the output of the output +#' of \code{chi_count_by_age} +#' +#' @importFrom data.table setDT rbindlist setnames := setorder data.table CJ +#' @importFrom rads calc +#' @importFrom future.apply future_lapply +#' @importFrom progressr handlers progressor with_progress +#' @export +#' +chi_count_by_age <- function(ph.data = NULL, + ph.instructions = NULL, + source_date = NULL) { + + # Basic validation of inputs ---- + if (is.null(ph.data) || is.null(ph.instructions)) { + stop("Both ph.data and ph.instructions parameters are required") + } + + if (!is.data.frame(ph.data)) stop("\n\U1F6D1 ph.data must be a data.frame or data.table") + if (!is.data.frame(ph.instructions)) stop("\n\U1F6D1 ph.instructions must be a data.frame or data.table") + if (!is.null(source_date) & !inherits(source_date, "Date")) stop("\n\U1F6D1 source_date must be of type Date, if it is provided") + + # Convert inputs to data.table if they're not already + ph.data <- setDT(copy(ph.data)) + ph.instructions <- setDT(copy(ph.instructions)) + + # Create 'Overall' category if needed for crosstabs ---- + if (!"overall" %in% names(ph.data)) { + ph.data[, overall := ifelse(chi_geo_kc == "King County", "Overall", NA_character_)] + } + + # Check for required variables in the data ---- + # Extract all variable names needed from instructions + needed_byvars <- setdiff( + unique(c(ph.instructions$cat1_varname, ph.instructions$cat2_varname)), + c(NA) + ) + + # Special case: race3 requires race3_hispanic by definition + # race3 in the Tableau viz is presented as a single variables, but in reality + # it is made up of a race indicator and a hispanic ethnicity indicator. So, + # when instructions request race3, they need race3_hispanic as well to calculate + # the final race3 values (which is race, with Hispanic as ethnicity) + if ("race3" %in% needed_byvars & !"race3_hispanic" %in% needed_byvars) { + needed_byvars <- c(needed_byvars, "race3_hispanic") + } + + # Combine byvar names with indicator keys to get all required variables + needed_vars <- setdiff( + unique(c(ph.instructions$indicator_key, needed_byvars)), + c(NA) + ) + + # Check if any required variables are missing from the data + missing_vars <- setdiff(needed_vars, names(ph.data)) + if (length(missing_vars) > 0) { + stop(paste0( + "\n\U2620 ph.data is missing the following columns that are specified in ph.instructions: ", + paste0(missing_vars, collapse = ", "), ". ", + "\nIf `race3_hispanic` is listed, that is because, by definition, `race3` cannot have a Hispanic ethnicity in the same variable. ", + "\nTwo variables (`race3` & `race3_hispanic`) will be processed and in the output, it will be called `race3`" + )) + } else { + message("\U0001f642 All specified variables exist in ph.data") + } + + # Verify that categorical variables follow CHI encoding standards ---- + # Get standardized byvar values from reference data + std_byvars <- rads.data::misc_chi_byvars + std_byvars <- std_byvars[varname %in% setdiff(unique(c(ph.instructions$cat1_varname, ph.instructions$cat2_varname)), c(NA))] + std_byvars <- std_byvars[, list(varname, group, keepme, reference = 1)] + + # Handle race3 and Hispanic special case + std_byvars[group %in% c("Hispanic", "Non-Hispanic") & varname == "race3", + varname := "race3_hispanic"] + + # Extract unique values for each byvar in the actual data + data_byvars <- rbindlist(lapply( + X = as.list(needed_byvars), + FUN = function(x) { + data.table( + varname = x, + group = setdiff(unique(ph.data[[x]]), NA), + ph.data = 1 + ) + } + )) + + # Merge actual values to reference standards + byvar_comparison <- merge(std_byvars, + data_byvars, + by = c("varname", "group"), + all = TRUE) + + # Check for any mismatches between data and reference standards + if (nrow(byvar_comparison[is.na(reference) | is.na(ph.data)]) > 0) { + print(byvar_comparison[is.na(reference) | is.na(ph.data)]) + stop("\n\U2620 The table above shows the varname/group combinations that do not align between the reference table and your ph.data.") + } else { + message("\U0001f642 All specified cat1_group and cat2_group values align with the reference standard.") + } + + # Generate counts for each row in instructions ---- + message("\U023F3 Be patient! The function is generating counts for each row of ph.instructions.") + + progressr::handlers(handler_progress()) + with_progress({ + p <- progressor(nrow(ph.instructions)) + count_results <- rbindlist(future_lapply( + X = as.list(seq_len(nrow(ph.instructions))), + FUN = function(row_idx) { + p(paste0("Processing row ", row_idx, " of ", nrow(ph.instructions) )) + # Set up calculations ---- + # Extract parameters for this calculation + current_instruction <- ph.instructions[row_idx] + primary_byvar <- current_instruction[["cat1_varname"]] + secondary_byvar <- current_instruction[["cat2_varname"]] + if (is.null(secondary_byvar) || length(secondary_byvar) == 0) { + secondary_byvar <- NA + } + + # Combine all byvars including age + all_byvars <- setdiff(c(primary_byvar, secondary_byvar), c(NA)) + all_byvars <- unique(c(all_byvars, "chi_age")) + + # Calculate counts using rads::calc ---- + if (any(grepl("wastate", all_byvars))) { + # Washington state calculations + age_counts <- rads::calc( + ph.data = ph.data[chi_year >= current_instruction[["start"]] & chi_year <= current_instruction[["end"]]], + what = current_instruction[["indicator_key"]], + by = all_byvars, + metrics = c("numerator") + ) + } else { + # King County calculations + age_counts <- rads::calc( + ph.data = ph.data[chi_year >= current_instruction[["start"]] & chi_year <= current_instruction[["end"]] & chi_geo_kc == "King County"], + what = current_instruction[["indicator_key"]], + by = all_byvars, + metrics = c("numerator") + ) + } + + # Add cat1/cat2 information to results ---- + # Add cat1# info + age_counts[, cat1 := current_instruction[["cat1"]]] + setnames(age_counts, primary_byvar, "cat1_group") + age_counts[, cat1_varname := primary_byvar] + + # Add cat2# info + age_counts[, cat2 := current_instruction[["cat2"]]] + if (!is.na(secondary_byvar) & primary_byvar != secondary_byvar) { + setnames(age_counts, secondary_byvar, "cat2_group") + } else { + age_counts[, cat2_group := NA] + } + age_counts[, cat2_varname := secondary_byvar] + + # Filter out invalid combinations + age_counts <- age_counts[!is.na(cat1_group)] + age_counts <- age_counts[!(is.na(cat2_group) & !is.na(cat2))] + age_counts <- age_counts[!is.na(chi_age)] + + # Keep only necessary columns + age_counts <- age_counts[, list( + cat1, cat1_varname, cat1_group, + cat2, cat2_varname, cat2_group, + chi_age, count = numerator + )] + + # Create complete reference table for all combinations ---- + # When there is no data for a specific combination (e.g., NHPI 85+ in + # particular geography), the row will be missing. We want the row to be + # noted but with the count of 0, not NA. + + # cat1 summary table + cat1_table <- data.table( + cat1 = current_instruction[["cat1"]], + cat1_varname = primary_byvar, + cat1_group = sort(setdiff(as.character(unique(ph.data[[primary_byvar]])), NA)) + ) + + # cat2 summary table + if (!is.na(secondary_byvar)) { + cat2_table <- data.table( + cat2 = current_instruction[["cat2"]], + cat2_varname = secondary_byvar, + cat2_group = sort(setdiff(as.character(unique(ph.data[[secondary_byvar]])), NA)) + ) + } else { + cat2_table <- data.table( + cat2 = current_instruction[["cat2"]], + cat2_varname = secondary_byvar, + cat2_group = NA_character_ + ) + } + + # Create all logical combinations of cat1_table rows & cat2_table rows + # This approach preserves the correct relationship between variable + # names and their values. For example, if cat1_varname is 'chi_sex' then + # cat1_group must only contain values like 'Male' or 'Female', not values + # from other variables like 'Asian' or 'NHPI'. + # + # A simpler approach (like CJ with all columns directly) would create + # invalid combinations such as cat1_varname='chi_sex' with cat1_group='Hispanic', + # which is nonsensical. + # + # The nested lapply approach ensures that each category variable is only + # paired with its own valid values while still creating all valid combinations + # needed for the complete age-specific count table. + + all_combinations <- NULL + if (nrow(cat2_table) > 0) { + # For each row in cat1_table, combine with each row in cat2_table + all_combinations <- rbindlist(lapply(1:nrow(cat1_table), function(i) { + rbindlist(lapply(1:nrow(cat2_table), function(j) { + data.table( + cat1 = cat1_table[i]$cat1, + cat1_varname = cat1_table[i]$cat1_varname, + cat1_group = cat1_table[i]$cat1_group, + cat2 = cat2_table[j]$cat2, + cat2_varname = cat2_table[j]$cat2_varname, + cat2_group = cat2_table[j]$cat2_group + ) + })) + })) + } else { + all_combinations <- cat1_table + all_combinations[, `:=`( + cat2 = current_instruction[["cat2"]], + cat2_varname = secondary_byvar, + cat2_group = NA_character_ + )] + } + + # Now add all ages to create the final cartesian product + all_combinations_with_age <- CJ( + combo_idx = 1:nrow(all_combinations), + chi_age = 0:100, + unique = TRUE + ) + + all_combinations_with_age <- merge( + all_combinations_with_age, + all_combinations[, combo_idx := .I], + by = "combo_idx" + ) + + all_combinations_with_age[, combo_idx := NULL] + + + # Merge counts onto the reference table with all possible combinations ---- + complete_counts <- merge( + all_combinations_with_age, + age_counts, + by = c("cat1", "cat1_varname", "cat1_group", "cat2", "cat2_varname", "cat2_group", "chi_age"), + all.x = TRUE + ) + + # Replace NA counts with 0 (needed for age-adjusted rates) + complete_counts[is.na(count), count := 0] + + # Add remaining identifiers + complete_counts[, indicator_key := current_instruction[["indicator_key"]]] + complete_counts[, tab := current_instruction[["tab"]]] + + # Format year range appropriately + if (current_instruction[["end"]] != current_instruction[["start"]]) { + complete_counts[, year := paste0(current_instruction[["start"]], "-", current_instruction[["end"]])] + } else { + complete_counts[, year := as.character(current_instruction[["start"]])] + } + + # Order and select final output columns + complete_counts <- complete_counts[, list( + indicator_key, year, tab, + cat1, cat1_varname, cat1_group, + cat2, cat2_varname, cat2_group, + chi_age, count + )] + + setorder(complete_counts, cat1_group, cat2_group, chi_age) + return(complete_counts) + } # close function within future_lapply + ), # close future_lapply + use.names = TRUE) # close rbindlist + }) # close with_progress + + # Add source_date if provided + if (!is.null(source_date)) { + count_results[, source_date := source_date] + } else { + count_results[, source_date := as.Date(NA_character_)] + } + + # Modify race3 / race3_hispanic ---- + # as noted above, these are two distinct variables that are presented as a single + # variable in Tableau viz so the variable names need to be harmonized + count_results[cat1_varname == 'race3_hispanic', cat1_varname := 'race3'] + count_results[cat2_varname == 'race3_hispanic', cat2_varname := 'race3'] + drop_race3_groups <- rads.data::misc_chi_byvars[varname == 'race3' & keepme == 'No']$group + count_results <- count_results[!(cat1_varname == 'race3' & cat1_group %in% drop_race3_groups)] + count_results <- count_results[is.na(cat2) | !(cat2_varname == 'race3' & cat2_group %in% drop_race3_groups)] + + # Return the final results ---- + return(count_results) +} diff --git a/R/chi_drop_illogical_ages.R b/R/chi_drop_illogical_ages.R new file mode 100644 index 0000000..d919df3 --- /dev/null +++ b/R/chi_drop_illogical_ages.R @@ -0,0 +1,105 @@ +#' Filter Out Age Values That Don't Match Their Corresponding Age Group Categories +#' +#' @description +#' This function filters a data.table to remove rows with inconsistent age values. +#' It compares the single year age value (specified by \code{agevar}) against age +#' ranges defined in \code{cat1_group} and \code{cat2_group} columns, keeping only +#' those rows where: +#' \itemize{ +#' \item The categorical variable is not age-related, OR +#' \item The age value falls within the range specified by the corresponding age group +#' } +#' +#' Age groups are expected to be in formats like "10-17", "<5", or "45+", which +#' the function automatically parses into numeric ranges. +#' +#' @param ph.data A data.table or data.frame containing category and age data to be filtered. +#' @param agevar Character string specifying the name of the age variable column. +#' +#' Default: \code{agevar = 'chi_age'} +#' +#' @return A filtered data.table with only logically consistent age values +#' +#' @details +#' The function interprets special formats in age group strings: +#' \itemize{ +#' \item "<1" is treated as age "0-0" (age zero) +#' \item " 0) { + stop(paste("The following required columns are missing:", + paste(missing_cols, collapse = ", "))) + } + + # Ensure agevar column contains numeric values + if (!is.numeric(ph.data[[agevar]])) { + warning(paste("'", agevar, "' column is not numeric. Attempting to convert...", sep = "")) + tryCatch({ + ph.data[, (agevar) := as.numeric(get(agevar))] + }, error = function(e) { + stop(paste("'", agevar, "' could not be converted to numeric. Error: ", e$message, sep = "")) + }) + } + + # Loop for cat1 and cat2 + for (catnum in c("cat1", "cat2")) { + # Get column names for this category + catgroup <- paste0(catnum, "_group") + temp_catgroup <- paste0(catgroup, "_temp") + + # Create a standardized version of the age group + ph.data[, (temp_catgroup) := data.table::fcase( + get(catgroup) == '<1' & grepl(' age$|^age$', get(catnum), ignore.case = T), '0-0', + + grepl("<", get(catgroup)) &grepl(' age$|^age$', get(catnum), ignore.case = T), gsub("<", "0-", get(catgroup)), + + grepl("\\+", get(catgroup))& grepl(' age$|^age$', get(catnum), ignore.case = T), gsub("\\+", "-120", get(catgroup)), + + grepl('-', get(catgroup))& grepl(' age$|^age$', get(catnum), ignore.case = T), as.character(get(catgroup)))] + + # Extract min and max age + ph.data[, "min_age" := as.numeric(gsub("-.*", "", get(temp_catgroup)))] + ph.data[, "max_age" := as.numeric(gsub(".*-", "", get(temp_catgroup)))] + + # Keep rows where either: + # 1. The cat is not age-related, OR + # 2. The cat is age-related AND age value is within the min-max range + ph.data <- ph.data[!grepl(' age$|^age$', get(catnum), ignore.case = T) | + data.table::between(get(agevar), min_age, max_age)] + + # Clean up temporary columns + ph.data[, c(temp_catgroup, "min_age", "max_age") := NULL] + } + + return(ph.data) +} diff --git a/R/chi_generate_instructions_pop.R b/R/chi_generate_instructions_pop.R new file mode 100644 index 0000000..53d4914 --- /dev/null +++ b/R/chi_generate_instructions_pop.R @@ -0,0 +1,145 @@ +#' Generate Population Instructions for CHI Analysis +#' +#' @description +#' Creates instructions for \code{\link{chi_get_proper_pop}} based on a table of count +#' data. These instructions configure appropriate demographic groupings, +#' geographic types, and time periods for retrieving population denominators used +#' in CHI rate calculations. +#' +#' @param mycount.data Input data.table produced by \code{\link{chi_count_by_age}}, +#' containing the following columns: +#' \itemize{ +#' \item indicator_key: indicator_key used by CHI +#' \item year: Year range (e.g., "2019-2021" or single year) +#' \item tab: Visualization tab type +#' \item cat1, cat1_varname, cat1_group: Primary stratification variables +#' \item cat2, cat2_varname, cat2_group: Secondary stratification variables +#' \item chi_age: Single year age +#' \item count: Count of events (births, death, hospitalizations, etc. ) +#' } +#' @param povgeo Geographic level for poverty analysis ('blk' or 'zip') +#' +#' @return A data.table containing population processing instructions with columns: +#' \itemize{ +#' \item year: Original year range from input +#' \item cat1, cat1_varname: Primary stratification details +#' \item cat2, cat2_varname: Secondary stratification details +#' \item tab: Visualization tab type +#' \item start, stop: Start and end years parsed from the year range +#' \item race_type: Race categorization type ('race', 'race_eth', or 'race_aic') +#' \item geo_type: Geographic level for analysis ('kc', 'hra', 'region', 'blk', 'zip', 'wa') +#' \item group_by1, group_by2: Demographic grouping specifications +#' } +#' +#' @seealso +#' \code{\link{chi_count_by_age}} which generates the count data used as input +#' to this function +#' +#' \code{\link{chi_get_proper_pop}} which uses the output of this function +#' +#' @importFrom data.table copy `:=` setorder tstrsplit +#' @importFrom tools toTitleCase +#' @export +#' +chi_generate_instructions_pop <- function(mycount.data, + povgeo = c('blk', 'zip')) { + + # Validation of inputs ---- + if (is.null(mycount.data)) { + stop("\n\U1F6D1 mycount.data parameter is required") + } + + if (!is.data.frame(mycount.data)) { + stop("\n\U1F6D1 mycount.data must be a data.frame or data.table") + } else {mycount.data <- setDT(copy(mycount.data))} + + povgeo <- match.arg(povgeo) + + # Initial data preparation ---- + # Create a template with only the necessary columns to avoid duplicates + pop.template <- copy(mycount.data) + pop.template <- unique(copy(pop.template)[, list(year, cat1, cat1_varname, cat2, cat2_varname, tab)]) + + # Process year ranges ---- + # Split year ranges (e.g., "2019-2021") into start and stop years + pop.template[, c("start", "stop") := tstrsplit(year, split = '-')] + # For single years, set the stop year equal to the start year + pop.template[is.na(stop), stop := start] + + # Set default demographic settings ---- + # Default race type includes ethnicity (Hispanic as race) + pop.template[, race_type := 'race_eth'] + + # Handle maternal data prefixes ---- + # Remove "Birthing person's" prefix to standardize maternal data categories + pop.template[grepl("birthing person", cat1, ignore.case = TRUE), + cat1 := tools::toTitleCase(gsub("Birthing person's ", "", cat1))] + pop.template[grepl("birthing person", cat2, ignore.case = TRUE), + cat2 := tools::toTitleCase(gsub("Birthing person's ", "", cat2))] + + # Process geographic types and demographic groupings ---- + # Define OMB AIC (alone or in combination) + omb_aic <- c("chi_race_aic_aian", "chi_race_aic_asian", "chi_race_aic_black", + "chi_race_aic_his", "chi_race_aic_nhpi", "chi_race_aic_wht") + + # Process both primary (cat1) and secondary (cat2) stratification variables + for(catnum in c("cat1", "cat2")) { + catvarname <- paste0(catnum, "_varname") + temp.groupby <- paste0("group_by", gsub('cat', '', catnum)) + + # Set geographic type based on category + pop.template[get(catnum) == "Cities/neighborhoods", geo_type := "hra"] + + # Set race_type and group_by based on race/ethnicity variable + pop.template[get(catvarname) == "race3", c("race_type", temp.groupby) := 'race'] + pop.template[get(catvarname) == "race4", c("race_type", temp.groupby) := 'race_eth'] + pop.template[get(catvarname) %in% omb_aic, c("race_type", temp.groupby) := 'race_aic'] + + # Filter out non-standard AIC race/ethnicity categories that don't have population data + pop.template <- pop.template[!(grepl('_aic_', get(catvarname)) & + !get(catvarname) %in% omb_aic)] + + # Set demographic grouping based on category label + pop.template[get(catnum) == "Ethnicity", c("race_type", temp.groupby) := 'race_eth'] + pop.template[get(catnum) == "Gender", (temp.groupby) := 'genders'] + pop.template[get(catnum) %in% c("Race", "Race/ethnicity") & get(catvarname) == 'race4', + (temp.groupby) := 'race_eth'] + pop.template[(get(catnum) == "Race" & get(catvarname) == 'race3'), + (temp.groupby) := 'race'] + + # Set geographic type based on regions + pop.template[get(catnum) == "Regions" & (is.na(geo_type) | geo_type != 'hra'), + geo_type := "region"] + pop.template[get(catnum) == "Big cities", geo_type := "hra"] + pop.template[get(catnum) == "Washington State", geo_type := "wa"] + } + + # Handle special geographic cases ---- + # Set geographic type for poverty analysis to block group level by default + pop.template[grepl("poverty$", cat1, ignore.case = TRUE) | + grepl("poverty$", cat2, ignore.case = TRUE), + geo_type := "blk"] + + # Override with zip code level for poverty analysis if specified in povgeo parameter + if(!is.na(povgeo) && povgeo == 'zip') { + pop.template[grepl("poverty$", cat1, ignore.case = TRUE) | + grepl("poverty$", cat2, ignore.case = TRUE), + geo_type := "zip"] + } + + # Special case for combined regions and cities/neighborhoods analysis + pop.template[(cat1 == "Regions" & cat2 == "Cities/neighborhoods") | + (cat2 == "Regions" & cat1 == "Cities/neighborhoods"), + geo_type := "blk"] + + # Set missing or default values ---- + # Replace NA cat2 with "NA" string for consistent processing + pop.template[is.na(cat2), cat2 := "NA"] + + # Set default geographic type as King County when not specified + pop.template[is.na(geo_type), geo_type := 'kc'] + + # Return final results ---- + # Remove duplicate rows to minimize calls to get_population for efficiency + return(unique(pop.template)) +} diff --git a/R/chi_generate_metadata.R b/R/chi_generate_metadata.R index 0d0aca6..3bdca00 100644 --- a/R/chi_generate_metadata.R +++ b/R/chi_generate_metadata.R @@ -54,18 +54,26 @@ chi_generate_metadata <- function(meta.old = NULL, meta.new[!is.na(latest_year_countx), latest_year_count := latest_year_countx] meta.new[!is.na(latest_year_kc_popx), latest_year_kc_pop := latest_year_kc_popx] meta.new[, c("latest_yearx", "latest_year_resultx", "run_datex", "latest_year_countx", "latest_year_kc_popx") := NULL] - # update valid_years ---- - meta.new[as.integer(latest_year) > suppressWarnings(as.integer(rads::substrRight(valid_years, 1, 4))), - valid_years := suppressWarnings(paste(as.integer(substr(valid_years, 1, 4)):as.integer(latest_year), collapse = " "))] - # Since trends only have 10 years of data, valid_years should be limited to 10 years max ---- - meta.new[, valid_years := { - allyears <- sort(as.integer(strsplit(valid_years, " ")[[1]])) # convert valid_years to a vector of numbers - if(length(allyears) > 10) { - paste(tail(sort(allyears), 10), collapse = " ") - } else { - paste(allyears, collapse = " ") - } - }, by = indicator_key] + + # Update valid_years & keep up to 10 years ---- + meta.new[, valid_years := { + # Get current years as vector + years_vector <- as.integer(strsplit(valid_years, " ")[[1]]) + + # Add latest_year if it's not already there + if (!is.na(latest_year) && !latest_year %in% years_vector) { + years_vector <- c(years_vector, latest_year) + } + + # Sort and keep only most recent 10 years if there are more than 10 + years_vector <- sort(unique(years_vector)) + if (length(years_vector) > 10) { + years_vector <- tail(years_vector, 10) + } + + # Convert back to space-separated string + paste(years_vector, collapse = " ") + }, by = indicator_key] # Ensure there are no missing important metadata cells (with exceptions) ---- unexpected_missing <- data.frame() diff --git a/R/chi_generate_tro_shell.R b/R/chi_generate_tro_shell.R index 05bc574..2e1480b 100644 --- a/R/chi_generate_tro_shell.R +++ b/R/chi_generate_tro_shell.R @@ -26,7 +26,6 @@ #' } #' #' @param ph.analysis_set name of data.table to parse -#' @param start.year the earliest year to be used for estimates #' @param end.year the latest year to be used for aggregate estimates #' @param year.span the number of years to be included in a single non-trend period #' @param trend.span the number of years to be included in a single trend period @@ -54,52 +53,50 @@ #' @export #' chi_generate_tro_shell <- function(ph.analysis_set, - start.year, - end.year, - year.span = NULL, - trend.span = NULL, - trend.periods = NULL){ + end.year, + year.span = NULL, + trend.span = NULL, + trend.periods = NULL){ # Input validation - if (missing(ph.analysis_set)) stop("\n\U1F6D1 ph.analysis_set must be provided") - if (!is.data.frame(ph.analysis_set)) stop("\n\U1F6D1 ph.analysis_set must be a data.frame or data.table") - - if (missing(start.year)) stop("\n\U1F6D1 start.year must be provided") - if (!is.numeric(start.year) || length(start.year) != 1) stop("\n\U1F6D1 start.year must be a single numeric value") + if (missing(ph.analysis_set)) stop("\n\U1F6D1 ph.analysis_set must be provided") + if (!is.data.frame(ph.analysis_set)) stop("\n\U1F6D1 ph.analysis_set must be a data.frame or data.table") + if (!("set" %in% names(ph.analysis_set)) | anyNA(ph.analysis_set$set)) { + stop("\n\u1F6D1 set number must be provided for all rows") + } + if (missing(end.year)) stop("\n\U1F6D1 end.year must be provided") + if (!is.numeric(end.year) || length(end.year) != 1) stop("\n\U1F6D1 end.year must be a single numeric value") - if (missing(end.year)) stop("\n\U1F6D1 end.year must be provided") - if (!is.numeric(end.year) || length(end.year) != 1) stop("\n\U1F6D1 end.year must be a single numeric value") + if (!is.null(year.span) && (!is.numeric(year.span) || length(year.span) != 1)) { + stop("\n\U1F6D1 year.span must be NULL or a single numeric value") + } - if (!is.null(year.span) && (!is.numeric(year.span) || length(year.span) != 1)) { - stop("\n\U1F6D1 year.span must be NULL or a single numeric value") - } + if (!is.null(trend.span) && (!is.numeric(trend.span) || length(trend.span) != 1)) { + stop("\n\U1F6D1 trend.span must be NULL or a single numeric value") + } - if (!is.null(trend.span) && (!is.numeric(trend.span) || length(trend.span) != 1)) { - stop("\n\U1F6D1 trend.span must be NULL or a single numeric value") - } + if (!is.null(trend.periods) && (!is.numeric(trend.periods) || length(trend.periods) != 1)) { + stop("\n\U1F6D1 trend.periods must be NULL or a single numeric value") + } - if (!is.null(trend.periods) && (!is.numeric(trend.periods) || length(trend.periods) != 1)) { - stop("\n\U1F6D1 trend.periods must be NULL or a single numeric value") - } - # Convert to data.table if needed - if (!is.data.table(ph.analysis_set)) setDT(ph.analysis_set) + # Convert to data.table if needed + if (!is.data.table(ph.analysis_set)) setDT(ph.analysis_set) - #parameterization checks + # parameterization checks if("x" %in% ph.analysis_set$trends & (is.null(trend.span) | is.null(trend.periods))) {stop("you have indicated that a trends analysis is to be conducted, but have not indicated both the span and number of periods for this analysis.")} - #ph.analysis_set checks - - - #advisory messages + # advisory messages if("x" %in% ph.analysis_set$trends) {message("Note: trends are applied backwards from end.year")} # Race / ethnicity is a chronic headache with CHI. Need to remove rows for race4 & Ethnicity because should be Race/ethnicity ph.analysis_set <- ph.analysis_set[!(cat1_varname == 'race4' & cat1 == 'Ethnicity')] # apply the template generating function + # generate vector of sets + sets <- unique(ph.analysis_set$set) template <- rbindlist( - lapply(X = seq(1, length(unique(ph.analysis_set$set))), + lapply(X = sets, FUN = chi_process_nontrends, ph.analysis_set = ph.analysis_set)) # split trends from other tabs because processed for multiple years @@ -117,9 +114,9 @@ chi_generate_tro_shell <- function(ph.analysis_set, # add years to template (trends) if(nrow(template.trends) > 0){ trend.years <- chi_process_trends(indicator_key = intersect(unique(template$indicator_key), unique(template.trends$indicator_key)), - trend.span = trend.span, - end.year = end.year, - trend.periods = trend.periods) + trend.span = trend.span, + end.year = end.year, + trend.periods = trend.periods) template.trends <- merge(template.trends, trend.years, by = 'indicator_key', all = T, allow.cartesian = T) # append trends template to main template diff --git a/R/chi_get_cols.R b/R/chi_get_cols.R new file mode 100644 index 0000000..4f40acd --- /dev/null +++ b/R/chi_get_cols.R @@ -0,0 +1,25 @@ +#' Get CHI variable column names +#' +#' @description +#' Returns a character vector of column names defined in the CHI YAML reference file. +#' This helper function provides easy access to the standardized CHI variable names. +#' +#' @param metadata returns metadata column names instead of primary data +#' +#' @return A character vector of column names for the chi data (Default) or metadata +#' @importFrom yaml read_yaml +#' @export +#' +#' @examples +#' cols <- chi_get_cols() +chi_get_cols <- function(metadata = FALSE) { + chi.yaml.filepath <- system.file("ref", "chi_qa.yaml", package = "apde.chi.tools") + if (chi.yaml.filepath == "") { + stop("Could not find reference file chi_qa.yaml") + } + chi.yaml <- yaml::read_yaml(chi.yaml.filepath) + if(metadata){ + return(names(chi.yaml$metadata)) + } + return(names(chi.yaml$vars)) +} diff --git a/R/chi_get_proper_pop.R b/R/chi_get_proper_pop.R new file mode 100644 index 0000000..2f673f9 --- /dev/null +++ b/R/chi_get_proper_pop.R @@ -0,0 +1,541 @@ +#' Get Population Denominators for CHI Analysis +#' +#' @description +#' Retrieves population estimates based on instructions generated by +#' \code{\link{chi_generate_instructions_pop}}. This function processes demographic +#' categories, geographic levels, and time periods to create appropriate +#' population denominators for use in CHI rate calculations. +#' +#' @param pop.template A data.table produced by \code{\link{chi_generate_instructions_pop}}, +#' containing instructions for population data retrieval with the following columns: +#' \itemize{ +#' \item year: Year range (e.g., "2019-2021" or single year) +#' \item cat1, cat1_varname: Primary stratification variables +#' \item cat2, cat2_varname: Secondary stratification variables +#' \item tab: Visualization tab type +#' \item start, stop: Start and end years parsed from the year range +#' \item race_type: Race categorization type ('race', 'race_eth', or 'race_aic') +#' \item geo_type: Geographic level for analysis ('kc', 'hra', 'region', 'blk', 'zip', 'wa') +#' \item group_by1, group_by2: Race/eth grouping specifications ('race_eth', 'race') +#' } +#' @param pop.genders Optional character vector specifying which genders to include. +#' Valid values are 'f', 'female', 'm', 'male'. If NULL (default), both genders are included. +#' @param pop.ages Optional integer vector specifying which ages to include. +#' If NULL (default), ages 0-100 are included. +#' +#' @return A data.table containing population counts with columns: +#' \itemize{ +#' \item chi_age: Single year age +#' \item year: Year +#' \item cat1, cat1_varname, cat1_group: Primary stratification variables +#' \item cat2, cat2_varname, cat2_group: Secondary stratification variables +#' \item tab: Visualization tab type +#' \item pop: Population count +#' } +#' +#' @details +#' The function performs multiple steps to generate proper population denominators: +#' 1. Validates input parameters +#' 2. Creates population tables for each row of the template using rads::get_population +#' 3. Handles special cases for various geographic aggregations and crosswalks +#' 4. Returns a comprehensive, tidy dataset with population counts +#' +#' @seealso +#' \code{\link{chi_generate_instructions_pop}} which generates the instructions used as input +#' to this function +#' +#' @importFrom data.table setDT rbindlist `:=` +#' @importFrom future.apply future_lapply +#' @importFrom tools toTitleCase +#' @importFrom progressr handlers progressor with_progress +#' @export +#' +chi_get_proper_pop <- function(pop.template = NULL, pop.genders = NULL, pop.ages = NULL) { + # Validation of arguments ---- + # pop.template + if (is.null(pop.template)) { + stop("\n\U1F6D1 pop.template parameter is required") + } + + if (!is.data.frame(pop.template)) { + stop("\n\U1F6D1 pop.template must be a data.frame or data.table") + } else { + pop.template <- setDT(copy(pop.template)) + } + + # Check for required columns + required_columns <- c("year", "cat1", "cat1_varname", "cat2", "cat2_varname", + "start", "stop", "race_type", "geo_type", "tab") + missing_columns <- required_columns[!required_columns %in% names(pop.template)] + + if (length(missing_columns) > 0) { + stop("\n\U1F6D1 pop.template is missing required columns: ", + paste(missing_columns, collapse = ", ")) + } + + # pop.genders + if (is.null(pop.genders)) { + gender_values <- c("f", "m") + } else { + if (!tolower(pop.genders) %in% c('f', 'female', 'm', 'male')) { + stop("\n\U0001f47f if pop.genders is specified, it is limited to one of the following values: 'F', 'f', 'Female', 'female', 'M', 'm', 'Male', or 'male'") + } else { + gender_values <- pop.genders + } + } + # pop.ages + if (is.null(pop.ages)) { + age_values <- c(0:100) + } else { + if (!is.integer(pop.ages)) { + stop("\n\U0001f47f if pop.ages is specified it must be vector of integers, e.g., c(0:65)") + } else { + age_values <- pop.ages + } + } + + # Define core population extraction function ---- + get_population_for_template_row <- function(row_index, pop.template = NULL) { + # Status update with progress indicator ---- + print(paste0("Process ID ", Sys.getpid(), ": Getting population ", row_index, " out of ", nrow(pop.template))) + + # Extract current template row ---- + current_row <- pop.template[row_index, ] + + # Standardize category names ---- + # Remove birthing person prefixes to standardize maternal data categories + pop.template[grepl("birthing person", cat1, ignore.case = TRUE), + cat1 := tools::toTitleCase(gsub("Birthing person's ", "", cat1))] + pop.template[grepl("birthing person", cat2, ignore.case = TRUE), + cat2 := tools::toTitleCase(gsub("Birthing person's ", "", cat2))] + + # Build grouping parameters for pop query ---- + grouping_vars <- unique(c( + c("ages", "geo_id"), + setdiff(c(current_row$group_by1, current_row$group_by2), c(NA)) + )) + + # Use rads::get_population() ---- + if (is.na(current_row$geo_type)) { + population_data <- rads::get_population( + group_by = grouping_vars, + race_type = current_row$race_type, + years = current_row$start:current_row$stop, + genders = gender_values, + ages = age_values, + round = FALSE + ) + } else { + population_data <- rads::get_population( + group_by = grouping_vars, + geo_type = current_row$geo_type, + race_type = current_row$race_type, + years = current_row$start:current_row$stop, + genders = gender_values, + ages = age_values, + round = FALSE + ) + } + + # Process demographic categories ---- + # Apply category grouping logic for both primary (cat1) and secondary (cat2) categories + for (catnum in c("cat1", "cat2")) { + ## Add category information from template to result ---- + catvarname <- paste0(catnum, '_varname') + catgroup <- paste0(catnum, '_group') + temp.groupby <- paste0("group_by", gsub('cat', '', catnum)) + + # had to use set function because regular := syntax caused errors b/c used catnum differently on both sides of := + data.table::set(population_data, + j = catnum, + value = current_row[[catnum]]) + + data.table::set(population_data, + j = catvarname, + value = current_row[[catvarname]]) + + data.table::set(population_data, + j = catgroup, + value = current_row[[temp.groupby]]) + + ## Process geographic categories ---- + # King County + population_data[get(catnum) == "King County", + c(catgroup) := "King County"] + + # Washington State + population_data[get(catnum) == "Washington State", + c(catgroup) := "Washington State"] + + # Handle NA values + suppressWarnings( + population_data[get(catnum) == "NA" | is.na(get(catnum)), + c(catnum, catgroup, catvarname) := "NA"] + ) + + # Cities/neighborhoods and Regions + population_data[get(catnum) %in% c("Cities/neighborhoods", "Regions") & + current_row$geo_type != 'blk', + c(catgroup) := geo_id] + + ## Process gender ---- + population_data[get(catnum) %in% c("Gender"), c(catgroup) := gender] + + ## Process 'Overall' ---- + population_data[get(catnum) %in% c("Overall"), c(catgroup) := "Overall"] + + ## Process race/ethnicity categories ---- + population_data[get(catnum) == "Ethnicity" | get(catvarname) %in% c('race4'), + c(catgroup) := race_eth] + + population_data[get(catnum) == 'Race' & get(catvarname) %in% c('race3'), + c(catgroup) := race] + + population_data <- population_data[get(catnum) != "Ethnicity" | (get(catnum) == "Ethnicity" & get(catgroup) == 'Hispanic'), ] + + population_data[get(catgroup) == "Multiple race", + c(catgroup) := "Multiple"] + + ## Process race_aic (alone or in combination) categories ---- + if (current_row$race_type == 'race_aic') { + # Filter to keep only relevant race_aic combinations + population_data <- population_data[ + !(grepl('_aic_', get(catvarname)) & + !((get(catvarname) == 'chi_race_aic_aian' & race_aic == 'AIAN') | + (get(catvarname) == 'chi_race_aic_asian' & race_aic == 'Asian') | + (get(catvarname) == 'chi_race_aic_black' & race_aic == 'Black') | + (get(catvarname) == 'chi_race_aic_his' & race_aic == 'Hispanic') | + (get(catvarname) == 'chi_race_aic_nhpi' & race_aic == 'NHPI') | + (get(catvarname) == 'chi_race_aic_wht' & race_aic == 'White')) + ) + ] + + # Assign race_aic value to group + population_data[grep('_aic', get(catvarname)), + c(catgroup) := race_aic] + } + + ## Process HRAs ---- + if (population_data[1, geo_type] == 'blk' & population_data[1, get(catnum)] == 'Cities/neighborhoods') { + + hra_crosswalk <- rads.data::spatial_block20_to_hra20_to_region20[, list(geo_id = GEOID20, hra20_name)] + + population_data <- merge(population_data, + hra_crosswalk, + by = "geo_id", + all.x = TRUE, + all.y = FALSE) + + population_data[, c(catgroup) := hra20_name] + } + + ## Process Regions ---- + # Block to Region crosswalk + if (population_data[1, geo_type] == 'blk' & population_data[1, get(catnum)] == 'Regions') { + + region_crosswalk <- rads.data::spatial_block20_to_hra20_to_region20[, list(geo_id = GEOID20, region_name)] + + population_data <- merge(population_data, + region_crosswalk, + by = 'geo_id', + all.x = TRUE, + all.y = FALSE) + + population_data[, c(catgroup) := region_name] + } + + # HRA to Region crosswalk + if (population_data[1, geo_type] == 'hra' & population_data[1, get(catnum)] == 'Regions') { + + region_crosswalk <- rads.data::spatial_hra20_to_region20[, list(geo_id = hra20_name, region_name)] + + population_data <- merge(population_data, + region_crosswalk, + by = 'geo_id', + all.x = TRUE, + all.y = FALSE) + + population_data[, c(catgroup) := region_name] + } + + # ZIP to Region crosswalk with population weighting + if (population_data[1, geo_type] == 'zip' & population_data[1, get(catnum)] == 'Regions') { + + # Create ZIP to region crosswalk with population weights + zip_region_crosswalk <- rads.data::spatial_zip_to_hra20_pop + + zip_region_crosswalk <- merge(zip_region_crosswalk, + rads.data::spatial_hra20_to_region20[, list(hra20_name, region = region_name)], + by = 'hra20_name', + all = TRUE) + + # Aggregate fractional populations to region level + zip_region_crosswalk <- zip_region_crosswalk[,list(s2t_fraction = sum(s2t_fraction)), + list(geo_id = as.character(source_id), region)] + + # Apply population weighting by region + population_data <- merge(population_data, + zip_region_crosswalk, + by = "geo_id", + all.x = TRUE, + all.y = FALSE, + allow.cartesian = TRUE) + population_data[, pop := pop * s2t_fraction] # Apply weight to population + population_data[, c(catgroup) := region] + } + + ## Process Big Cities ---- + if (population_data[1, get(catnum)] == 'Big cities') { + # Block to big city crosswalk + if (population_data[1, geo_type] == 'blk') { + + # Two-step crosswalk: block to HRA to big city + block_to_hra <- rads.data::spatial_block20_to_hra20_to_region20[, list(geo_id = GEOID20, hra20_name)] + hra_to_bigcity <- rads.data::spatial_hra20_to_bigcities[, list(hra20_name, bigcity)] + + block_to_bigcity <- merge(hra_to_bigcity, + block_to_hra, + by = 'hra20_name', + all.x = T, + all.y = F)[, hra20_name := NULL] + + population_data <- merge(population_data, + block_to_bigcity, + by = "geo_id", + all.x = TRUE, + all.y = FALSE) + } + + # HRA to big city crosswalk + if (population_data[1, geo_type] == 'hra') { + hra_to_bigcity <- rads.data::spatial_hra20_to_bigcities[, list(hra20_name, bigcity)] + population_data <- merge(population_data, + hra_to_bigcity, + by.x = 'geo_id', + by.y = 'hra20_name', + all.x = TRUE, + all.y = FALSE) + } + + # Assign big city name to group + population_data[, c(catgroup) := bigcity] + } + + ## Process age groupings ---- + # Age 6 groups: <18, 18-24, 25-44, 45-64, 65-74, 75+ + population_data[get(catvarname) == "age6" & age %in% 0:17, + c(catgroup) := "<18"] + population_data[get(catvarname) == "age6" & age %in% 18:24, + c(catgroup) := "18-24"] + population_data[get(catvarname) == "age6" & age %in% 25:44, + c(catgroup) := "25-44"] + population_data[get(catvarname) == "age6" & age %in% 45:64, + c(catgroup) := "45-64"] + population_data[get(catvarname) == "age6" & age %in% 65:74, + c(catgroup) := "65-74"] + population_data[get(catvarname) == "age6" & age >= 75, + c(catgroup) := "75+"] + + # Maternal age 5 groups: 10-17, 18-24, 25-34, 35-44, 45+ + population_data[get(catvarname) == "mage5" & age %in% 10:17, + c(catgroup) := "10-17"] + population_data[get(catvarname) == "mage5" & age %in% 18:24, + c(catgroup) := "18-24"] + population_data[get(catvarname) == "mage5" & age %in% 25:34, + c(catgroup) := "25-34"] + population_data[get(catvarname) == "mage5" & age %in% 35:44, + c(catgroup) := "35-44"] + population_data[get(catvarname) == "mage5" & age >= 45, + c(catgroup) := "45+"] + + # Youth age 4 groups: 0-4, 5-9, 10-14, 15-17 + population_data[get(catvarname) == "yage4" & age %in% 0:4, + c(catgroup) := "0-4"] + population_data[get(catvarname) == "yage4" & age %in% 5:9, + c(catgroup) := "5-9"] + population_data[get(catvarname) == "yage4" & age %in% 10:14, + c(catgroup) := "10-14"] + population_data[get(catvarname) == "yage4" & age %in% 15:17, + c(catgroup) := "15-17"] + + ## Process poverty groupings ---- + # Block level poverty + if (population_data[1, geo_type] == 'blk' & + grepl("poverty$", population_data[1, get(catnum)], ignore.case = TRUE)) { + # Extract tract ID from block ID (first 11 characters) + population_data[, geo_tract2020 := substr(geo_id, 1, 11)] + + # Join poverty group data + population_data <- merge( + population_data, + rads.data::misc_poverty_groups[geo_type == 'Tract'][, list(geo_tract2020 = geo_id, pov200grp)], + by = "geo_tract2020", + all.x = TRUE, + all.y = FALSE + ) + + # Assign poverty group + population_data[, c(catgroup) := pov200grp] + } + + # ZIP level poverty + if (population_data[1, geo_type] == 'zip' & + grepl("poverty$", population_data[1, get(catnum)], ignore.case = TRUE)) { + # Join poverty group data + population_data <- merge( + population_data, + rads.data::misc_poverty_groups[geo_type == 'ZCTA'][, list(geo_id, pov200grp)], + by = 'geo_id', + all.x = TRUE, + all.y = FALSE + ) + + # Assign poverty group + population_data[, c(catgroup) := pov200grp] + } + + } # close looping over cat1/cat2 + + # Filter and clean results ---- + # Remove rows with missing primary category group + population_data <- population_data[!is.na(cat1_group)] + + # Remove rows with missing secondary category group (when category exists) + population_data <- population_data[is.na(cat2) | cat2 == 'NA' | + (!is.na(cat2) & cat2 != 'NA' & !is.na(cat2_group) & cat2_group != 'NA'),] + + # Aggregate population data ---- + # Collapse to one row per demographic combination with sum of population + population_data <- population_data[, list(pop = sum(pop)), + list(chi_age = age, + year, + cat1, cat1_varname, cat1_group, + cat2, cat2_varname, cat2_group)] + + # Generate complete demographic combinations ---- + # Function to handle age group processing and demographic merging + process_age_category <- function(population_data, cat_num) { + # Define prefix and complementary prefix + cat_prefix <- paste0("cat", cat_num) + other_cat_prefix <- if(cat_num == 1) "cat2" else "cat1" # need to select the cat that does not have the age var + + # Get the variable name for this category + cat_varname <- population_data[1][[paste0(cat_prefix, "_varname")]] + + # Create appropriate age groups based on cat_varname + if (cat_varname == 'age6') { + age_groups <- data.table( + chi_age = 0:100 + ) + age_groups[, paste0(cat_prefix, "_group") := cut( + chi_age, + breaks = c(-1, 17, 24, 44, 64, 74, 120), + labels = c("<18", "18-24", "25-44", "45-64", "65-74", "75+") + )] + } else if (cat_varname == 'mage5') { + age_groups <- data.table( + chi_age = 10:100 + ) + age_groups[, paste0(cat_prefix, "_group") := cut( + chi_age, + breaks = c(9, 17, 24, 34, 44, 120), + labels = c('10-17', '18-24', '25-34', '35-44', '45+') + )] + } else if (cat_varname == 'yage4') { + age_groups <- data.table( + chi_age = 0:17 + ) + age_groups[, paste0(cat_prefix, "_group") := cut( + chi_age, + breaks = c(-1, 4, 9, 14, 17), + labels = c('0-4', '5-9', '10-14', '15-17') + )] + } + + # Add category info + age_groups[, (cat_prefix) := "Age"] + age_groups[, paste0(cat_prefix, "_varname") := cat_varname] + + # Combine demographic dimensions with age groups + cols_to_select <- c("year", other_cat_prefix, + paste0(other_cat_prefix, "_varname"), + paste0(other_cat_prefix, "_group")) + + unique_pop_data <- unique(population_data[, cols_to_select, with = FALSE][, mykey := 1]) + + complete_demographics <- unique_pop_data[age_groups[, mykey := 1], on = "mykey", allow.cartesian = TRUE] + complete_demographics[, mykey := NULL] + + return(complete_demographics) + } + + # Use function to handle age group processing + if (population_data[1]$cat1 == "Age") { + complete_demographics <- process_age_category(population_data, 1) + } + + if (population_data[1]$cat2 == "Age") { + complete_demographics <- process_age_category(population_data, 2) + } + + if (population_data[1]$cat1 != "Age" & population_data[1]$cat2 != "Age"){ + # Get unique cat1 groups + cat1_groups <- unique(population_data[, list(cat1, cat1_varname, cat1_group, mykey = 1)]) + + # Get unique cat2 groups + cat2_groups <- unique(population_data[, list(cat2, cat2_varname, cat2_group, mykey = 1)]) + + # All combos of cat1 and cat2 groups + complete_demographics <- cat1_groups[cat2_groups, on = "mykey", allow.cartesian = TRUE] + + # Create year and age combos + year_age <- data.table(year = as.character(current_row$year), chi_age = 0:100, mykey = 1) + + # Get combos for each year/age cat1/cat2 combo + complete_demographics <- complete_demographics[year_age, on = "mykey", allow.cartesian = TRUE] + + # Drop key + complete_demographics[, mykey := NULL] + } + + + + # Merge population data with complete demographics grid ---- + population_data <- suppressWarnings(merge(population_data, complete_demographics, all = TRUE)) + + population_data[is.na(pop), pop := 0] # Fill missing population values with zero + + # Add tab column and finalize ---- + population_data[, tab := current_row$tab] + + # Convert placeholder "NA" strings back to true NA values ---- + population_data[cat2 == "NA", c("cat2", "cat2_varname", "cat2_group") := NA] + + # Return completed population dataset ---- + return(population_data) + } + + # Process all template rows in parallel ---- + # Combine results from all template rows using future_lapply for parallel processing + progressr::handlers(handler_progress()) + progressr::with_progress({ + p <- progressr::progressor(nrow(pop.template)) + all_population_data <- rbindlist( + future_lapply( + X = as.list(seq(1, nrow(pop.template))), + FUN = function(row_index) { + p(paste0("Processing row ", row_index, " of ", nrow(pop.template) )) + set.seed(98104) # Set consistent seed for reproducibility + get_population_for_template_row(row_index, pop.template = pop.template) + }, + future.seed = 98104 # Set seed for parallel processes + ) + ) + }) # closet progressr::with_progress() + + # Remove duplicate rows in final dataset + all_population_data <- unique(all_population_data) + + # Return final population dataset ---- + return(all_population_data) +} diff --git a/R/chi_get_yaml.R b/R/chi_get_yaml.R index 7d7c813..9ad1ebf 100644 --- a/R/chi_get_yaml.R +++ b/R/chi_get_yaml.R @@ -1,23 +1,3 @@ -#' Get CHI variable column names -#' -#' Returns a character vector of column names defined in the CHI YAML reference file. -#' This helper function provides easy access to the standardized CHI variable names. -#' -#' @return A character vector of column names -#' @importFrom yaml read_yaml -#' @export -#' -#' @examples -#' cols <- chi_get_cols() -chi_get_cols <- function() { - chi.yaml.filepath <- system.file("ref", "chi_qa.yaml", package = "apde.chi.tools") - if (chi.yaml.filepath == "") { - stop("Could not find reference file chi_qa.yaml") - } - chi.yaml <- yaml::read_yaml(chi.yaml.filepath) - return(names(chi.yaml$vars)) -} - #' Get CHI YAML configuration #' #' Returns the complete CHI YAML configuration as a list. diff --git a/R/chi_keep_proper_ages.R b/R/chi_keep_proper_ages.R new file mode 100644 index 0000000..3e39880 --- /dev/null +++ b/R/chi_keep_proper_ages.R @@ -0,0 +1,151 @@ +# chi_keep_proper_ages ---- +#' Keep data for appropriate ages only +#' +#' @description +#' Internal function to filter data based on age groups in cat1_group and cat2_group, +#' preserving only the rows where chi_age falls within the implied age ranges of +#' any age groups in the data. +#' +#' @param ph.data A data.table containing the data to be filtered. Should have the +#' following columns: cat1, cat1_varname, cat1_group, cat2, cat2_varname, cat2_group, +#' indicator_key, year, chi_age, tab -- all of the appropriate classes. +#' +#' @return A filtered data.table with only the rows where chi_age is compatible +#' with the age ranges implied by age groups. +#' +#' @details +#' When \code{cat# == 'Age'}, the function parses \code{cat#_group} labels (e.g., +#' "<18", "18-24", "75+") to determine minimum and maximum ages, then filters the +#' data accordingly. It handles age groups in both cat1 and cat2 categories. +#' +#' @importFrom data.table is.data.table as.data.table rbindlist setnames data.table +#' +#' @examples +#' library(data.table) +#' +#' # Make test data with all combinations of categories and age +#' test_data <- CJ( +#' cat1 = "Gender", +#' cat1_varname = "chi_sex", +#' cat1_group = c("Female", "Male"), +#' cat2 = "Age", +#' cat2_varname = "yage4", +#' cat2_group = c("0-4", "5-9", "10-14"), +#' chi_age = 1:20, +#' unique = TRUE +#' ) +#' +#' # Add extra required columns +#' test_data[, `:=`( +#' indicator_key = "indicator1", +#' year = "2023", +#' tab = "mytab", +#' count = 1 +#' )] +#' +#' # Example usage +#' filtered_data <- chi_keep_proper_ages(test_data) +#' filtered_data[cat2_group == "10-14", unique(chi_age)] +#' +#' @export +chi_keep_proper_ages <- function(ph.data = NULL) { + # Validate inputs + if (is.null(ph.data)) { + stop("\n\U1F6D1 ph.data must be provided") + } + + if (!data.table::is.data.table(ph.data)) { + ph.data <- data.table::as.data.table(ph.data) + } + + # Check for required columns + required_columns <- c("cat2_group", "cat1_group", "indicator_key", "year", + "chi_age", "tab", "cat1", "cat1_varname", + "cat2", "cat2_varname") + + missing_columns <- required_columns[!required_columns %in% names(ph.data)] + if (length(missing_columns) > 0) { + stop(paste0("\n\U1F6D1 ph.data is missing required columns: ", + paste(missing_columns, collapse = ", "))) + } + + # Check expected column classes + expected_classes <- list( + cat1 = c("character", "factor"), + cat1_varname = c("character", "factor"), + cat1_group = c("character", "factor"), + cat2 = c("character", "factor"), + cat2_varname = c("character", "factor"), + cat2_group = c("character", "factor"), + indicator_key = c("character", "factor"), + year = "character", + chi_age = c("numeric", "integer"), + tab = c("character", "factor") + ) + + for (col_name in names(expected_classes)) { + expected <- expected_classes[[col_name]] + valid_class <- FALSE + + # Check if column inherits from any of the expected classes + for (expected_class in expected) { + if (inherits(ph.data[[col_name]], expected_class)) { + valid_class <- TRUE + break # once a valid class is established, no need to keep checking for that col_name + } + } + + if (isFALSE(valid_class)) { + expected_str <- paste(expected, collapse = " or ") + actual_class <- class(ph.data[[col_name]])[1] # Get primary class + stop(paste0("\n\U1F6D1 Column '", col_name, "' should be of class '", + expected_str, "', but is '", actual_class, "'")) + } + } + + # Make function used in lapply below to get min and max age from age group + parse_age_group <- function(group) { + + group <- gsub(" ", "", group) + if (grepl("^<[0-9]+$", group)) { + return(list(c(NA, as.numeric(sub("<", "", group)) - 1))) + } else if (grepl("^[0-9]+-[0-9]+$", group)) { + return(list(as.numeric(unlist(strsplit(group, "-"))))) + } else if (grepl("^[0-9]+\\+$", group)) { + return(list(c(as.numeric(sub("\\+", "", group)), NA))) + } else { + return(list(c(NA, NA))) + } + } + + + # Create reference table of age groups and corresponding min_age and max_age + all_groups <- unique(c(ph.data$cat1_group[ph.data$cat1 == "Age"], + ph.data$cat2_group[ph.data$cat2 == "Age"])) + + if(length(all_groups) == 0){return(ph.data)} # no need to continue if cat1 or cat2 are not 'Age' + + all_bounds <- rbindlist(lapply(all_groups, function(mygroup) { + bounds <- parse_age_group(mygroup)[[1]] + data.table(age_group = mygroup, min_age = bounds[1], max_age = bounds[2]) + })) + + # Merge cat1 bounds + setnames(all_bounds, c("age_group", "min_age", "max_age"), c("cat1_group", "min1", "max1")) + ph.data <- merge(ph.data, all_bounds, by = "cat1_group", all.x = TRUE, sort = FALSE) + + # Merge cat2 bounds + setnames(all_bounds, c("cat1_group", "min1", "max1"), c("cat2_group", "min2", "max2")) + ph.data <- merge(ph.data, all_bounds, by = "cat2_group", all.x = TRUE, sort = FALSE) + + # Create filter to identify rows to drop + ph.data[, keepme := TRUE] + + ph.data[cat1 == "Age", keepme := keepme & + (is.na(min1) | chi_age >= min1) & (is.na(max1) | chi_age <= max1)] + ph.data[cat2 == "Age", keepme := keepme & + (is.na(min2) | chi_age >= min2) & (is.na(max2) | chi_age <= max2)] + + # Clean up and return + return(ph.data[keepme == TRUE][, c("keepme", "min1", "max1", "min2", "max2") := NULL]) +} diff --git a/R/chi_qa_tro.R b/R/chi_qa_tro.R index 17a1db1..4bb63f9 100644 --- a/R/chi_qa_tro.R +++ b/R/chi_qa_tro.R @@ -148,7 +148,7 @@ chi_qa_tro <- function(CHIestimates, } } - missing.var <- setdiff(names(CHImetadata), names(unlist(chi_get_yaml()$metadata))) + missing.var <- setdiff(names(CHImetadata), chi_get_cols(metadata = TRUE)) if(length(missing.var) > 0){ status <- 0 if(verbose){ diff --git a/R/globals.R b/R/globals.R index d646335..e51db41 100644 --- a/R/globals.R +++ b/R/globals.R @@ -17,6 +17,7 @@ utils::globalVariables(c( "chi_age", "chi_geo_kc", "chi_year", + "combo_idx", "comparison_with_kc_sig", "count", "crosstabs", @@ -35,8 +36,13 @@ utils::globalVariables(c( "group", "group_by1", "group_by2", + "hospitalizations", + "hospitalizations_icd9", + "hospitalizations_icd10", "hra20_name", + "icd_version", "indicator_key", + "intent", "keepme", "latest_year", "latest_year_count", @@ -48,8 +54,18 @@ utils::globalVariables(c( "latest_yearx", "level", "lower_bound", + "mechanism", + "min1", + "min2", + "min_age", + "max1", + "max2", + "max_age", + "mykey", "notable", "numerator", + "original_year_range", + "overall", "pattern", "pop", "pov200grp", @@ -86,10 +102,13 @@ utils::globalVariables(c( "valid_years", "varname", "vebrose", + "wastate", + "year_range", "year.span", "year", "_kingcounty.x", "_kingcounty.y", "_wastate.x", - "_wastate.y" + "_wastate.y", + ".I" )) diff --git a/R/proto_chi_count_by_age.R b/R/proto_chi_count_by_age.R deleted file mode 100644 index 67c1bcb..0000000 --- a/R/proto_chi_count_by_age.R +++ /dev/null @@ -1,132 +0,0 @@ -#' Generate Age-Specific Counts for CHI Data -#' -#' @description -#' Creates a detailed breakdown of counts by age for CHI data analysis, most -#' often for age standardization.Processes data according to provided -#' instructions and handles demographic groupings. -#' -#' @param ph.data Input data frame or data table containing CHI data -#' @param ph.instructions Data frame or data table containing calculation instructions -#' @param source_date Date of data source -#' -#' @return A data.table containing age-specific counts with standard CHI groupings -#' @importFrom data.table setDT rbindlist setnames := setorder data.table -#' @importFrom rads calc -#' @importFrom future.apply future_lapply -#' @importFrom tidyr crossing -#' @export -#' -chi_count_by_age <- function(ph.data = NULL, - ph.instructions = NULL, - source_date = NULL){ - # Create 'Overall' if needed for crosstabs ---- - if(!'overall' %in% names(ph.data)){ - ph.data$overall <- with(ph.data, ifelse(chi_geo_kc == 'King County', 'Overall', NA_character_)) - } - - # Check to make sure all variables needed exist in the data ---- - neededbyvars <- setdiff(unique(c(ph.instructions$cat1_varname, ph.instructions$cat2_varname)), c(NA)) - if("race3" %in% neededbyvars & !"race3_hispanic" %in% neededbyvars){neededbyvars <- c(neededbyvars, 'race3_hispanic')} # By definition, Hispanic cannot be contained within race3 - - neededvars <- setdiff(unique(c(ph.instructions$indicator_key, neededbyvars)), c(NA)) - - missingvars <- setdiff(neededvars, names(ph.data)) - if(length(missingvars) > 0 ){ - stop(paste0("\n\U2620 ph.data is missing the following columns that are specified in ph.instructions: ", paste0(missingvars, collapse = ', '), ". ", - "\nIf `race3_hispanic` is listed, that is because, by definition, `race3` cannot have a Hispanic ethnicity in the same variable. So, two ", - "\nvariables (`race3` & `race3_hispanic`) will be processed and in the output, it will be called `race3`")) - } else{message("\U0001f642 All specified variables exist in ph.data")} - - # Check to make sure all byvariables have the CHI specified encoding ---- - stdbyvars <- rads.data::misc_chi_byvars[varname %in% setdiff(unique(c(ph.instructions$cat1_varname, ph.instructions$cat2_varname)), c(NA))][, list(varname, group, keepme, reference = 1)] - stdbyvars[group %in% c("Hispanic", 'Non-Hispanic') & varname == 'race3', varname := 'race3_hispanic'] # necessary because race3 & Hispanic must be two distinct variables in raw data - phbyvars <- rbindlist(lapply( - X=as.list(neededbyvars), - FUN = function(X){data.table(varname = X, group = setdiff(unique(ph.data[[X]]), NA), ph.data = 1)})) - compbyvars <- merge(stdbyvars, phbyvars, by = c('varname', 'group'), all = T) - if(nrow(compbyvars[is.na(reference)| is.na(ph.data)]) > 0){ - print(compbyvars[is.na(reference)| is.na(ph.data)]) - stop("\n\U2620 the table above shows the varname/group combinations that do not align between the reference table and your ph.data.") - } else {message("\U0001f642 All specified cat1_group and cat2_group values align with the reference standard.")} - - # Cycle through a function that generates counts by age ---- - message("\U023F3 Be patient! The function is generating counts for each row of ph.instructions.") - tempCHIcount <- rbindlist(future_lapply( - X = as.list(seq(1, nrow(ph.instructions), 1)), - FUN = function(X){ - message(paste0("Calculating estimates for ph.instructions row ", X, " of ", nrow(ph.instructions), "...")) - - # create constants for calc---- - tempbv1 <- setdiff(ph.instructions[X][['cat1_varname']], c()) - tempbv2 <- setdiff(ph.instructions[X][['cat2_varname']], c()) - if(length(tempbv2) == 0){tempbv2 = NA} - tempbv <- setdiff(c(tempbv1, tempbv2), c(NA)) - tempbv <- c(tempbv, "chi_age") - - # create variables of interest used in calc function below - tempbv <- tempbv - tempend <- ph.instructions[X][['end']] - tempstart <- ph.instructions[X][['start']] - - # use calc---- - if(any(grepl('wastate', tempbv))){ - tempcount <- rads::calc(ph.data = ph.data, - what = ph.instructions[X][['indicator_key']], - where = chi_year >= tempstart & chi_year <= tempend, - by = tempbv, - metrics = c('numerator')) - } else { - tempcount <- rads::calc(ph.data = ph.data, - what = ph.instructions[X][['indicator_key']], - where = chi_year >= tempstart & chi_year <= tempend & chi_geo_kc == 'King County', - by = tempbv, - metrics = c('numerator')) - } - - # tidy---- - tempcount[, cat1 := ph.instructions[X][['cat1']]] - setnames(tempcount, ph.instructions[X][['cat1_varname']], 'cat1_group') - tempcount[, cat1_varname := ph.instructions[X][['cat1_varname']]] - tempcount[, cat2 := ph.instructions[X][['cat2']]] - if(!is.na(tempbv2) & tempbv1 != tempbv2){ - setnames(tempcount, ph.instructions[X][['cat2_varname']], 'cat2_group')} else{ - tempcount[, cat2_group := NA] } - tempcount[, cat2_varname := ph.instructions[X][['cat2_varname']]] - - tempcount <- tempcount[!is.na(cat1_group)] - tempcount <- tempcount[!(is.na(cat2_group) & !is.na(cat2))] - tempcount <- tempcount[!is.na(chi_age)] - tempcount <- tempcount[, list(cat1, cat1_varname, cat1_group, cat2, cat2_varname, cat2_group, chi_age, count = numerator)] - - # create reference table with every combination of cat1 x cat2 and age---- - cat1table <- data.table(cat1 = ph.instructions[X][['cat1']], - cat1_varname = tempbv1, - cat1_group = sort(setdiff(as.character(unique(ph.data[[tempbv1]])), NA)) ) - cat2table <- suppressWarnings(data.table(cat2 = ph.instructions[X][['cat2']], - cat2_varname = tempbv2, - cat2_group = sort(setdiff(as.character(unique(ph.data[[tempbv2]])), NA))) ) - cattable <- tidyr::crossing(cat1table, cat2table) - cattable <- setDT(tidyr::crossing(cattable, data.table(chi_age = 0:100))) - - # merge the counts onto the reference table to get every combo of age x cat1 x cat2---- - tempcount <- merge(cattable, tempcount, by = c('cat1', 'cat1_varname', 'cat1_group', 'cat2', 'cat2_varname', 'cat2_group', 'chi_age'), all = T) - tempcount[is.na(count), count := 0] # when count is NA, it is actually zero and zero is needed for calculating age adjusted rates - - # add on remaining essential identifiers---- - tempcount[, indicator_key := ph.instructions[X][['indicator_key']]] - tempcount[, tab := ph.instructions[X][['tab']]] - tempcount[ph.instructions[X][['end']] != ph.instructions[X][['start']], - year := paste0(ph.instructions[X][['start']], "-", ph.instructions[X][['end']])] - tempcount[ph.instructions[X][['end']] == ph.instructions[X][['start']], - year := ph.instructions[X][['end']]] - - # order output---- - tempcount <- tempcount[, list(indicator_key, year, tab, cat1, cat1_varname, cat1_group, cat2, cat2_varname, cat2_group, chi_age, count)] - setorder(tempcount, cat1_group, cat2_group, chi_age) - - } - ), use.names = TRUE) - - # Return ---- - return(tempCHIcount) -} diff --git a/R/proto_chi_drop_illogical_ages.R b/R/proto_chi_drop_illogical_ages.R deleted file mode 100644 index 84a10cf..0000000 --- a/R/proto_chi_drop_illogical_ages.R +++ /dev/null @@ -1,27 +0,0 @@ -#' Drop Illogical Age Combinations from CHI Data -#' -#' @description -#' Removes age combinations that don't make logical sense based on category -#' groupings. For example, removes cases where the age falls outside the range -#' specified by age group categories. -#' -#' @param ph.data Input data.table to process -#' @param agevar Name of the age variable (defaults to 'chi_age') -#' -#' @return A data.table with illogical age combinations removed -#' @importFrom data.table copy := fcase between -#' @export -#' - -chi_drop_illogical_ages <- function(ph.data, agevar = 'chi_age'){ - ph.data = copy(ph.data) - for(CatNum in c("cat1", "cat2")){ - ph.data[, paste0(CatNum, '_group_temp') := fcase(get(paste0(CatNum, "_group")) == '<1', '0-0', # <1 is special! - get(CatNum) %in% c("Age", "Birthing person's age"), gsub("<", "0-", gsub("\\+", "-120", get(paste0(CatNum, '_group')))))] - ph.data[, AgeMin := gsub("-.*", "", get(paste0(CatNum, '_group_temp')))] - ph.data[, AgeMax := gsub(".*-", "", get(paste0(CatNum, '_group_temp')))] - ph.data <- ph.data[!get(CatNum) %in% c("Age", "Birthing person's age") | between(get(agevar), AgeMin, AgeMax)] - ph.data[, c("AgeMin", paste0(CatNum, '_group_temp'), "AgeMax") := NULL] - } - return(ph.data) -} diff --git a/R/proto_chi_generate_instructions_pop.R b/R/proto_chi_generate_instructions_pop.R deleted file mode 100644 index e679bc7..0000000 --- a/R/proto_chi_generate_instructions_pop.R +++ /dev/null @@ -1,67 +0,0 @@ -#' Generate Population Instructions for CHI Analysis -#' -#' @description -#' Creates a instructions for rads::get_population() based on count data -#' specifications. Handles various geographic types and demographic groupings. -#' -#' @param mycount.data Input data.table containing count data specifications -#' @param povgeo Geographic level for poverty analysis (NA or 'zip') -#' -#' @return A data.table containing population processing instructions -#' @importFrom data.table copy `:=` setorder tstrsplit -#' @importFrom tools toTitleCase -#' @export -#' - -chi_generate_instructions_pop <- function(mycount.data, povgeo = NA){ - pop.template <- copy(mycount.data) - pop.template <- unique(copy(pop.template)[, list(year, cat1, cat1_varname, cat2, cat2_varname, tab)]) - pop.template[, c("start", "stop") := tstrsplit(year, split = '-') ] - pop.template[is.na(stop), stop := start] # need to have an end date even if it is just one year - - pop.template[, race_type := 'race_eth'] # by default has race and OMB 97 with Hispanic as race - - # Drop prefix when using maternal data because do not want to create multiple alternative codings below ---- - pop.template[grepl("birthing person", cat1, ignore.case = T), cat1 := tools::toTitleCase(gsub("Birthing person's ", "", cat1))] - pop.template[grepl("birthing person", cat2, ignore.case = T), cat2 := tools::toTitleCase(gsub("Birthing person's ", "", cat2))] - - # Create geo_type & group_by arguments ---- - omb_aic <- c("chi_race_aic_aian", "chi_race_aic_asian", "chi_race_aic_black", "chi_race_aic_his", "chi_race_aic_nhpi", "chi_race_aic_wht") - - for(catnum in c("1", "2")){ - temp.cat <- paste0("cat", catnum) - pop.template[get(temp.cat) == "Cities/neighborhoods", geo_type := "hra"] - pop.template[get(paste0(temp.cat, "_varname")) == "race3", c("race_type", paste0("group_by", catnum)) := 'race'] - pop.template[get(paste0(temp.cat, "_varname")) == "race4", c("race_type", paste0("group_by", catnum)) := 'race_eth'] - pop.template[get(paste0(temp.cat, "_varname")) %in% omb_aic, c("race_type", paste0("group_by", catnum)) := 'race_aic'] - - # the only AIC race/eth with pop data are the standard OMB categories - pop.template <- pop.template[!(grepl('_aic_', get(paste0(temp.cat, "_varname"))) & !get(paste0(temp.cat, "_varname")) %in% omb_aic)] - - pop.template[get(temp.cat) == "Ethnicity", c("race_type", paste0("group_by", catnum)) := 'race_eth'] - pop.template[get(temp.cat) == "Gender", paste0("group_by", catnum) := 'genders'] - pop.template[get(temp.cat) %in% c("Race", "Race/ethnicity") & get(paste0(temp.cat, "_varname")) == 'race4', - paste0("group_by", catnum) := 'race_eth'] - pop.template[(get(temp.cat) == "Race" & get(paste0(temp.cat, "_varname")) == 'race3') , - paste0("group_by", catnum) := 'race'] - pop.template[get(temp.cat) == "Regions" & (is.na(geo_type) | geo_type != 'hra'), `:=` (geo_type = "region")] - pop.template[get(temp.cat) == "Big cities", `:=` (geo_type = "hra")] - pop.template[get(temp.cat) == "Washington State", `:=` (geo_type = "wa")] - } - - pop.template[grepl("poverty$", cat1, ignore.case = T) | grepl("poverty$", cat2, ignore.case = T), geo_type := "blk"] - if(povgeo == 'zip'){ - pop.template[grepl("poverty$", cat1, ignore.case = T) | grepl("poverty$", cat2, ignore.case = T), geo_type := "zip"] - } - pop.template[(cat1 == "Regions" & cat2 == "Cities/neighborhoods") | - (cat2 == "Regions" & cat1 == "Cities/neighborhoods"), - geo_type := "blk"] - - # the only AIC race/eth with population data are the OMB standard categories - - pop.template[is.na(cat2), cat2 := "NA"] # temporarily set NA to "NA" to facilitate processing with function - - pop.template[is.na(geo_type), geo_type := 'kc'] # when not specified, it is for KC - - pop.template <- unique(pop.template) # because want to minimize the calls to get_popualation to improve speed -} diff --git a/R/proto_chi_get_proper_pop.R b/R/proto_chi_get_proper_pop.R deleted file mode 100644 index dff8d76..0000000 --- a/R/proto_chi_get_proper_pop.R +++ /dev/null @@ -1,277 +0,0 @@ -# CHI_get_proper_pop() - function to get population for a single row specified by the output of CHI_generate_instructions_pop() ---- -chi_get_proper_pop <- function(pop.template = NULL, pop.genders = NULL, pop.ages = NULL){ - # check for valid values of pop.genders ---- - if(is.null(pop.genders)){gendery = c("f", "m") - }else{if(!tolower(pop.genders) %in% c('f', 'female', 'm', 'male')){ - stop("\n\U0001f47f if pop.genders is specified it must have one of the following values: 'F', 'f', 'Female', 'female', 'M', 'm', 'Male', or 'male'") - } else {gendery = pop.genders}} - - # check for valid values of pop.ages ---- - if(is.null(pop.ages)){agesy = c(0:100) - }else{if(!is.integer(pop.ages)){ - stop("\n\U0001f47f if pop.ages is specified it must be vector of integers, e.g., c(0:65)") - } else {agesy = pop.ages}} - - # create function to generate the population table corresponding to each row of the pop.template---- - CHI_get_proper_pop_engine <- function(X, pop.template = NULL){ - # Status updates ---- - print(paste0("Process ID ", Sys.getpid(), ": Getting population ", X, " out of ", nrow(pop.template))) - - # Drop prefix when using maternal data because do not want to create multiple alternative codings below ---- - pop.template[grepl("birthing person", cat1, ignore.case = T), cat1 := tools::toTitleCase(gsub("Birthing person's ", "", cat1))] - pop.template[grepl("birthing person", cat2, ignore.case = T), cat2 := tools::toTitleCase(gsub("Birthing person's ", "", cat2))] - - # create the group_by argument ---- - groupy <- unique(c(c("ages", "geo_id"), setdiff(c(pop.template[X, group_by1], pop.template[X, group_by2]), c(NA)))) - - # use rads::get_population ---- - if(is.na(pop.template[X, geo_type])){ - tempy <- rads::get_population(group_by = groupy, - race_type = pop.template[X, race_type], - years = pop.template[X, start]:pop.template[X, stop], - genders = gendery, - ages = agesy, - round = F) - } - if(!is.na(pop.template[X, geo_type])){ - tempy <- rads::get_population(group_by = groupy, - geo_type = pop.template[X, geo_type], - race_type = pop.template[X, race_type], - years = pop.template[X, start]:pop.template[X, stop], - genders = gendery, - ages = agesy, - round = F) - } - - # tidy the population data ---- - for(catnum in c("cat1", "cat2")){ - # misc ---- - tempy[, paste0(catnum) := pop.template[X, get(catnum)]] - tempy[, paste0(catnum, "_varname") := pop.template[X, get(paste0(catnum, "_varname"))]] - - tempy[get(catnum) == "King County", paste0(catnum, "_group") := "King County"] - - tempy[get(catnum) == "Washington State", paste0(catnum, "_group") := "Washington State"] - - suppressWarnings(tempy[get(catnum) == "NA" | is.na(get(catnum)), - c(catnum, paste0(catnum, "_group"), paste0(catnum, "_varname")) := "NA"]) # just a random fill value for NA, which will be changed to true NA later - - tempy[get(catnum) %in% c("Cities/neighborhoods", "Regions") & pop.template[X, geo_type] != 'blk', - paste0(catnum, "_group") := geo_id] - - tempy[get(catnum) %in% c("Gender"), paste0(catnum, "_group") := gender] - - tempy[get(catnum) %in% c("Overall"), paste0(catnum, "_group") := "Overall"] - - - # race/eth ---- - tempy[get(catnum) == "Ethnicity" | get(paste0(catnum, "_varname")) %in% c('race4'), paste0(catnum, "_group") := race_eth] - tempy[get(catnum) == 'Race' & get(paste0(catnum, "_varname")) %in% c('race3'), paste0(catnum, "_group") := race] - tempy[get(paste0(catnum, "_group")) == "Multiple race", paste0(catnum, "_group") := "Multiple"] - tempy <- tempy[get(catnum) != "Ethnicity" | (get(catnum) == "Ethnicity" & get(paste0(catnum, "_group")) == 'Hispanic'), ] - - # race_aic ---- - if(pop.template[X, race_type] == 'race_aic'){ - tempy <- tempy[!(grepl('_aic_', get(paste0(catnum, "_varname"))) & - !((get(paste0(catnum, "_varname")) == 'chi_race_aic_aian' & race_aic == 'AIAN') | - (get(paste0(catnum, "_varname")) == 'chi_race_aic_asian' & race_aic == 'Asian') | - (get(paste0(catnum, "_varname")) == 'chi_race_aic_black' & race_aic == 'Black')| - (get(paste0(catnum, "_varname")) == 'chi_race_aic_his' & race_aic == 'Hispanic') | - (get(paste0(catnum, "_varname")) == 'chi_race_aic_nhpi' & race_aic == 'NHPI') | - (get(paste0(catnum, "_varname")) == 'chi_race_aic_wht' & race_aic == 'White')) - )] - tempy[grep('_aic', get(paste0(catnum, "_varname"))), paste0(catnum, "_group") := race_aic] - } - - # HRAS ---- - if(tempy[1, geo_type] == 'blk' & tempy[1, get(catnum)] == 'Cities/neighborhoods'){ - temp.xwalk <- rads.data::spatial_block20_to_hra20_to_region20[, list(geo_id = GEOID20, hra20_name)] - tempy <- merge(tempy, temp.xwalk, by = "geo_id", all.x = T, all.y = F) - tempy[, paste0(catnum, "_group") := hra20_name] - } - - # Regions ---- - if(tempy[1, geo_type] == 'blk' & tempy[1, get(catnum)] == 'Regions'){ - temp.xwalk <- rads.data::spatial_block20_to_hra20_to_region20[, list(geo_id = GEOID20, region_name)] - tempy <- merge(tempy, temp.xwalk, by = 'geo_id', all.x = T, all.y = F) - - tempy[, paste0(catnum, "_group") := region_name] - } - - if(tempy[1, geo_type] == 'hra' & tempy[1, get(catnum)] == 'Regions'){ - temp.xwalk <- rads.data::spatial_hra20_to_region20[, list(geo_id = hra20_name, region_name)] - tempy <- merge(tempy, temp.xwalk, by = 'geo_id', all.x = T, all.y = F) - - tempy[, paste0(catnum, "_group") := region_name] - } - - if(tempy[1, geo_type] == 'zip' & tempy[1, get(catnum)] == 'Regions'){ - zip_2_region <- rads.data::spatial_zip_to_hra20_pop - zip_2_region <- merge(zip_2_region, - rads.data::spatial_hra20_to_region20[, list(hra20_name, region = region_name)], - by = 'hra20_name', - all = T) - zip_2_region <- zip_2_region[, list(s2t_fraction = sum(s2t_fraction)), # collapse fractions down to region level - list(geo_id = as.character(source_id), region)] - - tempy <- merge(tempy, zip_2_region, by = "geo_id", all.x = T, all.y = F, allow.cartesian = T) - tempy[, pop := pop * s2t_fraction] # calculate weighted pop - tempy[, paste0(catnum, "_group") := region] - } - - # Big Cities ---- - if(tempy[1, get(catnum)] == 'Big cities'){ - if(tempy[1, geo_type] == 'blk'){ - blk20_hra20 <- rads.data::spatial_block20_to_hra20_to_region20[, list(geo_id = GEOID20, hra20_name)] - tempy <- merge(tempy, blk20_hra20, by = "geo_id", all.x = T, all.y = F) - hra20_bigcity <- rads.data::spatial_hra20_to_bigcities[, list(hra20_name, bigcity)] - tempy <- merge(tempy, hra20_bigcity, by = 'hra20_name', all.x = T, all.y = F) - } - if(tempy[1, geo_type] == 'hra'){ - hra20_bigcity <- rads.data::spatial_hra20_to_bigcities[, list(hra20_name, bigcity)] - tempy <- merge(tempy, hra20_bigcity, by.x = 'geo_id', by.y = 'hra20_name', all.x = T, all.y = F) - } - tempy[, paste0(catnum, "_group") := bigcity] - } - - # age6 ---- - tempy[get(paste0(catnum, "_varname")) == "age6" & age %in% 0:17, paste0(catnum, "_group") := "<18"] - tempy[get(paste0(catnum, "_varname")) == "age6" & age %in% 18:24, paste0(catnum, "_group") := "18-24"] - tempy[get(paste0(catnum, "_varname")) == "age6" & age %in% 25:44, paste0(catnum, "_group") := "25-44"] - tempy[get(paste0(catnum, "_varname")) == "age6" & age %in% 45:64, paste0(catnum, "_group") := "45-64"] - tempy[get(paste0(catnum, "_varname")) == "age6" & age %in% 65:74, paste0(catnum, "_group") := "65-74"] - tempy[get(paste0(catnum, "_varname")) == "age6" & age >= 75, paste0(catnum, "_group") := "75+"] - - # mage5 ---- - tempy[get(paste0(catnum, "_varname")) == "mage5" & age %in% 10:17, paste0(catnum, "_group") := "10-17"] - tempy[get(paste0(catnum, "_varname")) == "mage5" & age %in% 18:24, paste0(catnum, "_group") := "18-24"] - tempy[get(paste0(catnum, "_varname")) == "mage5" & age %in% 25:34, paste0(catnum, "_group") := "25-34"] - tempy[get(paste0(catnum, "_varname")) == "mage5" & age %in% 35:44, paste0(catnum, "_group") := "35-44"] - tempy[get(paste0(catnum, "_varname")) == "mage5" & age >=45, paste0(catnum, "_group") := "45+"] - - # yage4 ---- - tempy[get(paste0(catnum, "_varname")) == "yage4" & age %in% 0:4, paste0(catnum, "_group") := "0-4"] - tempy[get(paste0(catnum, "_varname")) == "yage4" & age %in% 5:9, paste0(catnum, "_group") := "5-9"] - tempy[get(paste0(catnum, "_varname")) == "yage4" & age %in% 10:14, paste0(catnum, "_group") := "10-14"] - tempy[get(paste0(catnum, "_varname")) == "yage4" & age %in% 15:17, paste0(catnum, "_group") := "15-17"] - - # pov200grp ---- - if(tempy[1, geo_type] == 'blk' & grepl("poverty$", tempy[1, get(catnum)], ignore.case = T)){ - tempy[, geo_tract2020 := substr(geo_id, 1, 11)] # have blocks (15 char), so keep first 11 for tracts - tempy <- merge(tempy, - rads.data::misc_poverty_groups[geo_type=='Tract'][, list(geo_tract2020 = geo_id, pov200grp)], - by = "geo_tract2020", - all.x = T, - all.y = F) - tempy[, paste0(catnum, "_group") := pov200grp] - } - if( tempy[1, geo_type] == 'zip' & grepl("poverty$", tempy[1, get(catnum)], ignore.case = T)){ - tempy <- merge(tempy, - rads.data::misc_poverty_groups[geo_type=='ZCTA'][, list(geo_id, pov200grp)], - by = 'geo_id', - all.x = T, - all.y = F) - tempy[, paste0(catnum, "_group") := pov200grp] - } - - } - - # Drop if is.na(cat1_group) - tempy <- tempy[!is.na(cat1_group)] - - # drop if is.na(cat2_group) - tempy <- tempy[!(cat2 != 'NA' & (cat2_group == 'NA') | is.na(cat2_group))] # did not yet switch back to true NA at this point - - # collapse to one row per demographic combination and keep minimum needed columns ---- - tempy <- tempy[, list(pop = sum(pop)), list(chi_age = age, year, cat1, cat1_varname, cat1_group, cat2, cat2_varname, cat2_group)] - - # ensure each demographic has rows for all relevant ages & only relevant ages ---- - if(tempy[1]$cat1 == "Age"){ - if(tempy[1]$cat1_varname == 'age6'){ - tempage <- data.table(cat1 = "Age", cat1_varname = "age6", chi_age = 0:100) - tempage[, cat1_group := cut(chi_age, - breaks = c(-1, 17, 24, 44, 64, 74, 120), - labels = c("<18", "18-24", "25-44", "45-64", "65-74", "75+"))]} - - if(tempy[1]$cat1_varname == 'mage5'){ - tempage <- data.table(cat1 = "Age", cat1_varname = "mage5", chi_age = 10:100) - tempage[, cat1_group := cut(chi_age, - breaks = c(9, 17, 24, 34, 44, 120), - labels = c('10-17', '18-24', '25-34', '35-44', '45+'))]} - - if(tempy[1]$cat1_varname == 'yage4'){ - tempage <- data.table(cat1 = "Age", cat1_varname = "yage4", chi_age = 0:17) - tempage[, cat1_group := cut(chi_age, - breaks = c(-1, 4, 9, 14, 17), - labels = c('0-4', '5-9', '10-14', '15-17'))]} - - temp.demog <- setDT(tidyr::crossing(unique(tempy[, list(year = as.character(year), cat2, cat2_varname, cat2_group)]), - tempage)) - } - - if(tempy[1]$cat2 == "Age"){ - if(tempy[1]$cat2_varname == 'age6'){ - tempage <- data.table(cat2 = "Age", cat2_varname = "age6", chi_age = 0:100) - tempage[, cat2_group := cut(chi_age, - breaks = c(-1, 17, 24, 44, 64, 74, 120), - labels = c("<18", "18-24", "25-44", "45-64", "65-74", "75+"))]} - - if(tempy[1]$cat2_varname == 'mage5'){ - tempage <- data.table(cat2 = "Age", cat2_varname = "mage5", chi_age = 10:100) - tempage[, cat2_group := cut(chi_age, - breaks = c(9, 17, 24, 34, 44, 120), - labels = c('10-17', '18-24', '25-34', '35-44', '45+'))]} - - if(tempy[1]$cat2_varname == 'yage4'){ - tempage <- data.table(cat2 = "Age", cat2_varname = "yage4", chi_age = 0:17) - tempage[, cat2_group := cut(chi_age, - breaks = c(-1, 4, 9, 14, 17), - labels = c('0-4', '5-9', '10-14', '15-17'))]} - - temp.demog <- setDT(tidyr::crossing(unique(tempy[, list(year = as.character(year), cat1, cat1_varname, cat1_group)]), - tempage)) - } - - if(!"Age" %in% unique(c(tempy$cat1, tempy$cat2))){ - # all combinations of cat1 x cat2 - temp.demog <- setDT(tidyr::crossing( - unique(tempy[, list(cat1, cat1_varname, cat1_group)]), - unique(tempy[, list(cat2, cat2_varname, cat2_group)]) - )) - # all combination of cat table with year and age - temp.demog <- setDT(tidyr::crossing( - temp.demog, - data.table(year = as.character(pop.template[X, ]$year), chi_age = 0:100) - )) - } - - tempy <- suppressWarnings(merge(tempy, temp.demog, all = T)) - tempy[is.na(pop), pop := 0] - - # create combinations of cat1_group and cat2_group that have no population and set == 0 ---- - # no need with current get_population function, but keep as a placeholder / reminder - - # add tab column ---- - tempy[, tab := pop.template[X, tab]] - - # tidy when is.na(cat2) ---- - tempy[cat2 == "NA", c("cat2", "cat2_varname", "cat2_group") := NA] - - # return object ---- - return(tempy) - } - - # use lapply to cycle over each rows and create one big final dataset ---- - tempy.allpop <- rbindlist( - future_lapply(X = as.list(seq(1, nrow(pop.template))), - FUN = function(X){ - set.seed(98104) # another attempt to set a seed - CHI_get_proper_pop_engine(X, pop.template = pop.template)}, - future.seed = 98104) - ) - - tempy.allpop <- unique(tempy.allpop) - - # return object ---- - return(tempy.allpop) -} diff --git a/README.md b/README.md index 8dfc0a1..2e6a032 100644 --- a/README.md +++ b/README.md @@ -13,22 +13,35 @@ The are meant to support our epidemiologist in various steps along the pipeline, ## Installation -Install [`apde.chi.tools`](https://github.com/PHSKC-APDE/apde.chi.tools) ... `remotes::install_github("PHSKC-APDE/apde.chi.tools", auth_token = NULL)` +To install the latest version: [`apde.chi.tools`](https://github.com/PHSKC-APDE/apde.chi.tools) ... `remotes::install_github("PHSKC-APDE/apde.chi.tools", auth_token = NULL)` -To install github from a particular branch, specify it with the 'ref' argument, e.g., `remotes::install_github("PHSKC-APDE/apde.chi.tools", ref = "dev", auth_token = NULL)` +To install a particular version, append the version tag to the repository name. You can see a list of recent tags on the right of this page, and click [`Releases`](https://github.com/PHSKC-APDE/apde.chi.tools/releases) to view details ... `remotes::install_github("PHSKC-APDE/apde.chi.tools@v2025.0.0", auth_token = NULL)`. Note, you will want to use the lateset version for the year of work of the CHI project (so work performed in 2025 should use the 2025 version) + +To install a particular branch, (for example, if participating in testing or needing a feature still in development) specify it with the 'ref' argument, e.g., `remotes::install_github("PHSKC-APDE/apde.chi.tools", ref = "dev", auth_token = NULL)` + +## Loading a package Load [`apde.chi.tools`](https://github.com/PHSKC-APDE/apde.chi.tools) ... `library(apde.chi.tools)` -## New for version 0.1 -1. Welcome! Please provide feedback and recommendation directly to Ronald Buie or by submitting an issue in github + + +## New for version 2025.0.0 + +Our first full release! + +Versioning scheme, expect a release each year with number for that year + +CHI functions in the CHI repository disabled. Use apde.chi.tools going forward + +If something doesn’t work or appears broken, let Danny or Ronald know, and submit an [issue](https://github.com/PHSKC-APDE/apde.chi.tools/issues). + +Review documentation using '?[function_name]()' in R studio, as well as the [wiki](https://github.com/PHSKC-APDE/apde.chi.tools/wiki). ## Best Practices If you have code that uses these functions from last year, you will want to confirm that the function contained in this package conforms to the expectations of the previous version. You can review the manual for any function by typing '?[function_name]()'. There you will see the expected parameters and examples of usage. If these are insufficient, please reach out! -Coming soon: wiki pages to train users who are new to these functions - ## Problems? - If you come across a bug or have specific suggestions for improvement, please click on ["Issues"](https://github.com/PHSKC-APDE/apde.chi.tools/issues) at the top of this page and then click ["New Issue"](https://github.com/PHSKC-APDE/apde.chi.tools/issues/new/choose) and provide the necessary details. diff --git a/inst/ref/chi_qa.yaml_pre2025.yaml b/inst/ref/chi_qa.yaml_pre2025.yaml new file mode 100644 index 0000000..cb0f5a7 --- /dev/null +++ b/inst/ref/chi_qa.yaml_pre2025.yaml @@ -0,0 +1,62 @@ +schema: APDE_WIP +table: +years: + +vars: + data_source: varchar(255) + indicator_key: varchar(255) + tab: varchar(255) + year: varchar(255) + cat1: varchar(255) + cat1_group: nvarchar(2000) + cat1_varname: varchar(255) + cat2: varchar(255) + cat2_group: nvarchar(2000) + cat2_varname: varchar(255) + result: float + lower_bound: float + upper_bound: float + se: float + rse: float + comparison_with_kc: varchar(255) + time_trends: varchar(255) + significance: varchar(255) + caution: varchar(255) + suppression: varchar(255) + numerator: float + denominator: float + chi: tinyint + source_date: date + run_date: date + +metadata: + data_source: varchar(255) + indicator_key: varchar(255) + result_type: varchar(255) + valence: varchar(255) + latest_year: int + latest_year_result: float + latest_year_kc_pop: int + latest_year_count: int + map_type: varchar(255) + unit: varchar(255) + valid_years: varchar(255) + chi: tinyint + run_date: date + +toc: + data_source: varchar(255) + indicator_key: varchar(255) + topic_chi: varchar(255) + topic_bsk: varchar(255) + topic_bskhs: varchar(255) + title_toc: varchar(255) + description: varchar(1000) + url_backup: varchar(500) + latest_year_bk: varchar(255) + latest_rate_bk: varchar(255) + toc_bsk: float + toc_bskhs: float + toc_chi: float + toc_cc: float + diff --git a/man/chi_chars_ccs.Rd b/man/chi_chars_ccs.Rd new file mode 100644 index 0000000..21095f7 --- /dev/null +++ b/man/chi_chars_ccs.Rd @@ -0,0 +1,72 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/chi_chars_ccs.R +\name{chi_chars_ccs} +\alias{chi_chars_ccs} +\title{Generate CHI CHARS counts by CCS category} +\usage{ +chi_chars_ccs( + ph.indicator = NA, + ph.data = NULL, + myinstructions = NULL, + chars.defs = NULL +) +} +\arguments{ +\item{ph.indicator}{A character string of length 1. The indicator key to process, +which must exist in the chars.defs data table.} + +\item{ph.data}{A data.table containing the CHARS data to be processed.} + +\item{myinstructions}{A data.table containing processing instructions for each indicator. +Default is the output from chi_generate_tro_shell().} + +\item{chars.defs}{A data.table containing definitions for each indicator. It +should have the following columns: `indicator_name`, `indicator_key`, `intent`, +`mechanism`, `superlevel`, `broad`, `midlevel`, `detailed`, `age_start`, and +`age_end`.} +} +\value{ +A data.table containing the summarized CHARS hospitalization data for the + specified indicator, stratified by the requested demographic variables. +} +\description{ +Generate hospitalization counts from Comprehensive Hospital Abstract Reporting +System (CHARS) data based on Clinical Classification Software (CCS) categories +for CHI. This function processes instructions for specific indicators and +summarizes CHARS data accordingly. It automatically handles the ICD-9 to ICD-10 +transition that occurred in 2016. +} +\details{ +The function processes multiple instructions for the same indicator sequentially. + +The function automatically detects whether data spans the ICD-9 to ICD-10 transition in 2016 +and processes each part with the appropriate ICD version. Results are then combined seamlessly. +} +\examples{ +\dontrun{ +# Example of how to run with future_lapply for memory efficiency +library(future) +library(future.apply) + +plan(multisession, workers = future::availableCores() - 1) + +countsCCS <- rbindlist(future_lapply(VectorOfIndicators, function(indicator) { + chi_chars_ccs( + ph.indicator = indicator, + ph.data = chars, + myinstructions = myinstructions, + chars.defs = chars.defs) +}, future.seed = TRUE)) + +plan(sequential) + +} + +} +\seealso{ +\code{\link[rads]{get_data_chars}}, which provides creates ph.data + +\code{\link[rads]{chars_icd_ccs_count}}, which is the engine used by this function + +\code{\link{chi_generate_tro_shell}}, which creates myinstructions +} diff --git a/man/chi_chars_injury.Rd b/man/chi_chars_injury.Rd new file mode 100644 index 0000000..a0a2fed --- /dev/null +++ b/man/chi_chars_injury.Rd @@ -0,0 +1,76 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/chi_chars_injury.R +\name{chi_chars_injury} +\alias{chi_chars_injury} +\title{Generate CHI CHARS counts for injury indicators} +\usage{ +chi_chars_injury( + ph.indicator = NA, + ph.data = NULL, + myinstructions = NULL, + chars.defs = NULL, + def = "narrow" +) +} +\arguments{ +\item{ph.indicator}{A character string of length 1. The indicator key to process, +which must exist in the chars.defs data table.} + +\item{ph.data}{A data.table containing the CHARS data to be processed.} + +\item{myinstructions}{A data.table containing processing instructions for each indicator. +Default is the output from \code{\link{chi_generate_tro_shell}}.} + +\item{chars.defs}{A data.table containing definitions for each indicator. It +should have the following columns: `indicator_name`, `indicator_key`, `intent`, +`mechanism`, `age_start`, and `age_end`.} + +\item{def}{A character string indicating which injury definition to use. +Default is \code{def = 'narrow'}.} +} +\value{ +A data.table containing the summarized CHARS injury hospitalization data for the + specified indicator, stratified by the requested demographic variables. +} +\description{ +Generate hospitalization counts from Comprehensive Hospital Abstract Reporting +System (CHARS) data for injury indicators including falls, poisoning, +self-harm, and other injury mechanisms for CHI. This function processes +instructions for specific indicators and summarizes CHARS data accordingly. +} +\details{ +This function processes instructions for a specific indicator sequentially. + +Note that injury data is only available for 2012 and later years. Unlike chi_chars_ccs(), +this function doesn't need to handle the ICD-9 to ICD-10 transition explicitly as the +mechanism and intent columns have been standardized across coding systems. +} +\examples{ +\dontrun{ +# Example of how to run with future_lapply for memory efficiency +library(future) +library(future.apply) + +plan(multisession, workers = future::availableCores() - 1) + +countsINJURY <- rbindlist(future_lapply(VectorOfIndicators, function(indicator) { + chi_chars_injury( + ph.indicator = indicator, + ph.data = chars, + myinstructions = myinstructions, + chars.defs = chars.defs, + def = 'narrow') +}, future.seed = TRUE)) + +plan(sequential) + +} + +} +\seealso{ +\code{\link[rads]{get_data_chars}}, which provides creates ph.data + +\code{\link[rads]{chars_injury_matrix_count}}, which is the engine used by this function + +\code{\link{chi_generate_tro_shell}}, which creates myinstructions +} diff --git a/man/chi_count_by_age.Rd b/man/chi_count_by_age.Rd index d5af4ac..109c7a5 100644 --- a/man/chi_count_by_age.Rd +++ b/man/chi_count_by_age.Rd @@ -1,23 +1,49 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/proto_chi_count_by_age.R +% Please edit documentation in R/chi_count_by_age.R \name{chi_count_by_age} \alias{chi_count_by_age} -\title{Generate Age-Specific Counts for CHI Data} +\title{Generate Age-Specific Counts for Community Health Indicators} \usage{ chi_count_by_age(ph.data = NULL, ph.instructions = NULL, source_date = NULL) } \arguments{ \item{ph.data}{Input data frame or data table containing CHI data} -\item{ph.instructions}{Data frame or data table containing calculation instructions} +\item{ph.instructions}{Data frame or data table containing calculation specifications with columns: +\itemize{ + \item indicator_key: Name of the health metric to calculate + \item tab: Visualization tab type (kingcounty, demgroups, crosstabs) + \item cat1_varname, cat2_varname: Variable names for stratification + \item cat1, cat2: Human-friendly labels for these variables + \item start, end: Year range for the calculation +}} -\item{source_date}{Date of data source} +\item{source_date}{Date of data source, added to output metadata} } \value{ -A data.table containing age-specific counts with standard CHI groupings +A data.table containing age-specific counts with standard CHI groupings: + \itemize{ + \item indicator_key: Health metric identifier + \item year: Year range of data (e.g., "2019-2021" or single year) + \item tab: Visualization tab type + \item cat1, cat1_varname, cat1_group: Primary stratification variable details + \item cat2, cat2_varname, cat2_group: Secondary stratification variable details (if applicable) + \item chi_age: Age value (0-100) + \item count: Number of cases in that demographic-age group + \item source_date: Date of data source (if provided) + } } \description{ -Creates a detailed breakdown of counts by age for CHI data analysis, most -often for age standardization.Processes data according to provided -instructions and handles demographic groupings. +Creates a detailed breakdown of counts by age for CHI data analysis. Primarily +used for age standardization and rate calculations when combined with +population estimates. Processes data according to provided instructions and +handles demographic groupings with special treatment for race and ethnicity +variables. +} +\seealso{ +\code{\link{chi_generate_tro_shell}} which creates ph.instructions used by +\code{chi_count_by_age} + +\code{\link{chi_generate_instructions_pop}} which uses the output of the output +of \code{chi_count_by_age} } diff --git a/man/chi_drop_illogical_ages.Rd b/man/chi_drop_illogical_ages.Rd index 4d83d44..f4b906d 100644 --- a/man/chi_drop_illogical_ages.Rd +++ b/man/chi_drop_illogical_ages.Rd @@ -1,21 +1,52 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/proto_chi_drop_illogical_ages.R +% Please edit documentation in R/chi_drop_illogical_ages.R \name{chi_drop_illogical_ages} \alias{chi_drop_illogical_ages} -\title{Drop Illogical Age Combinations from CHI Data} +\title{Filter Out Age Values That Don't Match Their Corresponding Age Group Categories} \usage{ chi_drop_illogical_ages(ph.data, agevar = "chi_age") } \arguments{ -\item{ph.data}{Input data.table to process} +\item{ph.data}{A data.table or data.frame containing category and age data to be filtered.} -\item{agevar}{Name of the age variable (defaults to 'chi_age')} +\item{agevar}{Character string specifying the name of the age variable column. + +Default: \code{agevar = 'chi_age'}} } \value{ -A data.table with illogical age combinations removed +A filtered data.table with only logically consistent age values } \description{ -Removes age combinations that don't make logical sense based on category -groupings. For example, removes cases where the age falls outside the range -specified by age group categories. +This function filters a data.table to remove rows with inconsistent age values. +It compares the single year age value (specified by \code{agevar}) against age +ranges defined in \code{cat1_group} and \code{cat2_group} columns, keeping only +those rows where: +\itemize{ + \item The categorical variable is not age-related, OR + \item The age value falls within the range specified by the corresponding age group +} + +Age groups are expected to be in formats like "10-17", "<5", or "45+", which +the function automatically parses into numeric ranges. +} +\details{ +The function interprets special formats in age group strings: +\itemize{ + \item "<1" is treated as age "0-0" (age zero) + \item " 0)) { + stop("number_of_observations must be an integer greater than 0") + } + } + #if(!return_code & comments) { + # message("user has requested data, 'comments' set to FALSE.") + # comments <- FALSE + #} + + + variable_modeller <- function(oneVariable, number_of_observations, varName = NA, comments = TRUE) { + if(any(class(oneVariable) %in% "data.table")) { + if(ncol(oneVariable) == 1) { + message(class(oneVariable)) + oneVariable <- oneVariable[,1][[1]] + message(class(oneVariable)) + message("caught DT") + } else { + stop("more than 1 column passed. Only pass a vector or one column") + } + } + + instructions <- NA + + if(is.na(varName)){ + variableName <- sub(".*\\$.*?", "\\1", deparse(substitute(oneVariable))) + } else { + variableName <- varName + } + + oneVariableClass <- class(oneVariable) + + #factor + if(is.na(instructions) & inherits(oneVariable, "factor")) { + orderTF <- is.ordered(oneVariable) + detectedLevels <- levels(oneVariable) + instructions <- paste0(variableName," = factor(sample(c('",paste0(unlist(unique(oneVariable)),collapse = "', '"),"'), ", number_of_observations,", replace = TRUE, prob = c(",paste0(prop.table(table(oneVariable, useNA = 'ifany')), collapse = ", "),")), levels = c('",paste0(detectedLevels, collapse = "', '"),"'), ordered = ", orderTF,")", collapse = "") + instructions <- gsub("'NA'", "NA", instructions) + if(comments){ + instructions <- paste0(instructions, " # as a factor") + } + } + + #integer: categorical + if(is.na(instructions) & inherits(oneVariable, "integer") & (length(unique(oneVariable)) <= 61 & length(oneVariable) > 61)) { + instructions <- paste0(variableName," = as.integer(sample(c('",paste0(unlist(unique(oneVariable)),collapse = "', '"),"'), ", number_of_observations,", replace = TRUE, prob = c(",paste0(prop.table(table(oneVariable, useNA = 'ifany')), collapse = ", "),")))", collapse = "") + instructions <- gsub("'NA'", "NA", instructions) + if(comments){ + instructions <- paste0(instructions, " # as a categorical non factor") + } + } + + #character: categorical + if(is.na(instructions) & inherits(oneVariable, "character") & (length(unique(oneVariable)) <= 61 & length(oneVariable) > 61)) { + instructions <- paste0(variableName," = sample(c('",paste0(unlist(unique(oneVariable)),collapse = "', '"),"'), ", number_of_observations,", replace = TRUE, prob = c(",paste0(prop.table(table(oneVariable, useNA = 'ifany')), collapse = ", "),"))", collapse = "") + instructions <- gsub("'NA'", "NA", instructions) + if(comments){ + + instructions <- paste0(instructions, " # as a categorical non factor") + } + } + + #continuous integer + if(is.na(instructions) & inherits(oneVariable, "integer") & (length(unique(oneVariable)) > 61 & length(oneVariable) > 61)) { + #uniform distribution + instructions <- paste0(variableName, " = as.integer(runif(", number_of_observations,", ", min(oneVariable, na.rm = TRUE), ", ", max(oneVariable, na.rm = TRUE),"))") + if(comments){ + instructions <- paste0(instructions, " # continuous integer with uniform distribution") + } + } + + #continuous double + if(is.na(instructions) & inherits(oneVariable, "double") & (length(unique(oneVariable)) > 61 & length(oneVariable) > 61)) { + count_decimal_places <- function(x) { + if (!is.numeric(x)) return(NA) + sapply(x, function(num) { + if (is.na(num)) return(NA) + str_num <- as.character(num) + if (grepl("\\.", str_num)) { + return(nchar(strsplit(str_num, "\\.")[[1]][2])) + } else { + return(0) + } + }) + } + oneVariable[,RH := count_decimal_places(oneVariable[[1]])] + numberOfDecimals <- max(oneVariable$RH, na.rm = T) + #uniform distribution + instructions <- paste0(variableName, " = as.double(round(runif(", number_of_observations,", ", min(oneVariable, na.rm = TRUE), ", ", max(oneVariable, na.rm = TRUE),"),", numberOfDecimals , "))") + if(comments){ + instructions <- paste0(instructions, " # continuous double with uniform distribution") + } + } + + #if unmatched + if(is.na(instructions)) { + + instructions <- paste0("`",variableName,"`", " = NA") + if(comments){ + instructions <- paste0(instructions, " # data type not modeled") + } + } + + if(is.na(instructions)) { + + } else{ + return(instructions) + } + } + + batch_variable_modeller <- function(x) { + variable_modeller(ph.data[,..x][[1]], number_of_observations, names(ph.data)[x], comments = comments) + } + + codeList <- lapply(seq_along(ph.data), batch_variable_modeller) + + if(comments) { + + codeListParsed <- c(list("DT <- data.table("),gsub(" #", ", #", codeList[1:(length(codeList)-1)]), gsub(" #",") #",codeList[length(codeList)])) + + codeText <- paste(unlist(codeListParsed), collapse =" \n" ) + + } else { + codeListParsed <- c(list("DT <- data.table("),paste0(codeList[1:(length(codeList)-1)], ","), paste0(codeList[length(codeList)], ")")) + + codeText <- paste(unlist(codeListParsed), collapse =" \n" ) + } + + if(return_code) { + #codeText <- paste(unlist(codeList), collapse =" \n" ) + + cat(codeText) + return(codeList) + + } else { + + cat(codeText) + eval( parse(text = paste0(codeText))) + # eval( parse(text = paste0("DT <- data.table(", codeText,collapse = ""))) + return(DT) + } + + } + +# +# ph.data <- get_data_birth(cols = c('race4', 'chi_age', 'hra20_name', 'sex', 'birth_weight_grams')) +# ph.data <- get_data_birth(cols = c('race4', 'chi_age', 'hra20_name', 'sex', 'birth_weight_grams', "time_of_birth", "mother_birthplace_country" )) +# #ph.data <- get_data_birth() +# todo <- data_modeller(ph.data = ph.data, number_of_observations = 10000, return_code = F, comments = TRUE) +# +# tada <- data_modeller(ph.data = ph.data, number_of_observations = 10000, return_code = T, comments = TRUE) +# +# codeListParsed <- c(list("DT <- data.table("),gsub(" #", ", #", tada[1:(length(tada)-1)]), gsub(" #",") #",tada[length(tada)])) +# +# codeText <- paste(unlist(codeListParsed), collapse =" \n" ) +# +# tada <- eval( parse(text = paste0(codeText))) +# +# +# +# str(ph.data) +# str(todo) +# str(tada) +# + + ################################ end migrate this out ########################################################### + + + generate_test_data <- function(dataset = "generic", observations = 100, seed = 1000, years = 2023){ + ### generates a synthetic data set appropriate for testing functions relying on APDE data structures and where you do not want to use real data + ### receives description of data set to emulate, number of observations to include, a seed and number of years. + ### returns a data.table of synthetic data. If dataset is "generic" the returned structure will have idealized chi values and generic indicators + + # input validation + datasetOptions <- c("generic", "brfss", "death") + dataset <- tolower(dataset) + if(!(dataset %in% datasetOptions)) { + stop(paste0("dataset must be one of: '", paste(datasetOptions, collapse = "', '"),"'")) + } + + if(dataset == "generic") { + for(year in years) { + seed <- seed*year + DTIteration <- data.table( + id = 1:observations, + chi_geo_kc = sample(c('King County',NA_character_), observations, replace = T), + chi_race_7 = factor(sample(c("Asian", "AIAN", "Black", "Hispanic", "NHPI", "White", "Other", "Multiple", NA), observations, replace = T, prob = c(.19,.01,.07,.11,.01,.35,.07,.14,.02)), levels = c("Asian", "AIAN", "Black", "Hispanic", "NHPI", "White", "Other", "Multiple", NA)), + chi_sex = as.factor(sample(c("Male","Female"), observations, replace = T)), + chi_geo_region = factor(sample(c("South", "North", "Seattle", "East"), observations, replace = T), levels = c("South","North","Seattle","East")), + indicator1 = as.factor(sample(c("never","sometimes", "always", NA), observations, replace = T)), + indicator2 = as.factor(sample(c(1,2,3,4, NA), observations, replace = T)), + indicator3 = as.factor(sample(c("<20","21-40","41-60","61<"), observations, replace = T)), + chi_year = year) + if(exists("returnDT")) { + returnDT <- rbind(returnDT, DTIteration) + } else { + returnDT <- DTIteration + } + } + + + + } else if(dataset == "death") { + for(year in years) { + seed <- seed*year + DTIteration <- data.table( + `state_file_number` = NA, # data type not modelled, + `underlying_cod_code` = NA, # data type not modelled, + age6 = sample(c('65-74', '75+', '25-44', '45-64', '<18', '18-24'), observations, replace = TRUE, prob = c(0.0107097718352957, 0.0114414953768376, 0.0644581919776492, 0.184660413756403, 0.184194771502694, 0.544535355551121)), # as a categorical non factor, + bigcities = sample(c(NA, 'Seattle city', 'Auburn city', 'Kent city', 'Federal Way city', 'Bellevue city', 'Renton city', 'Kirkland city', 'Redmond city'), observations, replace = TRUE, prob = c(0.0491585179272268, 0.0566753143085213, 0.0538814607862702, 0.0679837690414422, 0.0367192177210138, 0.0190248120800905, 0.052351493381228, 0.249850329275594, 0.414355085478614)), # as a categorical non factor, + `hra20_name` = NA, # data type not modelled, + chi_sex = factor(sample(c('Female', 'Male', NA), observations, replace = TRUE, prob = c(0.468036985299009, 0.531763453735116, 0.000199560965875075)), levels = c('Female', 'Male'), ordered = FALSE), # as a factor, + chi_geo_kc = sample(c('King County'), observations, replace = TRUE, prob = c(1)), # as a categorical non factor, + pov200grp = sample(c(NA, 'Very high poverty areas', 'High poverty areas', 'Medium poverty areas', 'Low poverty areas'), observations, replace = TRUE, prob = c(0.287234750216191, 0.174283243530899, 0.24998336991951, 0.285771303133107, 0.00272733320029269)), # as a categorical non factor, + race3 = factor(sample(c('White', 'Black', NA, 'Asian', 'Multiple', 'AIAN', 'NHPI'), observations, replace = TRUE, prob = c(0.0782278986230293, 0.763054613184328, 0.0178939666067984, 0.111355018958292, 0.0109758531231291, 0.0108428124792124, 0.0076498370252112)), levels = c('Black', 'White', 'Multiple', 'Asian', 'AIAN', 'NHPI'), ordered = FALSE), # as a factor, + race4 = factor(sample(c('White', 'Hispanic', 'Black', NA, 'Asian', 'Multiple', 'AIAN', 'NHPI'), observations, replace = TRUE, prob = c(0.0771635734716956, 0.727200159648773, 0.0169626820993814, 0.110889376704583, 0.0108428124792124, 0.0391139493115147, 0.0103106499035455, 0.00751679638129449)), levels = c('Black', 'White', 'Multiple', 'Asian', 'NHPI', 'Hispanic', 'AIAN'), ordered = FALSE), # as a factor, + chi_geo_region = sample(c(NA, 'Seattle', 'South', 'East', 'North'), observations, replace = TRUE, prob = c(0.204549990021952, 0.077230093793654, 0.287367790860108, 0.428124792123994, 0.00272733320029269)), # as a categorical non factor, + wastate = sample(c('Washington State'), observations, replace = TRUE, prob = c(1)), # as a categorical non factor, + `chi_age` = NA, # data type not modelled, + chi_year = year, + race3_hispanic = sample(c(NA, 'Hispanic'), observations, replace = TRUE, prob = c(0.0392469899554314, 0.960753010044569)) # as a categorical non factor + ) + + if(exists("returnDT")) { + returnDT <- rbind(returnDT, DTIteration) + } else { + returnDT <- DTIteration + } + } + + + + } else if(dataset == "brfss") { + for(year in years) { + seed <- seed*year + DTIteration <- data.table( + chi_year = year, + age = runif(observations, 18, 99), # continuous with uniform distribution, + age5_v2 = factor(sample(c('25-44', '45-64', '18-24', '75+', '65-74'), observations, replace = TRUE, prob = c(0.0619025944469731, 0.366257017144591, 0.320285237445001, 0.149294492489759, 0.102260658473676)), levels = c('18-24', '25-44', '45-64', '65-74', '75+'), ordered = FALSE), # as a factor, + chi_sex = factor(sample(c('Female', 'Male'), observations, replace = TRUE, prob = c(0.504779244424215, 0.495220755575785)), levels = c('Male', 'Female'), ordered = FALSE), # as a factor, + race3 = factor(sample(c('White', NA, 'Asian', 'Black', 'Multiple', 'NHPI', 'AIAN'), observations, replace = TRUE, prob = c(0.715217721134881, 0.0567440449097254, 0.0115308754362009, 0.135639508420574, 0.00971021089364285, 0.0380822333485055, 0.0330754058564709)), levels = c('White', 'Black', 'AIAN', 'Asian', 'NHPI', 'Multiple', NA), ordered = FALSE), # as a factor, + race4 = factor(sample(c('White', 'Hispanic', 'Asian', 'Multiple', 'Black', 'AIAN', NA, 'NHPI'), observations, replace = TRUE, prob = c(0.00652404794416629, 0.0511303292368381, 0.133667121832802, 0.00804126839629798, 0.0923987255348202, 0.662570171445911, 0.0364132908511607, 0.00925504475800334)), levels = c('AIAN', 'Black', 'Asian', 'NHPI', 'Hispanic', 'White', 'Multiple', NA), ordered = FALSE), # as a factor, + hispanic = sample(c('0', '1', NA), observations, replace = TRUE, prob = c(0.896829009255045, 0.0923987255348202, 0.010772265210135)), # as a categorical non factor, + income6b = factor(sample(c('$50-74,999', NA, '$100,000+', '$20-34,999', '$75-99,999', '<$20,000', '$35-49,999'), observations, replace = TRUE, prob = c(0.0399028978910636, 0.0696404187528448, 0.062661204673039, 0.101046882111971, 0.101805492338037, 0.441662873615536, 0.183280230617509)), levels = c('<$20,000', '$20-34,999', '$35-49,999', '$50-74,999', '$75-99,999', '$100,000+'), ordered = FALSE), # as a factor, + sexorien = factor(sample(c('Something else', 'Straight', 'Lesbian/Gay', 'Bisexual'), observations, replace = TRUE, prob = c(0.887725686542255, 0.0342891822181763, 0.0523441055985435, 0.0256410256410256)), levels = c('Straight', 'Lesbian/Gay', 'Bisexual', 'Something else'), ordered = FALSE), # as a factor, + trnsgndr = sample(c('0', '1'), observations, replace = TRUE, prob = c(0.990593233196783, 0.00940676680321651)), # as a categorical non factor, + veteran3 = sample(c('0', '1', NA), observations, replace = TRUE, prob = c(0.922166590805644, 0.072523137611895, 0.00531027158246093)), # as a categorical non factor, + asthnow = sample(c('0', '1', NA), observations, replace = TRUE, prob = c(0.896222121074192, 0.0951297223486573, 0.00864815657715066)), # as a categorical non factor, + bphigh = sample(c('0', '1', NA), observations, replace = TRUE, prob = c(0.699135184342285, 0.296768320436959, 0.00409649522075558)), # as a categorical non factor, + cholchk5 = sample(c('1', '0', NA), observations, replace = TRUE, prob = c(0.832802306175087, 0.0989227734789865, 0.0682749203459263)), # as a categorical non factor, + x_crcrec = sample(c(NA), observations, replace = TRUE, prob = c(1)), # as a categorical non factor, + x_crcrec2 = sample(c(NA), observations, replace = TRUE, prob = c(1)), # as a categorical non factor, + cvdheart = sample(c('0', NA, '1'), observations, replace = TRUE, prob = c(0.948869670763162, 0.0432407828857533, 0.00788954635108481)), # as a categorical non factor, + cvdstrk3 = sample(c('0', '1', NA), observations, replace = TRUE, prob = c(0.977393415263238, 0.0203307540585647, 0.00227583067819754)), # as a categorical non factor, + denvst1 = sample(c(NA), observations, replace = TRUE, prob = c(1)), # as a categorical non factor, + diab2 = sample(c('0', '1', NA), observations, replace = TRUE, prob = c(0.924594143529055, 0.0731300257927477, 0.00227583067819754)), # as a categorical non factor, + exerany = sample(c('0', '1', NA), observations, replace = TRUE, prob = c(0.870581095433166, 0.128053406159915, 0.00136549840691853)), # as a categorical non factor, + disab2 = sample(c('0', NA, '1'), observations, replace = TRUE, prob = c(0.777120315581854, 0.199969655590957, 0.0229100288271886)), # as a categorical non factor, + ecignow1 = sample(c('3', NA, '2', '1'), observations, replace = TRUE, prob = c(0.0166894249734486, 0.0256410256410256, 0.920194204217873, 0.0374753451676529)), # as a categorical non factor, + firearm4 = sample(c(NA), observations, replace = TRUE, prob = c(1)), # as a categorical non factor, + flushot7 = sample(c('0', NA, '1'), observations, replace = TRUE, prob = c(0.567440449097254, 0.384615384615385, 0.0479441662873616)), # as a categorical non factor, + fnotlast = sample(c('0', NA, '1'), observations, replace = TRUE, prob = c(0.846002124108633, 0.0726748596571082, 0.0813230162342588)), # as a categorical non factor, + sdhfood1 = sample(c('5', NA, '1', '3', '2', '4'), observations, replace = TRUE, prob = c(0.00819299044151115, 0.00955848884842968, 0.0549233803671673, 0.0588681535427098, 0.787133970565923, 0.0813230162342588)), # as a categorical non factor, + genhlth2 = sample(c('0', '1', NA), observations, replace = TRUE, prob = c(0.881201638598088, 0.116977696859354, 0.00182066454255803)), # as a categorical non factor, + mam2yrs = sample(c(NA), observations, replace = TRUE, prob = c(1)), # as a categorical non factor, + medcost1 = sample(c('0', '1', NA), observations, replace = TRUE, prob = c(0.914580488544986, 0.0819299044151115, 0.0034896070399029)), # as a categorical non factor, + x_pastaer = sample(c('0', '1', NA), observations, replace = TRUE, prob = c(0.560309512972235, 0.298892429069944, 0.140798057957821)), # as a categorical non factor, + fmd = sample(c('0', '1', NA), observations, replace = TRUE, prob = c(0.855257168866636, 0.130480958883326, 0.0142618722500379)), # as a categorical non factor, + mjnow = sample(c('0', NA, '1'), observations, replace = TRUE, prob = c(0.767410104688211, 0.144135942952511, 0.0884539523592778)), # as a categorical non factor, + obese = sample(c('0', NA, '1'), observations, replace = TRUE, prob = c(0.69458352298589, 0.207555757851616, 0.0978607191624943)), # as a categorical non factor, + x_bmi5cat = factor(sample(c('Overweight', NA, 'Obese', 'Normal', 'Underweight'), observations, replace = TRUE, prob = c(0.018813533606433, 0.343043544226976, 0.332726445152481, 0.207555757851616, 0.0978607191624943)), levels = c('Underweight', 'Normal', 'Overweight', 'Obese'), ordered = FALSE), # as a factor, + x_veglt1a = sample(c(NA), observations, replace = TRUE, prob = c(1)), # as a categorical non factor, + crvscrnx = sample(c(NA), observations, replace = TRUE, prob = c(1)), # as a categorical non factor, + persdoc3 = sample(c('1', '0', NA), observations, replace = TRUE, prob = c(0.861629494765589, 0.126687907752997, 0.011682597481414)), # as a categorical non factor, + x_pneumo3 = sample(c(NA, '0', '1'), observations, replace = TRUE, prob = c(0.18236989834623, 0.0499165528751328, 0.767713548778638)), # as a categorical non factor, + smoker1 = sample(c('0', NA, '1'), observations, replace = TRUE, prob = c(0.907449552419967, 0.0553785465028069, 0.0371719010772265)), # as a categorical non factor, + finalwt1 = runif(observations, 9.1, 3698.3), # continuous with uniform distribution, + x_ststr = sample(c('2023532011', '2023532012', '2023532082', '2023532021', '2023532022', '2023532031', '2023532032', '2023532131', '2023532121', '2023532161', '2023532151', '2023531141', '2023532061', '2023531142', '2023532091', '2023532112', '2023532081', '2023532071', '2023532042', '2023532122', '2023532051', '2023532072', '2023532062', '2023532101', '2023532102', '2023531271', '2023531231', '2023532052', '2023531241', '2023532111', '2023532092', '2023532041', '2023532141', '2023532132', '2023531161', '2023531301', '2023531211', '2023531242', '2023532142', '2023531202', '2023532019'), observations, replace = TRUE, prob = c(0.0581095433166439, 0.00364132908511607, 0.000151722045213169, 0.000151722045213169, 0.000151722045213169, 0.000151722045213169, 0.000151722045213169, 0.000151722045213169, 0.00121377636170536, 0.000151722045213169, 0.587922925201032, 0.210286754665453, 0.000151722045213169, 0.0136549840691853, 0.00804126839629798, 0.0127446517979062, 0.0101653770292824, 0.00106205431649219, 0.000606888180852678, 0.00136549840691853, 0.000910332271279017, 0.00242755272341071, 0.0019723865877712, 0.00151722045213169, 0.000910332271279017, 0.00182066454255803, 0.00652404794416629, 0.000606888180852678, 0.000758610226065847, 0.00166894249734486, 0.000606888180852678, 0.000455166135639508, 0.000758610226065847, 0.000606888180852678, 0.0019723865877712, 0.00242755272341071, 0.000606888180852678, 0.000606888180852678, 0.000303444090426339, 0.0453648915187377, 0.0171445911090881)), # as a categorical non factor, + hra20_id_1 = runif(observations, 1, 61), # continuous with uniform distribution, + hra20_id_2 = runif(observations, 1, 61), # continuous with uniform distribution, + hra20_id_3 = runif(observations, 1, 61), # continuous with uniform distribution, + hra20_id_4 = runif(observations, 1, 61), # continuous with uniform distribution, + hra20_id_5 = runif(observations, 1, 61), # continuous with uniform distribution, + hra20_id_6 = runif(observations, 1, 61), # continuous with uniform distribution, + hra20_id_7 = runif(observations, 1, 61), # continuous with uniform distribution, + hra20_id_8 = runif(observations, 1, 61), # continuous with uniform distribution, + hra20_id_9 = runif(observations, 1, 61), # continuous with uniform distribution, + hra20_id_10 = runif(observations, 1, 61), # continuous with uniform distribution, + default_wt = runif(observations, 9.1, 3698.3), # continuous with uniform distribution, + `_id` = NA, # data type not modelled, + chi_geo_region = sample(c(NA, 'South', 'East', 'North', 'Seattle'), observations, replace = TRUE, prob = c(0.230010620543165, 0.0650887573964497, 0.330602336519496, 0.284175390684266, 0.0901228948566227)) # as a categorical non factor + ) + if(exists("returnDT")) { + returnDT <- rbind(returnDT, DTIteration) + } else { + returnDT <- DTIteration + } + } + + + + } else if(dataset == "skeleton") { + for(year in years) { + seed <- seed*year + DTIteration <- data.table( + #paste data modelling code here + ) + + if(exists("returnDT")) { + returnDT <- rbind(returnDT, DTIteration) + } else { + returnDT <- DTIteration + } + } + } + return(returnDT) + } + + test_data_generic <- generate_test_data("generic", 10000, 1000, c(2016:2023)) + test_data_brfss <- generate_test_data("brfss", 10000, 1000, c(2016:2023)) + test_data_death <- generate_test_data("death", 10000, 1000, c(2016:2023)) + + test_analysis_set_twosets <- data.table( + #this should work with the generic data set + cat1 = rep(c('Regions', 'Gender', 'Race/ethnicity'),2), + cat1_varname = rep(c('chi_geo_region', 'chi_sex', 'chi_race_7'),2), + `_kingcounty` = c('x'), + `_wastate` = NA_character_, + demgroups = c(rep(NA_character_,3),rep("x", 3)), + crosstabs = c(rep(NA_character_,3),rep("x", 3)), + trends = c(rep(NA_character_,3),rep("x", 3)), + set = c(rep(1,3), rep(2,3)), + set_indicator_keys = c(rep(c('indicator1, indicator2'),3), rep("indicator3",3)) + ) + + # create twoset analysis set + #not currently exported, may not be needed + #remove("test_twoset_estimates") + for(indicator in c("indicator1","indicator2")) { + partialDT <- data.table( + indicator = indicator, + tab = c(rep('demgroups', 4), '_kingcounty'), + year = c('2023'), + cat1 = c('Region', 'Region', 'Region', 'Region', 'King County'), + cat1_group = c("East", "North", "Seattle", "South", 'King County'), + cat1_varname = c('chi_geo_region', 'chi_geo_region', 'chi_geo_region', 'chi_geo_region', 'chi_geo_kc'), + cat2 = NA_character_, + cat2_group = NA_character_, + cat2_varname = NA_character_, + data_source = 'JustTesting', + caution = NA_character_, + suppression = NA_character_, + chi = 1, + source_date = as.Date("2025-01-01"), + run_date = as.Date("2025-01-01"), + numerator = c(111, 175, 210, 600, 430000), + denominator = c(1000, 1500, 2000, 2500, 2200000) + ) + if(exists("test_twoset_estimates")) { + test_twoset_estimates <- rbind(test_twoset_estimates, partialDT) + } else { + test_twoset_estimates <- partialDT + } + } + partialDT <- data.table( + indicator = "indicator3", + tab = c(rep('demgroups', 4), '_kingcounty'), + year = c('2023'), + cat1 = c('Region', 'Region', 'Region', 'Region', 'King County'), + cat1_group = c("East", "North", "Seattle", "South", 'King County'), + cat1_varname = c('chi_geo_region', 'chi_geo_region', 'chi_geo_region', 'chi_geo_region', 'chi_geo_kc'), + cat2 = NA_character_, + cat2_group = NA_character_, + cat2_varname = NA_character_, + data_source = 'JustTesting', + caution = NA_character_, + suppression = NA_character_, + chi = 1, + source_date = as.Date("2025-01-01"), + run_date = as.Date("2025-01-01"), + numerator = c(111, 175, 210, 600, 430000), + denominator = c(1000, 1500, 2000, 2500, 2200000) + ) + test_twoset_estimates <- rbind(test_twoset_estimates, partialDT) + + test_twoset_estimates[, result := numerator / denominator] + test_twoset_estimates[, se := sqrt((result * (1-result)) / denominator)] + test_twoset_estimates[, rse := 100 * se / result] + test_twoset_estimates[, lower_bound := result - 1.96 * se] + test_twoset_estimates[, upper_bound := result + 1.96 * se] + + + #twoset metadata should work with with the "generic" dataset + test_twoset_metadata <- data.table( + indicator_key = c("indicator1", "indicator2","indicator3"), + result_type = c("proportion"), + valid_years = c("2020 2021 2022 2022"), + latest_year = c(2022), + data_source = 'test', + valence = 'positive', + latest_year_result = 0.20, + latest_year_kc_pop = 2300000, + latest_year_count = 460000, + map_type = 'hra', + unit = 'person', + chi = 1, + run_date = as.Date("2025-01-01") + ) + + # Sample instructions ---- test_instructions <- data.table( indicator_key = c("indicator1", "indicator2", "indicator1", "indicator2"), @@ -66,8 +494,8 @@ setup_test_data <- function() { caution = NA_character_, suppression = NA_character_, chi = 1, - source_date = Sys.Date(), - run_date = Sys.Date(), + source_date = as.Date("2025-01-01"), + run_date = as.Date("2025-01-01"), numerator = c(111, 175, 210, 600, 430000), denominator = c(1000, 1500, 2000, 2500, 2200000) ) @@ -77,6 +505,7 @@ setup_test_data <- function() { test_estimates[, lower_bound := result - 1.96 * se] test_estimates[, upper_bound := result + 1.96 * se] + test_estimates_old <- data.table( indicator_key = c("indicatorX"), tab = c(rep('demgroups', 4), '_kingcounty'), @@ -116,14 +545,84 @@ setup_test_data <- function() { map_type = 'hra', unit = 'person', chi = 1, - run_date = Sys.Date() + run_date = as.Date("2024-01-01") ) + validate_hhsaw_connection <- function(hhsaw_key = 'hhsaw'){ + # Key should be a character string that can be used to generate a database connection + # Also have to allow for the option of interactive authentication + # TODO: Allow hhsaw_key to be a database connection itself + is.db = function(x){ + r = try(dbIsValid(hhsaw_key)) + if(inherits(r, 'try-error')){ + r = FALSE + } + r + } + status <- 0 + closeserver = TRUE + if(is.character(hhsaw_key)){ + server <- grepl('server', tolower(Sys.info()['release'])) + trykey <- try(keyring::key_get(hhsaw_key, keyring::key_list(hhsaw_key)[['username']]), silent = T) + if (inherits(trykey, "try-error")) warning(paste0("Your hhsaw keyring is not properly configured or you are not connected to the VPN. \n", + "Please check your VPN connection and or set your keyring and run the function again. \n", + paste0("e.g., keyring::key_set('hhsaw', username = 'ALastname@kingcounty.gov') \n"), + "When prompted, be sure to enter the same password that you use to log into to your laptop. \n", + "If you already have an hhsaw key on your keyring with a different name, you can specify it with the 'mykey = ...' or 'hhsaw_key = ...' argument \n")) + rm(trykey) + + if(server == FALSE){ + con <- try(con <- DBI::dbConnect(odbc::odbc(), + driver = getOption('rads.odbc_version'), + server = 'kcitazrhpasqlprp16.azds.kingcounty.gov', + database = 'hhs_analytics_workspace', + uid = keyring::key_list(hhsaw_key)[["username"]], + pwd = keyring::key_get(hhsaw_key, keyring::key_list(hhsaw_key)[["username"]]), + Encrypt = 'yes', + TrustServerCertificate = 'yes', + Authentication = 'ActiveDirectoryPassword'), silent = T) + if (inherits(con, "try-error")) warning(paste("Either your computer is not connected to KC systems (e.g. VPN is not connected), your hhsaw key is not properly configured, and/or your key value is outdated.", + "To (re)set your hhsaw key use keyring::key_set('", hhsaw_key, "', username = 'ALastname@kingcounty.gov')"), + "When prompted, be sure to enter the same password that you use to log into to your laptop.") + }else{ + message(paste0('Please enter the password you use for your laptop into the pop-up window. \n', + 'Note that the pop-up may be behind your Rstudio session. \n', + 'You will need to use your two factor authentication app to confirm your KC identity.')) + con <- DBI::dbConnect(odbc::odbc(), + driver = getOption('rads.odbc_version'), + server = "kcitazrhpasqlprp16.azds.kingcounty.gov", + database = "hhs_analytics_workspace", + uid = keyring::key_list(hhsaw_key)[["username"]], + Encrypt = "yes", + TrustServerCertificate = "yes", + Authentication = "ActiveDirectoryInteractive") + status <- 1 + } + + # on.exit(DBI::dbDisconnect(con)) + + }else if(is.db(hhsaw_key)){ + closeserver = FALSE + con = hhsaw_key + status <- 1 + }else{ + warning('`hhsaw_key` is not a reference to database connection or keyring') + } + + return(status) + + } + # Return ---- list(my.analytic = test_analytic, my.analysis_set = test_analysis_set, + my.analysis_set_twosets = test_analysis_set_twosets, + my.generic_data = test_data_generic, + my.brfss_data = test_data_brfss, + my.death_data = test_data_death, my.estimate= test_estimates, my.estimate_old= test_estimates_old, my.metadata = test_metadata, - my.instructions = test_instructions) + my.instructions = test_instructions, + my.hhsaw_status_test = validate_hhsaw_connection) } diff --git a/tests/testthat/test-chi_calc.R b/tests/testthat/test-chi_calc.R index 36d320f..f705071 100644 --- a/tests/testthat/test-chi_calc.R +++ b/tests/testthat/test-chi_calc.R @@ -2,15 +2,23 @@ test_that("chi_calc performs basic calculations correctly", { test_data <- setup_test_data() + set <- test_data$my.analysis_set_twosets + instruction <- apde.chi.tools::chi_generate_tro_shell(ph.analysis_set = set, end.year = 2023, year.span = 5, trend.span = 3, trend.periods = 3) + DTgeneric <- test_data$my.generic_data + #make a denominator come out as 0 + DTgeneric$indicator + DTgeneric[chi_race_7 == "White" , indicator1 := "never"] result <- chi_calc( - ph.data = test_data$my.analytic, - ph.instructions = test_data$my.instructions, + ph.data = DTgeneric, + ph.instructions = instruction, ci = 0.90, rate = FALSE, source_name = "test", - source_date = Sys.Date() + source_date = Sys.Date(), + small_num_suppress = F ) + expect_s3_class(result, "data.table") expect_true(all(c("result", "lower_bound", "upper_bound") %in% names(result))) expect_true(all(result$result >= 0 & result$result <= 1)) # For proportions @@ -18,4 +26,5 @@ test_that("chi_calc performs basic calculations correctly", { expect_type(result$numerator, "double") expect_type(result$denominator, "double") expect_type(result$indicator_key, "character") + expect_true(all(!is.na(result[numerator == 0 & denominator != 0,se]))) }) diff --git a/tests/testthat/test-chi_chars_ccs.R b/tests/testthat/test-chi_chars_ccs.R new file mode 100644 index 0000000..2d84cee --- /dev/null +++ b/tests/testthat/test-chi_chars_ccs.R @@ -0,0 +1,383 @@ +# Create mock_chars data ---- +set.seed(98104) + +# ICD codes for asthma and non-asthma conditions +icd9_asthma <- c("49300", "49301", "49302", "49310", "49311", "49312", "49320", + "49321", "49322", "49380", "49381", "49382", "49390", "49391", "49392") + +icd10_asthma <- c("J45", "J452", "J4520", "J4521", "J4522", "J453", "J4530", "J4531", + "J4532", "J454", "J4540", "J4541", "J4542", "J455", "J4550", "J4551", + "J4552", "J459", "J4590", "J45901", "J45902", "J45909", "J4599", + "J45990", "J45991", "J45998") + +icd9_non_asthma <- c("E9855", "36190", "V1508", "76425", "36642", "66061", "V7040", + "V7710", "V4530", "V0660", "76830", "85406") + +icd10_non_asthma <- c("S80241D", "S52134B", "S62256S", "S60922S", "V629", "K9420", + "S59241G", "S52042R", "J8417", "S83136D", "M06851", "T363X3D") + +# Create 1000 sample CHARS records +n_samples <- 1000 +half_samples <- n_samples / 2 + +# Create ICD-9 era data (pre-2016) +icd9_data <- data.table( + seq_no = 1:half_samples, # Unique identifier + chi_year = sample(2013:2015, half_samples, replace = TRUE), + chi_age = sample(0:99, half_samples, replace = TRUE), + chi_geo_kc = rep("King County", half_samples), + wastate = rep("Washington State", half_samples), + race3_hispanic = sample(c("Hispanic", "Non-Hispanic"), half_samples, replace = TRUE), + chi_geo_region = sample(c("Seattle", "South", "East", "North"), half_samples, replace = TRUE), + chi_sex = sample(c("Female", "Male"), half_samples, replace = TRUE) +) + +# Ensure at least 100 asthma cases +asthma_cases <- 100 +non_asthma_cases <- half_samples - asthma_cases + +icd9_data[1:asthma_cases, diag1 := sample(icd9_asthma, asthma_cases, replace = TRUE)] +icd9_data[(asthma_cases+1):half_samples, diag1 := sample(icd9_non_asthma, non_asthma_cases, replace = TRUE)] + +# Create ICD-10 era data (2016 and after) +icd10_data <- data.table( + seq_no = (half_samples+1):n_samples, # Continue sequence numbers + chi_year = sample(2016:2022, half_samples, replace = TRUE), + chi_age = sample(0:99, half_samples, replace = TRUE), + chi_geo_kc = rep("King County", half_samples), + wastate = rep("Washington State", half_samples), + race3_hispanic = sample(c("Hispanic", "Non-Hispanic"), half_samples, replace = TRUE), + chi_geo_region = sample(c("Seattle", "South", "East", "North"), half_samples, replace = TRUE), + chi_sex = sample(c("Female", "Male"), half_samples, replace = TRUE) +) + +# Ensure at least 100 asthma cases +icd10_data[1:asthma_cases, diag1 := sample(icd10_asthma, asthma_cases, replace = TRUE)] +icd10_data[(asthma_cases+1):half_samples, diag1 := sample(icd10_non_asthma, non_asthma_cases, replace = TRUE)] + +# Combine the data +mock_chars <- rbindlist(list(icd9_data, icd10_data)) + +# Create mock_instructions ---- +mock_instructions <- data.table( + indicator_key = rep(c("hos1803000_v1", "hos1803000_v2"), 3), + tab = rep(c("trends", "_wastate"), each = 3), + cat1 = rep("Ethnicity", 6), + cat1_varname = rep("race3_hispanic", 6), + cat2 = c(NA_character_, NA_character_, "Sex", "Sex", NA_character_, NA_character_), + cat2_varname = c(NA_character_, NA_character_, "chi_sex", "chi_sex", NA_character_, NA_character_), + end = c(2017, 2022, 2017, 2022, 2015, 2015), + start = c(2013, 2018, 2014, 2016, 2013, 2013) +) + +# Create mock_chars_def ---- +mock_chars_def <- data.table( + indicator_name = c("Asthma hospitalizations (all ages)", "Asthma hospitalizations (children)"), + indicator_key = c("hos1803000_v1", "hos1803000_v2"), + intent = c(NA_character_, NA_character_), + mechanism = c(NA_character_, NA_character_), + superlevel = c(NA_character_, NA_character_), + broad = c("RESP", NA_character_), + midlevel = c(NA_character_, "Asthma"), + detailed = c(NA_character_, NA_character_), + age_start = c(0, 0), + age_end = c(120, 17) +) + +# Create function for mock ccs_table ---- +# Function to create mock CCS reference tables +create_mock_ccs_table <- function(icdcm_version) { + if (icdcm_version == 9) { + icd_codes <- c("49300", "49301", "49302", "49310", "49311", "49312", "49320", + "49321", "49322", "49380", "49381", "49382", "49390", "49391", "49392", + "E9855", "36190", "V1508", "76425", "36642", "66061", "V7040", + "V7710", "V4530", "V0660", "76830", "85406") + + # Ensure all asthma codes are properly classified + asthma_codes <- c("49300", "49301", "49302", "49310", "49311", "49312", "49320", + "49321", "49322", "49380", "49381", "49382", "49390", "49391", "49392") + } else { # ICD-10 + icd_codes <- c("J45", "J452", "J4520", "J4521", "J4522", "J453", "J4530", "J4531", + "J4532", "J454", "J4540", "J4541", "J4542", "J455", "J4550", "J4551", + "J4552", "J459", "J4590", "J45901", "J45902", "J45909", "J4599", + "J45990", "J45991", "J45998", + "S80241D", "S52134B", "S62256S", "S60922S", "V629", "K9420", + "S59241G", "S52042R", "J8417", "S83136D", "M06851", "T363X3D") + + # Ensure all asthma codes are properly classified + asthma_codes <- c("J45", "J452", "J4520", "J4521", "J4522", "J453", "J4530", "J4531", + "J4532", "J454", "J4540", "J4541", "J4542", "J455", "J4550", "J4551", + "J4552", "J459", "J4590", "J45901", "J45902", "J45909", "J4599", + "J45990", "J45991", "J45998") + } + + # Create the data table + ccs_table <- data.table( + icdcm_code = icd_codes, + icdcm = paste0("Description for ", icd_codes), + superlevel = NA_character_, + broad = NA_character_, + midlevel = NA_character_, + detailed = NA_character_, + icdcm_version = icdcm_version + ) + + # Assign RESP to all asthma codes + ccs_table[icdcm_code %in% asthma_codes, broad := "RESP"] + + # Assign Asthma to all asthma codes + ccs_table[icdcm_code %in% asthma_codes, midlevel := "Asthma"] + + return(ccs_table) +} + +# Create vector of expected column order ---- +expectedCols <- c('indicator_key', 'year', 'chi_age', 'hospitalizations', 'tab', 'cat1', 'cat1_varname', 'cat1_group', 'cat2', 'cat2_varname', 'cat2_group') + +# Test validation ---- +test_that("chi_chars_ccs validates inputs correctly", { + # Test missing ph.indicator + expect_error(chi_chars_ccs(ph.indicator = NA, + ph.data = mock_chars, + myinstructions = mock_instructions, + chars.defs = mock_chars_def), + "ph.indicator must be provided") + + # Test missing ph.data + expect_error(chi_chars_ccs(ph.indicator = "hos1803000_v1", + ph.data = NULL, + myinstructions = mock_instructions, + chars.defs = mock_chars_def), + "ph.data must be specified") + + # Test indicator not found in instructions + expect_error(chi_chars_ccs(ph.indicator = "not_an_indicator", + ph.data = mock_chars, + myinstructions = mock_instructions, + chars.defs = mock_chars_def), + "not found in myinstructions") + + # Test invalid column in instructions + bad_instructions <- copy(mock_instructions) + bad_instructions[1, cat1_varname := "not_a_column"] + + expect_error(chi_chars_ccs(ph.indicator = "hos1803000_v1", + ph.data = mock_chars, + myinstructions = bad_instructions, + chars.defs = mock_chars_def), + "don't exist in ph.data") +}) + +# Test function handles ICD-9 data correctly ---- +test_that("chi_chars_ccs processes ICD-9 data correctly", { + # Filter instructions to only include pre-2016 data + icd9_instructions <- mock_instructions[end < 2016] + + # Run function + result <- chi_chars_ccs( + ph.indicator = "hos1803000_v1", + ph.data = mock_chars, + myinstructions = icd9_instructions, + chars.defs = mock_chars_def + ) + + # Check if result has expected structure + expect_true(is.data.table(result)) + expect_equal(names(result), expectedCols) + + # Check that all data is from ICD-9 era (pre-2016) + expect_true(all(grepl("201[3-5]", result$year))) + + # Check hospitalization count is as expected + expect_equal(sum(result$hospitalizations), 100) + + # Check age filtering worked correctly + expect_gt(max(as.numeric(result$chi_age)), 50) # is for all ages, so should definitely have some > 50 yrs old + + # Check correct indicator is used + expect_true(all(result$indicator_key == "hos1803000_v1")) + + # Check if have the same number of observations for each age + expect_equal(uniqueN(result[, .N, chi_age]$N), 1) +}) + +# Test function handles ICD-10 data correctly ---- +test_that("chi_chars_ccs processes ICD-10 data correctly", { + # Filter instructions to only include post-2016 data + icd10_instructions <- mock_instructions[start >= 2016] + + # Run function + result <- chi_chars_ccs( + ph.indicator = "hos1803000_v2", + ph.data = mock_chars, + myinstructions = icd10_instructions, + chars.defs = mock_chars_def + ) + + # Check if result has expected structure + expect_true(is.data.table(result)) + expect_equal(names(result), expectedCols) + + # Check that all data is from ICD-10 era (2016+) + expect_true(all(grepl("201[6-9]|202[0-2]", result$year))) + + # Check hospitalization count is as expected + expect_lt(sum(result$hospitalizations), 100) # All ICD10 should be 100, so limiting to up to 17 should be less + + # Check age filtering worked correctly + expect_equal(max(as.numeric(result$chi_age)), 17) # for children, so max is 17 yrs old + + # Check correct indicator is used + expect_true(all(result$indicator_key == "hos1803000_v2")) + + # Check if have the same number of observations for each age + expect_equal(uniqueN(result[, .N, chi_age]$N), 1) +}) + +# Test function handles mixed ICD-9/ICD-10 data correctly ---- +test_that("chi_chars_ccs processes mixed ICD-9/ICD-10 data correctly", { + # Filter data to include instructions that span the 2016 transition + mixed_instructions <- mock_instructions[start < 2016 & end >= 2016] + + # Run function + result <- chi_chars_ccs( + ph.indicator = "hos1803000_v1", + ph.data = mock_chars, + myinstructions = mixed_instructions, + chars.defs = mock_chars_def + ) + + # Check if result has expected structure + expect_true(is.data.table(result)) + expect_equal(names(result), expectedCols) + + # Verify that year ranges are formatted correctly + expect_true(all(grepl("^\\d{4}-\\d{4}$", result$year))) + + # Verify that the range spans the transition + expect_true(any(grepl("201[3-5]-201[6-9]", result$year))) + + # Verify we have all cat1_group values + expect_equal(unique(c("Hispanic", "Non-Hispanic")), unique(result$cat1_group) ) + + # Verify we have the proper year spans + expect_equal(sort(unique(result$year)), + sort(unique(mixed_instructions[, year := paste0(start, '-', end)]$year))) + + # Check if have the same number of observations for each age + expect_equal(uniqueN(result[, .N, chi_age]$N), 1) +}) + +# Test that instructions with different indicator variables work ---- +test_that("chi_chars_ccs handles different indicator variables correctly", { + # Instructions with two category variables + cat2_instructions <- mock_instructions[!is.na(cat2_varname)] + + # Run function + result <- rbindlist(lapply(c("hos1803000_v1", "hos1803000_v2"), function(indicator) { + chi_chars_ccs( + ph.indicator = indicator, + ph.data = mock_chars, + myinstructions = cat2_instructions, + chars.defs = mock_chars_def) + }), fill = TRUE) + + # Check if result has expected structure + expect_true(is.data.table(result)) + expect_equal(names(result), expectedCols) + + # Check that both category values are correctly filled + expect_true(all(!is.na(result$cat1_group))) + expect_true(all(!is.na(result$cat2_group))) + + # Check all ph.indicator (indicator_keys) exist + expect_equal(sort(unique(cat2_instructions$indicator_key)), + sort(unique(result$indicator_key))) + + # Check cat1_group and cat2_group are all there + expect_equal( + setorder(unique(result[, .(cat1_group, cat2_group)]), cat1_group, cat2_group), + data.table(cat1_group = rep(c("Hispanic", "Non-Hispanic"), each = 2), cat2_group = rep(c("Female", "Male"), times = 2)) + ) + + # Check if have the same number of observations for each age (within an indicator) + expect_equal(uniqueN(result[indicator_key == "hos1803000_v1", .N, chi_age]$N), 1) + expect_equal(uniqueN(result[indicator_key == "hos1803000_v2", .N, chi_age]$N), 1) +}) + +# Test that WA state filtering works ---- +test_that("chi_chars_ccs handles WA state filtering correctly", { + # Instructions for WA state + wa_instructions <- mock_instructions[tab == "_wastate"] + + # Run function + result <- chi_chars_ccs( + ph.indicator = "hos1803000_v1", + ph.data = mock_chars, + myinstructions = wa_instructions, + chars.defs = mock_chars_def + ) + + # Check if result has expected structure + expect_true(is.data.table(result)) + expect_equal(names(result), expectedCols) + + # Check that all rows are for WA state + expect_true(all(result$tab == "_wastate")) + + # Check if have the same number of observations for each age + expect_equal(uniqueN(result[, .N, chi_age]$N), 1) +}) + +# Test child-specific age filtering ---- +test_that("chi_chars_ccs handles age filtering correctly", { + # Test children-only indicator (hos1803000_v2 has age_end = 17) + result <- chi_chars_ccs( + ph.indicator = "hos1803000_v2", + ph.data = mock_chars, + myinstructions = mock_instructions[indicator_key == "hos1803000_v2"], + chars.defs = mock_chars_def + ) + + # Check if result has expected structure + expect_true(is.data.table(result)) + expect_equal(names(result), expectedCols) + + # check correct age range + expect_true(all(result$chi_age %in% 0:17)) + + # Check if have the same number of observations for each age + expect_equal(uniqueN(result[, .N, chi_age]$N), 1) +}) + +# Test what happens when instructions filter out all rows---- +test_that("When some instructions filter out all rows, expect it to work but returning warnings", { + warnings <- capture_warnings({ + result <- rbindlist(lapply(c("hos1803000_v1", "hos1803000_v2"), function(indicator) { + chi_chars_ccs( + ph.indicator = indicator, + ph.data = copy(mock_chars)[, chi_geo_kc := 'KC'], + myinstructions = mock_instructions, + chars.defs = mock_chars_def) + }), fill = TRUE) + }) + + # Check that we got exactly 2 warnings + expect_length(warnings, 2) + + # Check content of first warning + expect_match(warnings[1], + "No data found for the following 2 instruction\\(s\\) for indicator 'hos1803000_v1'") + + # Check content of second warning + expect_match(warnings[2], + "No data found for the following 1 instruction\\(s\\) for indicator 'hos1803000_v2'") + + expect_equal( # expect three of the instructions 'worked' because _wastate was not corrupted + 3, + uniqueN(result[, list(indicator_key, tab, cat1, cat1_varname, cat2, cat2_varname, + start = as.numeric(substr(year, 1, 4)), + end = as.numeric(substr(year, nchar(year) - 3, nchar(year))) )]) + ) + +}) diff --git a/tests/testthat/test-chi_chars_injury.R b/tests/testthat/test-chi_chars_injury.R new file mode 100644 index 0000000..5506769 --- /dev/null +++ b/tests/testthat/test-chi_chars_injury.R @@ -0,0 +1,399 @@ +# Create mock_chars data for injury ---- +set.seed(98104) + +# Create 1000 sample CHARS records +n_samples <- 1000 + +# Define all injury mechanisms and intents as needed by chars_injury_matrix_count +mechanisms <- c("any", "bites_stings", "cut_pierce", "drowning", "fall", "fire", + "fire_burn", "firearm", "machinery", "motor_vehicle_nontraffic", + "mvt_motorcyclist", "mvt_occupant", "mvt_other", "mvt_pedal_cyclist", + "mvt_pedestrian", "mvt_traffic", "mvt_unspecified", "natural", + "natural_environmental", "other", "other_land_transport", "other_spec", + "other_specified", "other_transport", "overexertion", "pedal_cyclist", + "pedestrian", "poisoning", "poisoning_drug", "poisoning_nondrug", "struck", + "struck_by_against", "suffocation", "transport_other", "unspecified") + +intents <- c("any", "unintentional", "intentional", "assault", "legal", "undetermined") + +# Initialize the data.table with demographic columns +mock_chars <- data.table( + seq_no = 1:n_samples, # Unique identifier + injury_nature_narrow = sample(c(TRUE, FALSE), n_samples, replace = TRUE, prob = c(0.8, 0.2)), + injury_nature_broad = NA, + chi_year = sample(2012:2022, n_samples, replace = TRUE), + chi_age = sample(0:99, n_samples, replace = TRUE), + chi_geo_kc = rep("King County", n_samples), + wastate = rep("Washington State", n_samples), + race3_hispanic = sample(c("Hispanic", "Non-Hispanic"), n_samples, replace = TRUE), + chi_geo_region = sample(c("Seattle", "South", "East", "North"), n_samples, replace = TRUE), + chi_sex = sample(c("Female", "Male"), n_samples, replace = TRUE) +) +mock_chars[, injury_nature_broad := !injury_nature_narrow] # Define broad based on narrow + + +# Add mechanism columns (all 0 initially) +for (mech in mechanisms) { + mock_chars[, paste0("mechanism_", mech) := 0] +} + +# Add intent columns (all 0 initially) +for (int in intents) { + mock_chars[, paste0("intent_", int) := 0] +} + +# First, set our fixed test cases +# Set exactly 100 fall injuries with unintentional intent (rows 1-100) +mock_chars[1:100, `:=`( + mechanism_fall = 1, + intent_unintentional = 1, + injury_intent = "unintentional" +)] + +# Set exactly 50 poisoning self-harm cases (rows 101-150) +mock_chars[101:150, `:=`( + mechanism_poisoning = 1, + intent_intentional = 1, + injury_intent = "intentional" +)] + +# Now randomly assign mechanism and intent to the remaining records (151-1000) +remaining_rows <- 151:n_samples +for (i in remaining_rows) { + # Select random mechanism and intent + mech <- sample(mechanisms, 1) + int <- sample(intents, 1) + + # Set the selected mechanism and intent to 1 + mock_chars[i, paste0("mechanism_", mech) := 1] + mock_chars[i, paste0("intent_", int) := 1] + + # Set the injury_intent string value based on which intent_* column is 1 + # Skip "any" since it's not a real intent category for injury_intent + if (int != "any") { + mock_chars[i, injury_intent := int] + } else { + # If "any" was selected, choose one of the real intents + real_int <- sample(setdiff(intents, "any"), 1) + mock_chars[i, injury_intent := real_int] + mock_chars[i, paste0("intent_", real_int) := 1] + } +} + +# Create mock_instructions ---- +mock_instructions <- data.table( + indicator_key = rep(c("hos1901000_v1", "hos1901000_v2"), 3), + tab = rep(c("trends", "_wastate"), each = 3), + cat1 = rep("Ethnicity", 6), + cat1_varname = rep("race3_hispanic", 6), + cat2 = c(NA_character_, NA_character_, "Sex", "Sex", NA_character_, NA_character_), + cat2_varname = c(NA_character_, NA_character_, "chi_sex", "chi_sex", NA_character_, NA_character_), + end = c(2017, 2022, 2017, 2022, 2015, 2015), + start = c(2012, 2018, 2014, 2016, 2012, 2012) +) + +# Create mock_chars_def ---- +mock_chars_def <- data.table( + indicator_name = c("Fall injuries (all ages)", "Fall injuries (children)"), + indicator_key = c("hos1901000_v1", "hos1901000_v2"), + intent = c("unintentional", "unintentional"), + mechanism = c("fall", "fall"), + age_start = c(0, 0), + age_end = c(120, 17) +) + +# Add poisoning self-harm indicator +mock_chars_def <- rbind(mock_chars_def, data.table( + indicator_name = "Self-harm poisoning", + indicator_key = "hos1902000_v1", + intent = "intentional", # Changed from intentional_self_harm to match intent column names + mechanism = "poisoning", + age_start = 10, + age_end = 120 +)) + +# Add row to instructions for self-harm +mock_instructions <- rbind(mock_instructions, data.table( + indicator_key = "hos1902000_v1", + tab = "trends", + cat1 = "Ethnicity", + cat1_varname = "race3_hispanic", + cat2 = "Sex", + cat2_varname = "chi_sex", + end = 2022, + start = 2012 +)) + +# Create vector of expected column order ---- +expectedCols <- c('indicator_key', 'year', 'chi_age', 'hospitalizations', 'tab', 'cat1', 'cat1_varname', 'cat1_group', 'cat2', 'cat2_varname', 'cat2_group') + +# Test validation ---- +test_that("chi_chars_injury validates inputs correctly", { + # Test missing ph.indicator + expect_error(chi_chars_injury(ph.indicator = NA, + ph.data = mock_chars, + myinstructions = mock_instructions, + chars.defs = mock_chars_def), + "ph.indicator must be provided") + + # Test missing ph.data + expect_error(chi_chars_injury(ph.indicator = "hos1901000_v1", + ph.data = NULL, + myinstructions = mock_instructions, + chars.defs = mock_chars_def), + "ph.data must be specified") + + # Test indicator not found in instructions + expect_error(chi_chars_injury(ph.indicator = "not_an_indicator", + ph.data = mock_chars, + myinstructions = mock_instructions, + chars.defs = mock_chars_def), + "not found in myinstructions") + + # Test invalid column in instructions + bad_instructions <- copy(mock_instructions) + bad_instructions[1, cat1_varname := "not_a_column"] + + expect_error(chi_chars_injury(ph.indicator = "hos1901000_v1", + ph.data = mock_chars, + myinstructions = bad_instructions, + chars.defs = mock_chars_def), + "don't exist in ph.data") + + # Test invalid def parameter + expect_error(chi_chars_injury(ph.indicator = "hos1901000_v1", + ph.data = mock_chars, + myinstructions = mock_instructions, + chars.defs = mock_chars_def, + def = "invalid_def"), + "must be either 'narrow' or 'broad'") +}) + +# Test function processes fall injury data correctly ---- +test_that("chi_chars_injury processes fall injury data correctly", { + # Run function with default parameters + result <- chi_chars_injury( + ph.indicator = "hos1901000_v1", + ph.data = mock_chars, + myinstructions = mock_instructions[indicator_key == "hos1901000_v1"], + chars.defs = mock_chars_def + ) + + # Check if result has expected structure + expect_true(is.data.table(result)) + expect_equal(names(result), expectedCols) + + # Check that data matches the fall/unintentional criteria + expect_true(sum(result$hospitalizations) > 0) + + # Check age filtering worked correctly + expect_gt(max(as.numeric(result$chi_age)), 50) # is for all ages, so should definitely have some > 50 yrs old + + # Check if have consistent number of rows per age + expect_equal(uniqueN(result[, .N, chi_age]$N), 1) +}) + +# Test function handles age filtering correctly ---- +test_that("chi_chars_injury handles age filtering correctly", { + # Test children-only indicator (hos1901000_v2 has age_end = 17) + result <- chi_chars_injury( + ph.indicator = "hos1901000_v2", + ph.data = mock_chars, + myinstructions = mock_instructions[indicator_key == "hos1901000_v2"], + chars.defs = mock_chars_def + ) + + # Check if result has expected structure + expect_true(is.data.table(result)) + expect_equal(names(result), expectedCols) + + # Check correct age range + expect_true(all(result$chi_age %in% 0:17)) + + # Check if have the same number of observations for each age + expect_equal(uniqueN(result[, .N, chi_age]$N), 1) +}) + +# Test function handles different injury types correctly ---- +test_that("chi_chars_injury handles different injury types correctly", { + # Test self-harm poisoning indicator + result <- chi_chars_injury( + ph.indicator = "hos1902000_v1", + ph.data = mock_chars, + myinstructions = mock_instructions[indicator_key == "hos1902000_v1"], + chars.defs = mock_chars_def + ) + + # Check if result has expected structure + expect_true(is.data.table(result)) + expect_equal(names(result), expectedCols) + + # Check that we have counts > 0 + expect_true(sum(result$hospitalizations) > 0) + + # Check age filtering worked correctly (min age should be 10) + expect_equal(min(as.numeric(result$chi_age)), 10) +}) + +# Test def parameter variation ---- +test_that("chi_chars_injury handles 'def' parameter correctly", { + # Test with narrow definition (default) + narrow_result <- chi_chars_injury( + ph.indicator = "hos1901000_v1", + ph.data = mock_chars, + myinstructions = mock_instructions[indicator_key == "hos1901000_v1"][1], + chars.defs = mock_chars_def, + def = "narrow" + ) + + # Test with broad definition + broad_result <- chi_chars_injury( + ph.indicator = "hos1901000_v1", + ph.data = mock_chars, + myinstructions = mock_instructions[indicator_key == "hos1901000_v1"][1], + chars.defs = mock_chars_def, + def = "broad" + ) + + # Both should have valid structure + expect_true(is.data.table(narrow_result)) + expect_true(is.data.table(broad_result)) + + # Results can be the same in our test data, but function should run without error + expect_true(TRUE) +}) + +# Test that WA state filtering works ---- +test_that("chi_chars_injury handles WA state filtering correctly", { + # Instructions for WA state + wa_instructions <- mock_instructions[tab == "_wastate" & indicator_key == "hos1901000_v1"] + + # Run function + result <- chi_chars_injury( + ph.indicator = "hos1901000_v1", + ph.data = mock_chars, + myinstructions = wa_instructions, + chars.defs = mock_chars_def + ) + + # Check if result has expected structure + expect_true(is.data.table(result)) + expect_equal(names(result), expectedCols) + + # Check that all rows are for WA state + expect_true(all(result$tab == "_wastate")) + + # Check if have the same number of observations for each age + expect_equal(uniqueN(result[, .N, chi_age]$N), 1) +}) + +# Test processing multiple instructions ---- +test_that("chi_chars_injury processes multiple instructions correctly", { + # Use multiple instructions for the same indicator + multiple_instructions <- mock_instructions[indicator_key == "hos1901000_v1"] + + # Run function + result <- chi_chars_injury( + ph.indicator = "hos1901000_v1", + ph.data = mock_chars, + myinstructions = multiple_instructions, + chars.defs = mock_chars_def + ) + + # Check if result has expected structure with multiple rows + expect_true(is.data.table(result)) + expect_equal(names(result), expectedCols) + expect_true(nrow(result) > 0) + + # confirm we processed the all of the instruction sets + expect_equal(uniqueN(result[, .(indicator_key, tab, cat1_varname, cat2_varname)]), + nrow(multiple_instructions)) + + # Check if have the same number of observations for each age + expect_equal(uniqueN(result[, .N, chi_age]$N), 1) +}) + +# Test what happens when instructions filter out all rows---- +test_that("When some instructions filter out all rows, expect it to work but returning warnings", { + warnings <- capture_warnings({ + result <- rbindlist(lapply(c("hos1901000_v1", "hos1901000_v2"), function(indicator) { + chi_chars_injury( + ph.indicator = indicator, + ph.data = copy(mock_chars)[, chi_geo_kc := 'KC'], # Corrupt the data + myinstructions = mock_instructions[indicator_key %in% c("hos1901000_v1", "hos1901000_v2")], + chars.defs = mock_chars_def) + }), fill = TRUE) + }) + + # Check that we got exactly 2 warnings (1 per indicator) + expect_length(warnings, 2) + + # Check content of first warning + expect_match(warnings[1], + "No data found for the following .* instruction\\(s\\) for indicator 'hos1901000_v1'") + + # Check content of second warning + expect_match(warnings[2], + "No data found for the following .* instruction\\(s\\) for indicator 'hos1901000_v2'") + + # Check for any data from _wastate (which should still work) + expect_gt(nrow(result), 0) +}) + +# *Test year restriction for injury data (only 2012+) ---- +test_that("chi_chars_injury correctly handles pre-2012 years", { + # Create instructions with pre-2012 years + early_instructions <- copy(mock_instructions[1]) + early_instructions[, `:=`(start = 2010, end = 2015)] + + # Run the function with pre-2012 years + result <- chi_chars_injury( + ph.indicator = "hos1901000_v1", + ph.data = mock_chars, + myinstructions = early_instructions, + chars.defs = mock_chars_def + ) + + # Check the results + expect_true(is.data.table(result)) + expect_true(nrow(result) > 0) + + # The year range in the result should be 2012-2015, not 2010-2015 + expect_true(all(result$year == "2012-2015")) +}) + +# Test poisoning mechanism handling with drug & non-drug ---- +test_that("chi_chars_injury correctly handles poisoning with ICD-10", { + # Create mock data with both poisoning_drug and poisoning_nondrug + poisoning_test_chars <- copy(mock_chars) + + # Add some specific poisoning_drug and poisoning_nondrug cases + poisoning_test_chars[201:225, `:=`( + mechanism_poisoning_drug = 1, + mechanism_poisoning = 0, + intent_intentional = 1, + injury_intent = "intentional" + )] + + poisoning_test_chars[226:250, `:=`( + mechanism_poisoning_nondrug = 1, + mechanism_poisoning = 0, + intent_intentional = 1, + injury_intent = "intentional" + )] + + # Create a test for poisoning indicator (which may need to collapse drug & non-drug) + result <- chi_chars_injury( + ph.indicator = "hos1902000_v1", + ph.data = poisoning_test_chars, + myinstructions = mock_instructions[indicator_key == "hos1902000_v1"], + chars.defs = mock_chars_def + ) + + # Check the results + expect_true(is.data.table(result)) + expect_true(nrow(result) > 0) + + # The result should include data from the poisoning mechanism + expect_true(sum(result$hospitalizations) > 0) +}) diff --git a/tests/testthat/test-chi_generate_analysis_set.R b/tests/testthat/test-chi_generate_analysis_set.R index c5a0e95..279bb31 100644 --- a/tests/testthat/test-chi_generate_analysis_set.R +++ b/tests/testthat/test-chi_generate_analysis_set.R @@ -4,3 +4,11 @@ test_that("chi_generate_analysis_set validates inputs", { expect_error(chi_generate_analysis_set(data_source = 123), "data_source must be a single character string") expect_error(chi_generate_analysis_set(CHIestimates = 123), "CHIestimates must be a data.table or data.frame") }) + +test_that("chi_generate_analysis_set generates expected analysis set from CHI estimates", { + TestData <- setup_test_data() + DT_test_analysis_instructions_results <- apde.chi.tools::chi_generate_tro_shell(TestData$my.analysis_set_twosets, end.year = 2023, year.span = 3, trend.span = 3, trend.periods = 5) + DT_test_chi_calc_output <- suppressWarnings(apde.chi.tools::chi_calc(ph.data = TestData$my.generic_data, ph.instructions = DT_test_analysis_instructions_results, source_name = "generic_test", source_date = as.Date("2025-04-10"), ci = .80)) + DT_recreated_analysis_set <- suppressWarnings(chi_generate_analysis_set(CHIestimates = DT_test_chi_calc_output)) + expect_equal(all.equal(TestData$my.analysis_set_twosets[order(cat1,set)] , DT_recreated_analysis_set[order(cat1,set)]),TRUE) +}) diff --git a/tests/testthat/test-chi_generate_tro_shell.R b/tests/testthat/test-chi_generate_tro_shell.R index 5bf4a43..18370b7 100644 --- a/tests/testthat/test-chi_generate_tro_shell.R +++ b/tests/testthat/test-chi_generate_tro_shell.R @@ -2,8 +2,8 @@ test_that("chi_generate_tro_shell validates inputs correctly", { test_data <- setup_test_data() expect_error(chi_generate_tro_shell(), "ph.analysis_set must be provided") - expect_error(chi_generate_tro_shell(data.frame(), start.year = "2023"), - "start.year must be a single numeric value") + expect_error(chi_generate_tro_shell(data.frame()), + "set number must be provided for all rows") }) test_that("ingest template format", { @@ -29,7 +29,6 @@ test_that("ingest template format", { trends, set_indicator_keys) DT <- chi_generate_tro_shell(ph.analysis_set = template, - start.year = 2021, end.year = 2022, year.span = 5, trend.span = 3, diff --git a/tests/testthat/test-chi_keep_proper_ages.R b/tests/testthat/test-chi_keep_proper_ages.R new file mode 100644 index 0000000..f5b88d3 --- /dev/null +++ b/tests/testthat/test-chi_keep_proper_ages.R @@ -0,0 +1,172 @@ +# Helper function to create test data (not in helper.R because very specific to this function) +create_test_data <- function() { + cat1_age6 <- data.table(cat1 = 'Age', cat1_varname = 'age6', + cat1_group = c('<18', '18-24', '25-44', '45-64', '65-74', '75+'), + mykey = 1) + cat2_sex <- data.table(cat2 = 'Gender', cat2_varname = 'chi_sex', + cat2_group = c('Female', 'Male'), mykey = 1) + cat1_sex <- setnames(copy(cat2_sex), gsub('cat2', 'cat1', names(cat2_sex))) + cat2_yage4 <- data.table(cat2 = 'Age', cat2_varname = 'yage4', + cat2_group = c('0-4', '10-14', '15-17', '5-9'), mykey = 1) + cat1_geo <- data.table(cat1 = 'King County', cat1_varname = 'chi_geo_kc', + cat1_group = 'King County', mykey = 1) + chi_ages <- data.table(chi_age = 0:100, mykey = 1) + + test.counts <- rbind( + merge(cat1_age6, cat2_sex, allow.cartesian = TRUE), + merge(cat1_sex, cat2_yage4, allow.cartesian = TRUE), + merge(cat1_geo, cat2_sex, allow.cartesian = TRUE)) + + test.counts <- merge(test.counts, chi_ages, allow.cartesian = TRUE)[, mykey := NULL] + + test.counts[, `:=` (indicator_key = 'indicator1', + year = '2020', + tab = 'mytab', + count = 1)] + + return(test.counts) +} + +test_that("Input validation works correctly", { + # Test NULL input + expect_error(chi_keep_proper_ages(NULL), "ph.data must be provided") + + # Test missing columns + incomplete_data <- data.table(cat1 = "Age", cat1_group = "<18") + expect_error(chi_keep_proper_ages(incomplete_data), "ph.data is missing required columns") + + # Test incorrect column classes + bad_class_data <- create_test_data() + bad_class_data[, chi_age := as.character(chi_age)] + expect_error(chi_keep_proper_ages(bad_class_data), "should be of class 'numeric or integer'") +}) + +test_that("Age filtering works correctly for cat1 Age groups", { + # Create test data + test.data <- create_test_data() + + # Filter data + filtered_data <- chi_keep_proper_ages(test.data) + + # Test age group '<18' + age_group_rows <- filtered_data[cat1 == "Age" & cat1_group == "<18"] + expect_true(all(age_group_rows$chi_age < 18)) + + # Test age group '18-24' + age_group_rows <- filtered_data[cat1 == "Age" & cat1_group == "18-24"] + expect_true(all(age_group_rows$chi_age >= 18 & age_group_rows$chi_age <= 24)) + + # Test age group '25-44' + age_group_rows <- filtered_data[cat1 == "Age" & cat1_group == "25-44"] + expect_true(all(age_group_rows$chi_age >= 25 & age_group_rows$chi_age <= 44)) + + # Test age group '45-64' + age_group_rows <- filtered_data[cat1 == "Age" & cat1_group == "45-64"] + expect_true(all(age_group_rows$chi_age >= 45 & age_group_rows$chi_age <= 64)) + + # Test age group '65-74' + age_group_rows <- filtered_data[cat1 == "Age" & cat1_group == "65-74"] + expect_true(all(age_group_rows$chi_age >= 65 & age_group_rows$chi_age <= 74)) + + # Test age group '75+' + age_group_rows <- filtered_data[cat1 == "Age" & cat1_group == "75+"] + expect_true(all(age_group_rows$chi_age >= 75)) +}) + +test_that("Age filtering works correctly for cat2 Age groups", { + # Create test data + test.data <- create_test_data() + + # Filter data + filtered_data <- chi_keep_proper_ages(test.data) + + # Test age group '0-4' + age_group_rows <- filtered_data[cat2 == "Age" & cat2_group == "0-4"] + expect_true(all(age_group_rows$chi_age >= 0 & age_group_rows$chi_age <= 4)) + + # Test age group '5-9' + age_group_rows <- filtered_data[cat2 == "Age" & cat2_group == "5-9"] + expect_true(all(age_group_rows$chi_age >= 5 & age_group_rows$chi_age <= 9)) + + # Test age group '10-14' + age_group_rows <- filtered_data[cat2 == "Age" & cat2_group == "10-14"] + expect_true(all(age_group_rows$chi_age >= 10 & age_group_rows$chi_age <= 14)) + + # Test age group '15-17' + age_group_rows <- filtered_data[cat2 == "Age" & cat2_group == "15-17"] + expect_true(all(age_group_rows$chi_age >= 15 & age_group_rows$chi_age <= 17)) +}) + +test_that("Non-Age categories are not filtered", { + # Create test data + test.data <- create_test_data() + + # Filter data + filtered_data <- chi_keep_proper_ages(test.data) + + # Test for geographic category (completely unrelated to Age) + geo_rows_original <- test.data[cat1 == "King County"] + geo_rows_filtered <- filtered_data[cat1 == "King County"] + + # For completely non-Age categories, all rows should remain + # (note: filtering only applies when cat1 or cat2 is "Age") + expect_equal(nrow(geo_rows_original), nrow(geo_rows_filtered)) + + # Check that all unique chi_age values remain for King County rows + geo_original_ages <- sort(unique(geo_rows_original$chi_age)) + geo_filtered_ages <- sort(unique(geo_rows_filtered$chi_age)) + + expect_equal(geo_original_ages, geo_filtered_ages) + + # Also check that Gender categories not paired with Age are preserved + gender_only_original <- test.data[cat1 == "Gender" & cat2 != "Age"] + gender_only_filtered <- filtered_data[cat1 == "Gender" & cat2 != "Age"] + + expect_equal(nrow(gender_only_original), nrow(gender_only_filtered)) +}) + +test_that("Count sums decrease after filtering", { + # Create test data + test.data <- create_test_data() + + # Sum of counts before filtering + original_count_sum <- sum(test.data$count) + + # Filter data + filtered_data <- chi_keep_proper_ages(test.data) + + # Sum of counts after filtering + filtered_count_sum <- sum(filtered_data$count) + + # There should be fewer rows after filtering + expect_true(nrow(filtered_data) < nrow(test.data)) + + # Sum of counts should decrease + expect_true(filtered_count_sum < original_count_sum) + + # Calculate exact expected count + # For each age group, count rows that should be filtered out + expected_filtered_rows <- test.data[ + (cat1 == "Age" & cat1_group == "<18" & chi_age >= 18) | + (cat1 == "Age" & cat1_group == "18-24" & (chi_age < 18 | chi_age > 24)) | + (cat1 == "Age" & cat1_group == "25-44" & (chi_age < 25 | chi_age > 44)) | + (cat1 == "Age" & cat1_group == "45-64" & (chi_age < 45 | chi_age > 64)) | + (cat1 == "Age" & cat1_group == "65-74" & (chi_age < 65 | chi_age > 74)) | + (cat1 == "Age" & cat1_group == "75+" & chi_age < 75) | + (cat2 == "Age" & cat2_group == "0-4" & (chi_age < 0 | chi_age > 4)) | + (cat2 == "Age" & cat2_group == "5-9" & (chi_age < 5 | chi_age > 9)) | + (cat2 == "Age" & cat2_group == "10-14" & (chi_age < 10 | chi_age > 14)) | + (cat2 == "Age" & cat2_group == "15-17" & (chi_age < 15 | chi_age > 17)) + ] + + # Count rows that should remain + rows_with_age_categories <- test.data[(cat1 == "Age") | (cat2 == "Age")] + non_age_categories_rows <- test.data[(cat1 != "Age") & (cat2 != "Age")] + + expected_age_rows_after_filtering <- nrow(rows_with_age_categories) - nrow(expected_filtered_rows) + expected_total_rows <- expected_age_rows_after_filtering + nrow(non_age_categories_rows) + + # Check if we have the expected number of rows after filtering + expect_equal(nrow(filtered_data), expected_total_rows) +}) + diff --git a/tests/testthat/test-chi_process_trends.R b/tests/testthat/test-chi_process_trends.R deleted file mode 100644 index f9ba495..0000000 --- a/tests/testthat/test-chi_process_trends.R +++ /dev/null @@ -1,5 +0,0 @@ -test_that("calculates trends", { - # chi_generate_trend_years does not exist - # DT <- chi_generate_trend_years(indicator_key = c("test1", "test2"),span = 3,begin.year = 2009,final.year = 2023) - expect_identical(1L, 1L) # a dummy test because devtools::check does not allow empty test_that statements -}) diff --git a/tests/testthat/test-chi_qa.R b/tests/testthat/test-chi_qa.R deleted file mode 100644 index 8849056..0000000 --- a/tests/testthat/test-chi_qa.R +++ /dev/null @@ -1,3 +0,0 @@ -test_that("multiplication works", { - expect_equal(2 * 2, 4) -}) diff --git a/tests/testthat/test-chi_sql_update.R b/tests/testthat/test-chi_sql_update.R index 3962127..5d0ab50 100644 --- a/tests/testthat/test-chi_sql_update.R +++ b/tests/testthat/test-chi_sql_update.R @@ -2,17 +2,21 @@ test_that("chi_update_sql validates inputs", { test_data <- setup_test_data() - expect_warning( - chi_update_sql( - CHIestimates = test_data$my.estimate, - CHImetadata = test_data$my.metadata, - table_name = 'JustTesting', - server = 'development', - replace_table = FALSE - ), - "Validation may be flawed for the following variables because they are 100% missing" - ) - + con_status <- test_data$my.hhsaw_status_test() + if(con_status == 1) { + expect_warning( + chi_update_sql( + CHIestimates = test_data$my.estimate, + CHImetadata = test_data$my.metadata, + table_name = 'JustTesting', + server = 'development', + replace_table = FALSE + ), + "Validation may be flawed for the following variables because they are 100% missing" + ) + } else { + message("connection test skipped") + } expect_error(chi_update_sql(), "The results table to push to SQL \\(CHIestimates\\) is missing") expect_error(suppressWarnings(chi_update_sql(CHIestimates = test_data$my.estimate)), diff --git a/tests/testthat/test_chi_generate_metadata.R b/tests/testthat/test_chi_generate_metadata.R index 41663c1..9f050ce 100644 --- a/tests/testthat/test_chi_generate_metadata.R +++ b/tests/testthat/test_chi_generate_metadata.R @@ -4,4 +4,10 @@ test_that("chi_generate_metadata handles valid inputs", { expect_error(chi_generate_metadata(), "meta.old must be provided") expect_error(chi_generate_metadata(meta.old = test_data$my.metadata), "est.current must be provided") + + # why does this test fail? the DTs are not properly constructed, perhaps I need to update packages? will try later + #DTtest <- test_data$my.estimate + #DTtest[tab,] # throws error + #chi_generate_metadata(meta.old = test_data$my.metadata, est.current = test_data$my.estimate) + })