From 7b950dd74b76bf5a1b18eae2d399a015542e15f6 Mon Sep 17 00:00:00 2001 From: Danny Colombara Date: Thu, 3 Apr 2025 08:11:29 -0700 Subject: [PATCH 1/9] Created chi_chars_ccs & chi_chars_injury & tests - Both functions tabulate CHARS hosp. counts by demographics, similar to chi_calc() - chi_chars_ccs handles CCS categories; chi_chars_injury handles injury matrix - Added comprehensive testthat tests for both functions --- NAMESPACE | 4 + R/chi_chars_ccs.R | 423 +++++++++++++++++++++++++ R/chi_chars_injury.R | 341 ++++++++++++++++++++ man/chi_chars_ccs.Rd | 72 +++++ man/chi_chars_injury.Rd | 76 +++++ tests/testthat/test-chi_chars_ccs.R | 387 ++++++++++++++++++++++ tests/testthat/test-chi_chars_injury.R | 403 +++++++++++++++++++++++ 7 files changed, 1706 insertions(+) create mode 100644 R/chi_chars_ccs.R create mode 100644 R/chi_chars_injury.R create mode 100644 man/chi_chars_ccs.Rd create mode 100644 man/chi_chars_injury.Rd create mode 100644 tests/testthat/test-chi_chars_ccs.R create mode 100644 tests/testthat/test-chi_chars_injury.R diff --git a/NAMESPACE b/NAMESPACE index de2cc8a..7b531cf 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,6 +1,8 @@ # Generated by roxygen2: do not edit by hand export(chi_calc) +export(chi_chars_ccs) +export(chi_chars_injury) export(chi_compare_estimates) export(chi_count_by_age) export(chi_drop_illogical_ages) @@ -13,10 +15,12 @@ export(chi_get_proper_pop) export(chi_get_yaml) export(chi_qa_tro) export(chi_update_sql) +import(data.table) import(dtsurvey) import(future) import(future.apply) import(progressr) +import(rads) importFrom(DBI,Id) importFrom(DBI,dbConnect) importFrom(DBI,dbDisconnect) diff --git a/R/chi_chars_ccs.R b/R/chi_chars_ccs.R new file mode 100644 index 0000000..fc85395 --- /dev/null +++ b/R/chi_chars_ccs.R @@ -0,0 +1,423 @@ +# chi_chars_ccs ---- +#' Generate CHI CHARS counts by CCS category +#' +#' @description +#' Generate hospitalization counts from Comprehensive Hospital Abstract Reporting +#' System (CHARS) data based on Clinical Classification Software (CCS) categories +#' for CHI. This function processes instructions for specific indicators and +#' summarizes CHARS data accordingly. It automatically handles the ICD-9 to ICD-10 +#' transition that occurred in 2016. +#' +#' @param ph.indicator A character string of length 1. The indicator key to process, +#' which must exist in the chars.defs data table. +#' @param ph.data A data.table containing the CHARS data to be processed. +#' @param myinstructions A data.table containing processing instructions for each indicator. +#' Default is the output from chi_generate_tro_shell(). +#' @param chars.defs A data.table containing definitions for each indicator. It +#' should have the following columns: `indicator_name`, `indicator_key`, `intent`, +#' `mechanism`, `superlevel`, `broad`, `midlevel`, `detailed`, `age_start`, and +#' `age_end`. +#' +#' @return A data.table containing the summarized CHARS hospitalization data for the +#' specified indicator, stratified by the requested demographic variables. +#' +#' @details +#' The function processes multiple instructions for the same indicator sequentially. +#' +#' The function automatically detects whether data spans the ICD-9 to ICD-10 transition in 2016 +#' and processes each part with the appropriate ICD version. Results are then combined seamlessly. +#' +#' @examples +#' \dontrun{ +#' # Example of how to run with future_lapply for memory efficiency +#' library(future) +#' library(future.apply) +#' +#' plan(multisession, workers = future::availableCores() - 1) +#' +#' countsCCS <- rbindlist(future_lapply(VectorOfIndicators, function(indicator) { +#' chi_chars_ccs( +#' ph.indicator = indicator, +#' ph.data = chars, +#' myinstructions = myinstructions, +#' chars.defs = chars.defs) +#' }, future.seed = TRUE)) +#' +#' plan(sequential) +#' +#' } +#' +#' @seealso +#' \code{\link[rads]{get_data_chars}}, which provides creates ph.data +#' +#' \code{\link[rads]{chars_icd_ccs_count}}, which is the engine used by this function +#' +#' \code{\link{chi_generate_tro_shell}}, which creates myinstructions +#' +#' @import data.table +#' @import rads +#' @export +chi_chars_ccs <- function(ph.indicator = NA, + ph.data = NULL, + myinstructions = NULL, + chars.defs = NULL) { + # Input validation ----- + if (is.na(ph.indicator)) { + stop("\n\U1F6D1 ph.indicator must be provided.") + } + + for (arg_name in c("ph.data", "myinstructions", "chars.defs")) { + arg_value <- get(arg_name) + if (is.null(arg_value)) { + stop(paste0("\n\U1F6D1 ", arg_name, " must be specified.")) + } + if (is.data.frame(arg_value)) { + setDT(arg_value) + } else if (!is.data.frame(arg_value)) { + stop(paste0("\n\U1F6D1 ", arg_name, " must be a data.table or data.frame.")) + } + } + + # Ensure indicator exists + instructions <- myinstructions[indicator_key == ph.indicator] + + if (nrow(instructions) == 0) { + stop(paste0("\n\U1F6D1 ph.indicator '", ph.indicator, "' not found in myinstructions.")) + } + + if (!ph.indicator %in% chars.defs$indicator_key) { + stop(paste0("\n\U1F6D1 ph.indicator '", ph.indicator, "' not found in chars.defs.")) + } + + + # Ensure cat1_varname and cat2_varname exist in ph.data + unique_cat_vars <- unique(c( + instructions$cat1_varname[!is.na(instructions$cat1_varname)], + instructions$cat2_varname[!is.na(instructions$cat2_varname)] + )) + + missing_cols <- unique_cat_vars[!unique_cat_vars %in% names(ph.data)] + if (length(missing_cols) > 0) { + stop(paste0("\n\U1F47F The following columns referenced in instructions don't exist in ph.data: ", + paste(missing_cols, collapse = ", "))) + } + + # Extract indicator parameters + indicator_def <- chars.defs[indicator_key == ph.indicator] + age_start <- indicator_def$age_start + age_end <- indicator_def$age_end + + # Verify that this is not an injury indicator (those should use chi_chars_injury) + has_mechanism <- "mechanism" %in% names(indicator_def) && !is.na(indicator_def$mechanism) + has_intent <- "intent" %in% names(indicator_def) && !is.na(indicator_def$intent) + + if (has_mechanism || has_intent) { + stop(paste0("\n\U1F6D1 Indicator '", ph.indicator, "' appears to be an injury indicator. ", + "You must using chi_chars_injury() instead.")) + } + + # Check for required parameters + if (!all(c("superlevel", "broad", "midlevel", 'detailed') %in% names(indicator_def))) { + stop("\n\U1F6D1 chars.defs must contain columns 'superlevel', 'broad', 'midlevel', and 'detailed' for CCS indicators.") + } + + # Import CM reference table ---- + message("Loading ICD-CM reference table (this will happen only once per ph.indicator)") + if (any(instructions$start < 2016, na.rm = TRUE)){ICDCM9_table <- rads::chars_icd_ccs(icdcm_version = 9)} + ICDCM10_table <- rads::chars_icd_ccs(icdcm_version = 10) + + # Helper function to process ICD data (will be passed to lapply) ---- + process_icd_data <- function(icdcm_version, + current_start, + current_end, + current_row, + age_start, + age_end, + ph_data, + ICDCM_table, + indicator_def, + indicator_key) { + # Determine geographical filter based on tab ---- + is_wastate <- current_row$tab == "_wastate" + + # Filter data for appropriate years and geography ---- + if (is_wastate) { + data_subset <- data.table::copy(ph.data[wastate == 'Washington State' & + chi_year >= current_start & + chi_year <= current_end & + chi_age >= age_start & + chi_age <= age_end]) + } else { + data_subset <- data.table::copy(ph.data[chi_geo_kc == 'King County' & + chi_year >= current_start & + chi_year <= current_end & + chi_age >= age_start & + chi_age <= age_end]) + } + + if (nrow(data_subset) == 0) { + return(NULL) + } + + # Add year information ---- + data_subset[, year_range := paste0(current_start, "-", current_end)] + + # Get CCS table based on ICD version ---- + ICDCM_table<- if (icdcm_version == 9) ICDCM9_table else ICDCM10_table + + # Generate arguments for chars_icd_ccs_count ---- + superlevel_arg <- if (!is.na(indicator_def$superlevel)) indicator_def$superlevel else NULL + broad_arg <- if (!is.na(indicator_def$broad)) indicator_def$broad else NULL + midlevel_arg <- if (!is.na(indicator_def$midlevel)) indicator_def$midlevel else NULL + detailed_arg <- if (!is.na(indicator_def$detailed)) indicator_def$detailed else NULL + + # Generate group_by argument ---- + if (is.na(current_row$cat2_varname)) { + group_by_arg <- current_row$cat1_varname + } else { + group_by_arg <- c(current_row$cat1_varname, current_row$cat2_varname) + } + group_by_arg <- c('chi_age', unique(group_by_arg)) + + # Process data with rads::chars_icd_ccs_count---- + result <- rads::chars_icd_ccs_count( + ph.data = data_subset, + icdcm_version = icdcm_version, + CMtable= ICDCM_table, + icdcm = NULL, + superlevel = superlevel_arg, + broad = broad_arg, + midlevel = midlevel_arg, + detailed = detailed_arg, + icdcol = 'diag1', + group_by = group_by_arg, + kingco = FALSE # Already filtered data above, so always FALSE + ) + + # Format the results ---- + if (nrow(result) > 0) { + # Add metadata columns + result[, icd_version := icdcm_version] + result[, original_year_range := paste0(current_row$start, "-", current_row$end)] + result[, tab := current_row$tab] + result[, cat1 := current_row$cat1] + result[, cat2 := current_row$cat2] + result[, cat1_varname := gsub("_hispanic", "", current_row$cat1_varname)] + result[, cat2_varname := gsub("_hispanic", "", current_row$cat2_varname)] + + # Rename category columns + data.table::setnames(result, current_row$cat1_varname, "cat1_group") + + if (!is.na(current_row$cat2_varname)) { + data.table::setnames(result, current_row$cat2_varname, "cat2_group", skip_absent = TRUE) + } + + # Filter and format + result <- result[!is.na(cat1_group)] + result[cat2_varname == 'chi_geo_kc', cat2_group := 'King County'] + result <- result[is.na(cat2_varname) | (!is.na(cat2_varname) & !is.na(cat2_group))] + result[, ph.indicator := ph.indicator] + + } + + return(result) + } + + # Process sequentially ---- + message('\U023F3 Processing calculations... this may take a while') + + # Process each row + results_list <- lapply( + X = seq_len(nrow(instructions)), + FUN = function(i) { + # Progress meter ---- + current_row <- instructions[i] + row_info <- paste( + ph.indicator, + current_row$tab, + current_row$cat1, + current_row$cat1_varname, + current_row$cat2, + current_row$cat2_varname, + current_row$end, + current_row$start, + sep = "|" + ) + message(paste0("myinstructions row ", i, "/", nrow(instructions), ": \n - ", row_info)) + + # Determine if we need ICD-9, ICD-10, or both ---- + current_start <- as.integer(instructions[i, start]) + current_end <- as.integer(instructions[i, end]) + need_icd9 <- current_start < 2016 # Includes any year before 2016 + need_icd10 <- current_end >= 2016 # Includes any year from 2016 onwards + + # Results for this instruction row ---- + row_results <- list() + + # Process with ICD-9 if needed ---- + if (need_icd9) { + message(paste0(" - Processing ICD-9 portion (", current_start, "-", min(current_end, 2015), ")")) + row_results$icd9 <- process_icd_data( + icdcm_version = 9, + current_start = current_start, + current_end = min(current_end, 2015), + current_row = instructions[i,], + age_start = age_start, + age_end = age_end, + ph_data = ph.data, + ICDCM_table= ICDCM9_table, + indicator_def = indicator_def, + indicator_key = ph.indicator + ) + } + + # Process with ICD-10 if needed ---- + if (need_icd10) { + message(paste0(" - Processing ICD-10 portion (", max(current_start, 2016), "-", current_end, ")")) + row_results$icd10 <- process_icd_data( + icdcm_version = 10, + current_start = max(current_start, 2016), + current_end = current_end, + current_row = instructions[i,], + age_start = age_start, + age_end = age_end, + ph_data = ph.data, + ICDCM_table= ICDCM10_table, + indicator_def = indicator_def, + indicator_key = ph.indicator + ) + } + + # If we have both ICD-9 and ICD-10 results, merge them ---- + # When data spans both ICD-9 and ICD-10 periods, we need to merge the results + # This handles the ICD transition in 2016 by combining hospitalization counts + # while preserving all demographic stratifications + if (!is.null(row_results$icd9) && !is.null(row_results$icd10) && + nrow(row_results$icd9) > 0 && nrow(row_results$icd10) > 0) { + + # Identify common columns for merging + merge_cols <- intersect( + names(row_results$icd9)[!names(row_results$icd9) %in% c("hospitalizations", "icd_version", "year_range")], + names(row_results$icd10)[!names(row_results$icd10) %in% c("hospitalizations", "icd_version", "year_range")] + ) + + # Merge data + combined <- merge( + row_results$icd9, + row_results$icd10, + by = merge_cols, + suffixes = c("_icd9", "_icd10"), + all = TRUE + ) + + # Sum hospitalizations (handling missing values) + combined[, hospitalizations := sum( + ifelse(is.na(hospitalizations_icd9), 0, hospitalizations_icd9), + ifelse(is.na(hospitalizations_icd10), 0, hospitalizations_icd10) + ), by = merge_cols] + + # Set year to the full original range + combined[, year := original_year_range] + + return(combined) + } else if (!is.null(row_results$icd9) && nrow(row_results$icd9) > 0) { + # Only ICD-9 results available + row_results$icd9[, year := original_year_range] + return(row_results$icd9) + } else if (!is.null(row_results$icd10) && nrow(row_results$icd10) > 0) { + # Only ICD-10 results available + row_results$icd10[, year := original_year_range] + return(row_results$icd10) + } else { + # No results available + return(NULL) + } + } + ) + + # Filter out NULL results + results_list <- results_list[!sapply(results_list, is.null)] + + # Combine results if any exist + if (length(results_list) > 0) { + result <- data.table::rbindlist(results_list, fill = TRUE, use.names = TRUE) + } else { + warning("\u26A0\ufe0f No results found for indicator: ", ph.indicator) + result <- data.table::data.table() + } + + # Remove temporary columns if they exist + cols_to_remove <- intersect(names(result), + c('icd_version_icd9', 'icd_version_icd10', 'hospitalizations_icd9', + 'hospitalizations_icd10', 'icd_version', 'original_year_range')) + if (length(cols_to_remove) > 0) { + result[, (cols_to_remove) := NULL] + } + + # Ensure we have complete data (for all ages withing the specified range) ---- + # This creates a template with all possible combinations to avoid gaps when needing to age standardize the results + template <- unique(result[, .(tab, + cat1, cat1_varname, cat1_group, + cat2, cat2_varname, cat2_group, + indicator_key = ph.indicator, year)]) + + template <- template[, .(chi_age = seq(age_start, age_end)), by = names(template)] + + result <- merge(template, + result[, chi_age := as.numeric(chi_age)], + by = intersect(names(template), names(result)), + all = TRUE) + + result[is.na(hospitalizations), hospitalizations := 0] + + result <- result[, .(indicator_key, year, chi_age = as.integer(chi_age), hospitalizations, tab, cat1, cat1_varname, cat1_group, cat2, cat2_varname, cat2_group)] + + setorder(result, tab, year, cat1, cat1_varname, cat1_group, cat2, cat2_varname, cat2_group, chi_age, hospitalizations) + + # Identify instructions that caused all data to be filtered out ---- + # this helps diagnose data quality issues, either in myinstructions or in ph.data + if (nrow(result) > 0) { + # Get combinations actually used in results + result_combos <- unique(result[, list(indicator_key, tab, cat1, cat1_varname, cat2, cat2_varname, + start = as.numeric(substr(year, 1, 4)), + end = as.numeric(substr(year, nchar(year) - 3, nchar(year))) )]) + + # Handle the special case for race3_hispanic + result_combos[cat1_varname == 'race3' & cat1 == 'Ethnicity', cat1_varname := 'race3_hispanic'] + + # Use fsetdiff to find instructions that didn't produce results + unused_instructions <- fsetdiff( + setcolorder(instructions, names(result_combos)), + result_combos + ) + + # Sort for warning display + setorder(unused_instructions, indicator_key, tab, cat1, cat1_varname, cat2, cat2_varname, start, end) + } else { + unused_instructions <- copy(instructions) + } + + # Generate warnings and attach to result + if (nrow(unused_instructions) > 0) { + # Capture the formatted output directly + empty_table <- paste( + capture.output( + print(unused_instructions, + row.names = FALSE, + class = FALSE, + printClassOfColumns = FALSE) + ), + collapse = "\n" + ) + + warning(paste0( + "\n\u26A0\ufe0f No data found for the following ", nrow(unused_instructions), + " instruction(s) for indicator '", ph.indicator, "':\n", + empty_table + )) + + } + + # Return data.table ---- + return(result) +} diff --git a/R/chi_chars_injury.R b/R/chi_chars_injury.R new file mode 100644 index 0000000..9c768eb --- /dev/null +++ b/R/chi_chars_injury.R @@ -0,0 +1,341 @@ +# chi_chars_injury ---- +#' Generate CHI CHARS counts for injury indicators +#' +#' @description +#' Generate hospitalization counts from Comprehensive Hospital Abstract Reporting +#' System (CHARS) data for injury indicators including falls, poisoning, +#' self-harm, and other injury mechanisms for CHI. This function processes +#' instructions for specific indicators and summarizes CHARS data accordingly. +#' +#' @param ph.indicator A character string of length 1. The indicator key to process, +#' which must exist in the chars.defs data table. +#' @param ph.data A data.table containing the CHARS data to be processed. +#' @param myinstructions A data.table containing processing instructions for each indicator. +#' Default is the output from \code{\link{chi_generate_tro_shell}}. +#' @param chars.defs A data.table containing definitions for each indicator. It +#' should have the following columns: `indicator_name`, `indicator_key`, `intent`, +#' `mechanism`, `age_start`, and `age_end`. +#' @param def A character string indicating which injury definition to use. +#' Default is \code{def = 'narrow'}. +#' +#' @return A data.table containing the summarized CHARS injury hospitalization data for the +#' specified indicator, stratified by the requested demographic variables. +#' +#' @details +#' This function processes instructions for a specific indicator sequentially. +#' +#' Note that injury data is only available for 2012 and later years. Unlike chi_chars_ccs(), +#' this function doesn't need to handle the ICD-9 to ICD-10 transition explicitly as the +#' mechanism and intent columns have been standardized across coding systems. +#' +#' @examples +#' \dontrun{ +#' # Example of how to run with future_lapply for memory efficiency +#' library(future) +#' library(future.apply) +#' +#' plan(multisession, workers = future::availableCores() - 1) +#' +#' countsINJURY <- rbindlist(future_lapply(VectorOfIndicators, function(indicator) { +#' chi_chars_injury( +#' ph.indicator = indicator, +#' ph.data = chars, +#' myinstructions = myinstructions, +#' chars.defs = chars.defs, +#' def = 'narrow') +#' }, future.seed = TRUE)) +#' +#' plan(sequential) +#' +#' } +#' +#' @seealso +#' \code{\link[rads]{get_data_chars}}, which provides creates ph.data +#' +#' \code{\link[rads]{chars_injury_matrix_count}}, which is the engine used by this function +#' +#' \code{\link{chi_generate_tro_shell}}, which creates myinstructions +#' +#' @import data.table +#' @import rads +#' @export +chi_chars_injury <- function(ph.indicator = NA, + ph.data = NULL, + myinstructions = NULL, + chars.defs = NULL, + def = 'narrow') { + # Input validation ----- + if (is.na(ph.indicator)) { + stop("\n\U1F6D1 ph.indicator must be provided.") + } + + for (arg_name in c("ph.data", "myinstructions", "chars.defs")) { + arg_value <- get(arg_name) + if (is.null(arg_value)) { + stop(paste0("\n\U1F6D1 ", arg_name, " must be specified.")) + } + if (is.data.frame(arg_value)) { + setDT(arg_value) + } else if (!is.data.frame(arg_value)) { + stop(paste0("\n\U1F6D1 ", arg_name, " must be a data.table or data.frame.")) + } + } + + # Ensure indicator exists + instructions <- myinstructions[indicator_key == ph.indicator] + + if (nrow(instructions) == 0) { + stop(paste0("\n\U1F6D1 ph.indicator '", ph.indicator, "' not found in myinstructions.")) + } + + if (!ph.indicator %in% chars.defs$indicator_key) { + stop(paste0("\n\U1F6D1 ph.indicator '", ph.indicator, "' not found in chars.defs.")) + } + + + # Ensure cat1_varname and cat2_varname exist in ph.data + unique_cat_vars <- unique(c( + instructions$cat1_varname[!is.na(instructions$cat1_varname)], + instructions$cat2_varname[!is.na(instructions$cat2_varname)] + )) + + missing_cols <- unique_cat_vars[!unique_cat_vars %in% names(ph.data)] + if (length(missing_cols) > 0) { + stop(paste0("\n\U1F47F The following columns referenced in instructions don't exist in ph.data: ", + paste(missing_cols, collapse = ", "))) + } + + # Extract indicator parameters + indicator_def <- chars.defs[indicator_key == ph.indicator] + age_start <- indicator_def$age_start + age_end <- indicator_def$age_end + if (!is.numeric(age_start) || !is.numeric(age_end)) { + stop("\n\U1F6D1 age_start and age_end must be numeric.") + } + + # Verify that this is an injury indicator (those should use chi_chars_injury) + has_mechanism <- "mechanism" %in% names(indicator_def) && !is.na(indicator_def$mechanism) + has_intent <- "intent" %in% names(indicator_def) && !is.na(indicator_def$intent) + + if (!has_mechanism && !has_intent) { + stop(paste0("\n\U1F6D1 Indicator '", ph.indicator, "' does not appear to be an injury indicator. ", + "You must use chi_chars_ccs() instead.")) + } + + # Check for required parameters + if (!all(c("intent", "mechanism") %in% names(indicator_def))) { + stop("\n\U1F6D1 chars.defs must contain columns 'intent' and 'mechanism' for injury indicators.") + } + + # Validate injury-specific parameters + if (!is.character(def) || length(def) != 1 || !def %in% c('narrow', 'broad')) { + stop("\n\U1F6D1 'def' must be either 'narrow' or 'broad'.") + } + + # Process sequentially ---- + message('\U023F3 Processing calculations... this may take a while') + + # Process each row + results_list <- lapply( + X = seq_len(nrow(instructions)), + FUN = function(i) { + # Progress meter + current_row <- instructions[i] + row_info <- paste( + ph.indicator, + current_row$tab, + current_row$cat1, + current_row$cat1_varname, + current_row$cat2, + current_row$cat2_varname, + current_row$end, + current_row$start, + sep = "|" + ) + message(paste0("myinstructions row ", i, "/", nrow(instructions), ": \n - ", row_info)) + + # Get current instruction parameters + current_start <- as.integer(current_row$start) + current_end <- as.integer(current_row$end) + + # For injury data, check if the year range is valid (only available from 2012+) + if (current_start < 2012) { + message(paste0(" - \u26A0\ufe0f Warning: Injury data only available from 2012. Adjusting start year from ", + current_start, " to 2012.")) + current_start <- 2012 + } + + # If start year is now after end year, skip this row + if (current_start > current_end) { + message(paste0(" - \u26A0\ufe0f Skipping: After year adjustment, start year ", current_start, + " is after end year ", current_end)) + return(NULL) + } + + # Determine geographical filter based on tab + is_wastate <- current_row$tab == "_wastate" + + # Filter data for appropriate years, geography, and age + if (is_wastate) { + data_subset <- data.table::copy(ph.data[wastate == 'Washington State' & + chi_year >= current_start & + chi_year <= current_end & + chi_age >= age_start & + chi_age <= age_end]) + } else { + data_subset <- data.table::copy(ph.data[chi_geo_kc == 'King County' & + chi_year >= current_start & + chi_year <= current_end & + chi_age >= age_start & + chi_age <= age_end]) + } + + # Return NULL if no data after filtering + if (nrow(data_subset) == 0) { + return(NULL) + } + + # Add year information + data_subset[, year_range := paste0(current_start, "-", current_end)] + + # Generate group_by argument + if (is.na(current_row$cat2_varname)) { + group_by_arg <- current_row$cat1_varname + } else { + group_by_arg <- c(current_row$cat1_varname, current_row$cat2_varname) + } + group_by_arg <- c('chi_age', unique(group_by_arg)) + + # Process data with rads::chars_injury_matrix_count + result <- rads::chars_injury_matrix_count( + ph.data = data_subset, + intent = indicator_def$intent, + mechanism = indicator_def$mechanism, + group_by = group_by_arg, + def = def, + kingco = FALSE # Already filtered data above + ) + + # Return NULL if no results + if (nrow(result) == 0) { + return(NULL) + } + + # For ICD10cm, poisoning is split into drug & non-drug, collapse them if needed + if (!is.null(indicator_def$mechanism) && !is.na(indicator_def$mechanism) && + indicator_def$mechanism == 'poisoning') { + result <- result[, .(mechanism = 'poisoning', hospitalizations = sum(hospitalizations)), + by = setdiff(names(result), c('mechanism', 'hospitalizations'))] + } + + # Remove mechanism and intent columns + if ("mechanism" %in% names(result)) { + result[, mechanism := NULL] + } + if ("intent" %in% names(result)) { + result[, intent := NULL] + } + + # Format the results + # Add metadata columns + result[, year := paste0(current_start, "-", current_end)] + result[, tab := current_row$tab] + result[, cat1 := current_row$cat1] + result[, cat2 := current_row$cat2] + result[, cat1_varname := gsub("_hispanic", "", current_row$cat1_varname)] + result[, cat2_varname := gsub("_hispanic", "", current_row$cat2_varname)] + + # Rename category columns + data.table::setnames(result, current_row$cat1_varname, "cat1_group") + + if (!is.na(current_row$cat2_varname)) { + data.table::setnames(result, current_row$cat2_varname, "cat2_group", skip_absent = TRUE) + } + + # Filter and format + result <- result[!is.na(cat1_group)] # Drop when group not identified + result[cat2_varname == 'chi_geo_kc', cat2_group := 'King County'] + result <- result[is.na(cat2_varname) | (!is.na(cat2_varname) & !is.na(cat2_group))] + result[, ph.indicator := ph.indicator] + + return(result) + } + ) + + # Filter out NULL results + results_list <- results_list[!sapply(results_list, is.null)] + + # Combine results if any exist + if (length(results_list) > 0) { + result <- data.table::rbindlist(results_list, fill = TRUE, use.names = TRUE) + } else { + warning("\u26A0\ufe0f No results found for indicator: ", ph.indicator) + result <- data.table::data.table() + } + + # Ensure we have complete data (for all ages within the specified range) ---- + # This creates a template with all possible combinations to avoid gaps when needing to age standardize the results + template <- unique(result[, .(tab, + cat1, cat1_varname, cat1_group, + cat2, cat2_varname, cat2_group, + indicator_key = ph.indicator, year)]) + + template <- template[, .(chi_age = seq(age_start, age_end)), by = names(template)] + + result <- merge(template, + result[, chi_age := as.numeric(chi_age)], + by = intersect(names(template), names(result)), + all = TRUE) + + result[is.na(hospitalizations), hospitalizations := 0] + + result <- result[, .(indicator_key, year, chi_age = as.integer(chi_age), hospitalizations, tab, cat1, cat1_varname, cat1_group, cat2, cat2_varname, cat2_group)] + + setorder(result, tab, year, cat1, cat1_varname, cat1_group, cat2, cat2_varname, cat2_group, chi_age, hospitalizations) + + # Identify instructions that caused all data to be filtered out ---- + # this helps diagnose data quality issues, either in myinstructions or in ph.data + if (nrow(result) > 0) { + # Get combinations actually used in results + result_combos <- unique(result[, list(indicator_key, tab, cat1, cat1_varname, cat2, cat2_varname, + start = as.numeric(substr(year, 1, 4)), + end = as.numeric(substr(year, nchar(year) - 3, nchar(year))) )]) + + # Handle the special case for race3_hispanic + result_combos[cat1_varname == 'race3' & cat1 == 'Ethnicity', cat1_varname := 'race3_hispanic'] + + # Use fsetdiff to find instructions that didn't produce results + unused_instructions <- fsetdiff( + setcolorder(instructions[, start := max(2012, start)], names(result_combos)), + result_combos + ) + + # Sort for warning display + setorder(unused_instructions, indicator_key, tab, cat1, cat1_varname, cat2, cat2_varname, start, end) + } else { + unused_instructions <- copy(instructions) + } + + # Generate warnings and attach to result + if (nrow(unused_instructions) > 0) { + # Capture the formatted output directly + empty_table <- paste( + capture.output( + print(unused_instructions, + row.names = FALSE, + class = FALSE, + printClassOfColumns = FALSE) + ), + collapse = "\n" + ) + + warning(paste0( + "\n\u26A0\ufe0f No data found for the following ", nrow(unused_instructions), + " instruction(s) for indicator '", ph.indicator, "':\n", + empty_table + )) + } + + # Return data.table ---- + return(result) +} diff --git a/man/chi_chars_ccs.Rd b/man/chi_chars_ccs.Rd new file mode 100644 index 0000000..21095f7 --- /dev/null +++ b/man/chi_chars_ccs.Rd @@ -0,0 +1,72 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/chi_chars_ccs.R +\name{chi_chars_ccs} +\alias{chi_chars_ccs} +\title{Generate CHI CHARS counts by CCS category} +\usage{ +chi_chars_ccs( + ph.indicator = NA, + ph.data = NULL, + myinstructions = NULL, + chars.defs = NULL +) +} +\arguments{ +\item{ph.indicator}{A character string of length 1. The indicator key to process, +which must exist in the chars.defs data table.} + +\item{ph.data}{A data.table containing the CHARS data to be processed.} + +\item{myinstructions}{A data.table containing processing instructions for each indicator. +Default is the output from chi_generate_tro_shell().} + +\item{chars.defs}{A data.table containing definitions for each indicator. It +should have the following columns: `indicator_name`, `indicator_key`, `intent`, +`mechanism`, `superlevel`, `broad`, `midlevel`, `detailed`, `age_start`, and +`age_end`.} +} +\value{ +A data.table containing the summarized CHARS hospitalization data for the + specified indicator, stratified by the requested demographic variables. +} +\description{ +Generate hospitalization counts from Comprehensive Hospital Abstract Reporting +System (CHARS) data based on Clinical Classification Software (CCS) categories +for CHI. This function processes instructions for specific indicators and +summarizes CHARS data accordingly. It automatically handles the ICD-9 to ICD-10 +transition that occurred in 2016. +} +\details{ +The function processes multiple instructions for the same indicator sequentially. + +The function automatically detects whether data spans the ICD-9 to ICD-10 transition in 2016 +and processes each part with the appropriate ICD version. Results are then combined seamlessly. +} +\examples{ +\dontrun{ +# Example of how to run with future_lapply for memory efficiency +library(future) +library(future.apply) + +plan(multisession, workers = future::availableCores() - 1) + +countsCCS <- rbindlist(future_lapply(VectorOfIndicators, function(indicator) { + chi_chars_ccs( + ph.indicator = indicator, + ph.data = chars, + myinstructions = myinstructions, + chars.defs = chars.defs) +}, future.seed = TRUE)) + +plan(sequential) + +} + +} +\seealso{ +\code{\link[rads]{get_data_chars}}, which provides creates ph.data + +\code{\link[rads]{chars_icd_ccs_count}}, which is the engine used by this function + +\code{\link{chi_generate_tro_shell}}, which creates myinstructions +} diff --git a/man/chi_chars_injury.Rd b/man/chi_chars_injury.Rd new file mode 100644 index 0000000..a0a2fed --- /dev/null +++ b/man/chi_chars_injury.Rd @@ -0,0 +1,76 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/chi_chars_injury.R +\name{chi_chars_injury} +\alias{chi_chars_injury} +\title{Generate CHI CHARS counts for injury indicators} +\usage{ +chi_chars_injury( + ph.indicator = NA, + ph.data = NULL, + myinstructions = NULL, + chars.defs = NULL, + def = "narrow" +) +} +\arguments{ +\item{ph.indicator}{A character string of length 1. The indicator key to process, +which must exist in the chars.defs data table.} + +\item{ph.data}{A data.table containing the CHARS data to be processed.} + +\item{myinstructions}{A data.table containing processing instructions for each indicator. +Default is the output from \code{\link{chi_generate_tro_shell}}.} + +\item{chars.defs}{A data.table containing definitions for each indicator. It +should have the following columns: `indicator_name`, `indicator_key`, `intent`, +`mechanism`, `age_start`, and `age_end`.} + +\item{def}{A character string indicating which injury definition to use. +Default is \code{def = 'narrow'}.} +} +\value{ +A data.table containing the summarized CHARS injury hospitalization data for the + specified indicator, stratified by the requested demographic variables. +} +\description{ +Generate hospitalization counts from Comprehensive Hospital Abstract Reporting +System (CHARS) data for injury indicators including falls, poisoning, +self-harm, and other injury mechanisms for CHI. This function processes +instructions for specific indicators and summarizes CHARS data accordingly. +} +\details{ +This function processes instructions for a specific indicator sequentially. + +Note that injury data is only available for 2012 and later years. Unlike chi_chars_ccs(), +this function doesn't need to handle the ICD-9 to ICD-10 transition explicitly as the +mechanism and intent columns have been standardized across coding systems. +} +\examples{ +\dontrun{ +# Example of how to run with future_lapply for memory efficiency +library(future) +library(future.apply) + +plan(multisession, workers = future::availableCores() - 1) + +countsINJURY <- rbindlist(future_lapply(VectorOfIndicators, function(indicator) { + chi_chars_injury( + ph.indicator = indicator, + ph.data = chars, + myinstructions = myinstructions, + chars.defs = chars.defs, + def = 'narrow') +}, future.seed = TRUE)) + +plan(sequential) + +} + +} +\seealso{ +\code{\link[rads]{get_data_chars}}, which provides creates ph.data + +\code{\link[rads]{chars_injury_matrix_count}}, which is the engine used by this function + +\code{\link{chi_generate_tro_shell}}, which creates myinstructions +} diff --git a/tests/testthat/test-chi_chars_ccs.R b/tests/testthat/test-chi_chars_ccs.R new file mode 100644 index 0000000..3ea6aee --- /dev/null +++ b/tests/testthat/test-chi_chars_ccs.R @@ -0,0 +1,387 @@ +library(testthat) +library(data.table) +library(rads) + +# Create mock_chars data ---- + set.seed(98104) + + # ICD codes for asthma and non-asthma conditions + icd9_asthma <- c("49300", "49301", "49302", "49310", "49311", "49312", "49320", + "49321", "49322", "49380", "49381", "49382", "49390", "49391", "49392") + + icd10_asthma <- c("J45", "J452", "J4520", "J4521", "J4522", "J453", "J4530", "J4531", + "J4532", "J454", "J4540", "J4541", "J4542", "J455", "J4550", "J4551", + "J4552", "J459", "J4590", "J45901", "J45902", "J45909", "J4599", + "J45990", "J45991", "J45998") + + icd9_non_asthma <- c("E9855", "36190", "V1508", "76425", "36642", "66061", "V7040", + "V7710", "V4530", "V0660", "76830", "85406") + + icd10_non_asthma <- c("S80241D", "S52134B", "S62256S", "S60922S", "V629", "K9420", + "S59241G", "S52042R", "J8417", "S83136D", "M06851", "T363X3D") + + # Create 1000 sample CHARS records + n_samples <- 1000 + half_samples <- n_samples / 2 + + # Create ICD-9 era data (pre-2016) + icd9_data <- data.table( + seq_no = 1:half_samples, # Unique identifier + chi_year = sample(2013:2015, half_samples, replace = TRUE), + chi_age = sample(0:99, half_samples, replace = TRUE), + chi_geo_kc = rep("King County", half_samples), + wastate = rep("Washington State", half_samples), + race3_hispanic = sample(c("Hispanic", "Non-Hispanic"), half_samples, replace = TRUE), + chi_geo_region = sample(c("Seattle", "South", "East", "North"), half_samples, replace = TRUE), + chi_sex = sample(c("Female", "Male"), half_samples, replace = TRUE) + ) + + # Ensure at least 100 asthma cases + asthma_cases <- 100 + non_asthma_cases <- half_samples - asthma_cases + + icd9_data[1:asthma_cases, diag1 := sample(icd9_asthma, asthma_cases, replace = TRUE)] + icd9_data[(asthma_cases+1):half_samples, diag1 := sample(icd9_non_asthma, non_asthma_cases, replace = TRUE)] + + # Create ICD-10 era data (2016 and after) + icd10_data <- data.table( + seq_no = (half_samples+1):n_samples, # Continue sequence numbers + chi_year = sample(2016:2022, half_samples, replace = TRUE), + chi_age = sample(0:99, half_samples, replace = TRUE), + chi_geo_kc = rep("King County", half_samples), + wastate = rep("Washington State", half_samples), + race3_hispanic = sample(c("Hispanic", "Non-Hispanic"), half_samples, replace = TRUE), + chi_geo_region = sample(c("Seattle", "South", "East", "North"), half_samples, replace = TRUE), + chi_sex = sample(c("Female", "Male"), half_samples, replace = TRUE) + ) + + # Ensure at least 100 asthma cases + icd10_data[1:asthma_cases, diag1 := sample(icd10_asthma, asthma_cases, replace = TRUE)] + icd10_data[(asthma_cases+1):half_samples, diag1 := sample(icd10_non_asthma, non_asthma_cases, replace = TRUE)] + + # Combine the data + mock_chars <- rbindlist(list(icd9_data, icd10_data)) + +# Create mock_instructions ---- + mock_instructions <- data.table( + indicator_key = rep(c("hos1803000_v1", "hos1803000_v2"), 3), + tab = rep(c("trends", "_wastate"), each = 3), + cat1 = rep("Ethnicity", 6), + cat1_varname = rep("race3_hispanic", 6), + cat2 = c(NA_character_, NA_character_, "Sex", "Sex", NA_character_, NA_character_), + cat2_varname = c(NA_character_, NA_character_, "chi_sex", "chi_sex", NA_character_, NA_character_), + end = c(2017, 2022, 2017, 2022, 2015, 2015), + start = c(2013, 2018, 2014, 2016, 2013, 2013) + ) + +# Create mock_chars_def ---- + mock_chars_def <- data.table( + indicator_name = c("Asthma hospitalizations (all ages)", "Asthma hospitalizations (children)"), + indicator_key = c("hos1803000_v1", "hos1803000_v2"), + intent = c(NA_character_, NA_character_), + mechanism = c(NA_character_, NA_character_), + superlevel = c(NA_character_, NA_character_), + broad = c("RESP", NA_character_), + midlevel = c(NA_character_, "Asthma"), + detailed = c(NA_character_, NA_character_), + age_start = c(0, 0), + age_end = c(120, 17) + ) + +# Create function for mock ccs_table ---- + # Function to create mock CCS reference tables + create_mock_ccs_table <- function(icdcm_version) { + if (icdcm_version == 9) { + icd_codes <- c("49300", "49301", "49302", "49310", "49311", "49312", "49320", + "49321", "49322", "49380", "49381", "49382", "49390", "49391", "49392", + "E9855", "36190", "V1508", "76425", "36642", "66061", "V7040", + "V7710", "V4530", "V0660", "76830", "85406") + + # Ensure all asthma codes are properly classified + asthma_codes <- c("49300", "49301", "49302", "49310", "49311", "49312", "49320", + "49321", "49322", "49380", "49381", "49382", "49390", "49391", "49392") + } else { # ICD-10 + icd_codes <- c("J45", "J452", "J4520", "J4521", "J4522", "J453", "J4530", "J4531", + "J4532", "J454", "J4540", "J4541", "J4542", "J455", "J4550", "J4551", + "J4552", "J459", "J4590", "J45901", "J45902", "J45909", "J4599", + "J45990", "J45991", "J45998", + "S80241D", "S52134B", "S62256S", "S60922S", "V629", "K9420", + "S59241G", "S52042R", "J8417", "S83136D", "M06851", "T363X3D") + + # Ensure all asthma codes are properly classified + asthma_codes <- c("J45", "J452", "J4520", "J4521", "J4522", "J453", "J4530", "J4531", + "J4532", "J454", "J4540", "J4541", "J4542", "J455", "J4550", "J4551", + "J4552", "J459", "J4590", "J45901", "J45902", "J45909", "J4599", + "J45990", "J45991", "J45998") + } + + # Create the data table + ccs_table <- data.table( + icdcm_code = icd_codes, + icdcm = paste0("Description for ", icd_codes), + superlevel = NA_character_, + broad = NA_character_, + midlevel = NA_character_, + detailed = NA_character_, + icdcm_version = icdcm_version + ) + + # Assign RESP to all asthma codes + ccs_table[icdcm_code %in% asthma_codes, broad := "RESP"] + + # Assign Asthma to all asthma codes + ccs_table[icdcm_code %in% asthma_codes, midlevel := "Asthma"] + + return(ccs_table) + } + +# Create vector of expected column order ---- + expectedCols <- c('indicator_key', 'year', 'chi_age', 'hospitalizations', 'tab', 'cat1', 'cat1_varname', 'cat1_group', 'cat2', 'cat2_varname', 'cat2_group') + +# Test validation ---- + test_that("chi_chars_ccs validates inputs correctly", { + # Test missing ph.indicator + expect_error(chi_chars_ccs(ph.indicator = NA, + ph.data = mock_chars, + myinstructions = mock_instructions, + chars.defs = mock_chars_def), + "ph.indicator must be provided") + + # Test missing ph.data + expect_error(chi_chars_ccs(ph.indicator = "hos1803000_v1", + ph.data = NULL, + myinstructions = mock_instructions, + chars.defs = mock_chars_def), + "ph.data must be specified") + + # Test indicator not found in instructions + expect_error(chi_chars_ccs(ph.indicator = "not_an_indicator", + ph.data = mock_chars, + myinstructions = mock_instructions, + chars.defs = mock_chars_def), + "not found in myinstructions") + + # Test invalid column in instructions + bad_instructions <- copy(mock_instructions) + bad_instructions[1, cat1_varname := "not_a_column"] + + expect_error(chi_chars_ccs(ph.indicator = "hos1803000_v1", + ph.data = mock_chars, + myinstructions = bad_instructions, + chars.defs = mock_chars_def), + "don't exist in ph.data") + }) + +# Test function handles ICD-9 data correctly ---- + test_that("chi_chars_ccs processes ICD-9 data correctly", { + # Filter instructions to only include pre-2016 data + icd9_instructions <- mock_instructions[end < 2016] + + # Run function + result <- chi_chars_ccs( + ph.indicator = "hos1803000_v1", + ph.data = mock_chars, + myinstructions = icd9_instructions, + chars.defs = mock_chars_def + ) + + # Check if result has expected structure + expect_true(is.data.table(result)) + expect_equal(names(result), expectedCols) + + # Check that all data is from ICD-9 era (pre-2016) + expect_true(all(grepl("201[3-5]", result$year))) + + # Check hospitalization count is as expected + expect_equal(sum(result$hospitalizations), 100) + + # Check age filtering worked correctly + expect_gt(max(as.numeric(result$chi_age)), 50) # is for all ages, so should definitely have some > 50 yrs old + + # Check correct indicator is used + expect_true(all(result$indicator_key == "hos1803000_v1")) + + # Check if have the same number of observations for each age + expect_equal(uniqueN(result[, .N, chi_age]$N), 1) + }) + +# Test function handles ICD-10 data correctly ---- + test_that("chi_chars_ccs processes ICD-10 data correctly", { + # Filter instructions to only include post-2016 data + icd10_instructions <- mock_instructions[start >= 2016] + + # Run function + result <- chi_chars_ccs( + ph.indicator = "hos1803000_v2", + ph.data = mock_chars, + myinstructions = icd10_instructions, + chars.defs = mock_chars_def + ) + + # Check if result has expected structure + expect_true(is.data.table(result)) + expect_equal(names(result), expectedCols) + + # Check that all data is from ICD-10 era (2016+) + expect_true(all(grepl("201[6-9]|202[0-2]", result$year))) + + # Check hospitalization count is as expected + expect_lt(sum(result$hospitalizations), 100) # All ICD10 should be 100, so limiting to up to 17 should be less + + # Check age filtering worked correctly + expect_equal(max(as.numeric(result$chi_age)), 17) # for children, so max is 17 yrs old + + # Check correct indicator is used + expect_true(all(result$indicator_key == "hos1803000_v2")) + + # Check if have the same number of observations for each age + expect_equal(uniqueN(result[, .N, chi_age]$N), 1) + }) + +# Test function handles mixed ICD-9/ICD-10 data correctly ---- + test_that("chi_chars_ccs processes mixed ICD-9/ICD-10 data correctly", { + # Filter data to include instructions that span the 2016 transition + mixed_instructions <- mock_instructions[start < 2016 & end >= 2016] + + # Run function + result <- chi_chars_ccs( + ph.indicator = "hos1803000_v1", + ph.data = mock_chars, + myinstructions = mixed_instructions, + chars.defs = mock_chars_def + ) + + # Check if result has expected structure + expect_true(is.data.table(result)) + expect_equal(names(result), expectedCols) + + # Verify that year ranges are formatted correctly + expect_true(all(grepl("^\\d{4}-\\d{4}$", result$year))) + + # Verify that the range spans the transition + expect_true(any(grepl("201[3-5]-201[6-9]", result$year))) + + # Verify we have all cat1_group values + expect_equal(unique(c("Hispanic", "Non-Hispanic")), unique(result$cat1_group) ) + + # Verify we have the proper year spans + expect_equal(sort(unique(result$year)), + sort(unique(mixed_instructions[, year := paste0(start, '-', end)]$year))) + + # Check if have the same number of observations for each age + expect_equal(uniqueN(result[, .N, chi_age]$N), 1) + }) + +# Test that instructions with different indicator variables work ---- + test_that("chi_chars_ccs handles different indicator variables correctly", { + # Instructions with two category variables + cat2_instructions <- mock_instructions[!is.na(cat2_varname)] + + # Run function + result <- rbindlist(lapply(c("hos1803000_v1", "hos1803000_v2"), function(indicator) { + chi_chars_ccs( + ph.indicator = indicator, + ph.data = mock_chars, + myinstructions = cat2_instructions, + chars.defs = mock_chars_def) + }), fill = TRUE) + + # Check if result has expected structure + expect_true(is.data.table(result)) + expect_equal(names(result), expectedCols) + + # Check that both category values are correctly filled + expect_true(all(!is.na(result$cat1_group))) + expect_true(all(!is.na(result$cat2_group))) + + # Check all ph.indicator (indicator_keys) exist + expect_equal(sort(unique(cat2_instructions$indicator_key)), + sort(unique(result$indicator_key))) + + # Check cat1_group and cat2_group are all there + expect_equal( + setorder(unique(result[, .(cat1_group, cat2_group)]), cat1_group, cat2_group), + data.table(cat1_group = rep(c("Hispanic", "Non-Hispanic"), each = 2), cat2_group = rep(c("Female", "Male"), times = 2)) + ) + + # Check if have the same number of observations for each age (within an indicator) + expect_equal(uniqueN(result[indicator_key == "hos1803000_v1", .N, chi_age]$N), 1) + expect_equal(uniqueN(result[indicator_key == "hos1803000_v2", .N, chi_age]$N), 1) + }) + +# Test that WA state filtering works ---- + test_that("chi_chars_ccs handles WA state filtering correctly", { + # Instructions for WA state + wa_instructions <- mock_instructions[tab == "_wastate"] + + # Run function + result <- chi_chars_ccs( + ph.indicator = "hos1803000_v1", + ph.data = mock_chars, + myinstructions = wa_instructions, + chars.defs = mock_chars_def + ) + + # Check if result has expected structure + expect_true(is.data.table(result)) + expect_equal(names(result), expectedCols) + + # Check that all rows are for WA state + expect_true(all(result$tab == "_wastate")) + + # Check if have the same number of observations for each age + expect_equal(uniqueN(result[, .N, chi_age]$N), 1) + }) + +# Test child-specific age filtering ---- + test_that("chi_chars_ccs handles age filtering correctly", { + # Test children-only indicator (hos1803000_v2 has age_end = 17) + result <- chi_chars_ccs( + ph.indicator = "hos1803000_v2", + ph.data = mock_chars, + myinstructions = mock_instructions[indicator_key == "hos1803000_v2"], + chars.defs = mock_chars_def + ) + + # Check if result has expected structure + expect_true(is.data.table(result)) + expect_equal(names(result), expectedCols) + + # check correct age range + expect_true(all(result$chi_age %in% 0:17)) + + # Check if have the same number of observations for each age + expect_equal(uniqueN(result[, .N, chi_age]$N), 1) + }) + +# Test what happens when instructions filter out all rows---- + test_that("When some instructions filter out all rows, expect it to work but returning warnings", { + warnings <- capture_warnings({ + result <- rbindlist(lapply(c("hos1803000_v1", "hos1803000_v2"), function(indicator) { + chi_chars_ccs( + ph.indicator = indicator, + ph.data = copy(mock_chars)[, chi_geo_kc := 'KC'], + myinstructions = mock_instructions, + chars.defs = mock_chars_def) + }), fill = TRUE) + }) + + # Check that we got exactly 2 warnings + expect_length(warnings, 2) + + # Check content of first warning + expect_match(warnings[1], + "No data found for the following 2 instruction\\(s\\) for indicator 'hos1803000_v1'") + + # Check content of second warning + expect_match(warnings[2], + "No data found for the following 1 instruction\\(s\\) for indicator 'hos1803000_v2'") + + expect_equal( # expect three of the instructions 'worked' because _wastate was not corrupted + 3, + uniqueN(result[, list(indicator_key, tab, cat1, cat1_varname, cat2, cat2_varname, + start = as.numeric(substr(year, 1, 4)), + end = as.numeric(substr(year, nchar(year) - 3, nchar(year))) )]) + ) + + }) diff --git a/tests/testthat/test-chi_chars_injury.R b/tests/testthat/test-chi_chars_injury.R new file mode 100644 index 0000000..6f51b70 --- /dev/null +++ b/tests/testthat/test-chi_chars_injury.R @@ -0,0 +1,403 @@ +library(testthat) +library(data.table) +library(rads) + +# Create mock_chars data for injury ---- +set.seed(98104) + +# Create 1000 sample CHARS records +n_samples <- 1000 + +# Define all injury mechanisms and intents as needed by chars_injury_matrix_count +mechanisms <- c("any", "bites_stings", "cut_pierce", "drowning", "fall", "fire", + "fire_burn", "firearm", "machinery", "motor_vehicle_nontraffic", + "mvt_motorcyclist", "mvt_occupant", "mvt_other", "mvt_pedal_cyclist", + "mvt_pedestrian", "mvt_traffic", "mvt_unspecified", "natural", + "natural_environmental", "other", "other_land_transport", "other_spec", + "other_specified", "other_transport", "overexertion", "pedal_cyclist", + "pedestrian", "poisoning", "poisoning_drug", "poisoning_nondrug", "struck", + "struck_by_against", "suffocation", "transport_other", "unspecified") + +intents <- c("any", "unintentional", "intentional", "assault", "legal", "undetermined") + +# Initialize the data.table with demographic columns +mock_chars <- data.table( + seq_no = 1:n_samples, # Unique identifier + injury_nature_narrow = sample(c(TRUE, FALSE), n_samples, replace = TRUE, prob = c(0.8, 0.2)), + injury_nature_broad = NA, + chi_year = sample(2012:2022, n_samples, replace = TRUE), + chi_age = sample(0:99, n_samples, replace = TRUE), + chi_geo_kc = rep("King County", n_samples), + wastate = rep("Washington State", n_samples), + race3_hispanic = sample(c("Hispanic", "Non-Hispanic"), n_samples, replace = TRUE), + chi_geo_region = sample(c("Seattle", "South", "East", "North"), n_samples, replace = TRUE), + chi_sex = sample(c("Female", "Male"), n_samples, replace = TRUE) +) +mock_chars[, injury_nature_broad := !injury_nature_narrow] # Define broad based on narrow + + +# Add mechanism columns (all 0 initially) +for (mech in mechanisms) { + mock_chars[, paste0("mechanism_", mech) := 0] +} + +# Add intent columns (all 0 initially) +for (int in intents) { + mock_chars[, paste0("intent_", int) := 0] +} + +# First, set our fixed test cases +# Set exactly 100 fall injuries with unintentional intent (rows 1-100) +mock_chars[1:100, `:=`( + mechanism_fall = 1, + intent_unintentional = 1, + injury_intent = "unintentional" +)] + +# Set exactly 50 poisoning self-harm cases (rows 101-150) +mock_chars[101:150, `:=`( + mechanism_poisoning = 1, + intent_intentional = 1, + injury_intent = "intentional" +)] + +# Now randomly assign mechanism and intent to the remaining records (151-1000) +remaining_rows <- 151:n_samples +for (i in remaining_rows) { + # Select random mechanism and intent + mech <- sample(mechanisms, 1) + int <- sample(intents, 1) + + # Set the selected mechanism and intent to 1 + mock_chars[i, paste0("mechanism_", mech) := 1] + mock_chars[i, paste0("intent_", int) := 1] + + # Set the injury_intent string value based on which intent_* column is 1 + # Skip "any" since it's not a real intent category for injury_intent + if (int != "any") { + mock_chars[i, injury_intent := int] + } else { + # If "any" was selected, choose one of the real intents + real_int <- sample(setdiff(intents, "any"), 1) + mock_chars[i, injury_intent := real_int] + mock_chars[i, paste0("intent_", real_int) := 1] + } +} + +# Create mock_instructions ---- +mock_instructions <- data.table( + indicator_key = rep(c("hos1901000_v1", "hos1901000_v2"), 3), + tab = rep(c("trends", "_wastate"), each = 3), + cat1 = rep("Ethnicity", 6), + cat1_varname = rep("race3_hispanic", 6), + cat2 = c(NA_character_, NA_character_, "Sex", "Sex", NA_character_, NA_character_), + cat2_varname = c(NA_character_, NA_character_, "chi_sex", "chi_sex", NA_character_, NA_character_), + end = c(2017, 2022, 2017, 2022, 2015, 2015), + start = c(2012, 2018, 2014, 2016, 2012, 2012) +) + +# Create mock_chars_def ---- +mock_chars_def <- data.table( + indicator_name = c("Fall injuries (all ages)", "Fall injuries (children)"), + indicator_key = c("hos1901000_v1", "hos1901000_v2"), + intent = c("unintentional", "unintentional"), + mechanism = c("fall", "fall"), + age_start = c(0, 0), + age_end = c(120, 17) +) + +# Add poisoning self-harm indicator +mock_chars_def <- rbind(mock_chars_def, data.table( + indicator_name = "Self-harm poisoning", + indicator_key = "hos1902000_v1", + intent = "intentional", # Changed from intentional_self_harm to match intent column names + mechanism = "poisoning", + age_start = 10, + age_end = 120 +)) + +# Add row to instructions for self-harm +mock_instructions <- rbind(mock_instructions, data.table( + indicator_key = "hos1902000_v1", + tab = "trends", + cat1 = "Ethnicity", + cat1_varname = "race3_hispanic", + cat2 = "Sex", + cat2_varname = "chi_sex", + end = 2022, + start = 2012 +)) + +# Create vector of expected column order ---- +expectedCols <- c('indicator_key', 'year', 'chi_age', 'hospitalizations', 'tab', 'cat1', 'cat1_varname', 'cat1_group', 'cat2', 'cat2_varname', 'cat2_group') + +# Test validation ---- +test_that("chi_chars_injury validates inputs correctly", { + # Test missing ph.indicator + expect_error(chi_chars_injury(ph.indicator = NA, + ph.data = mock_chars, + myinstructions = mock_instructions, + chars.defs = mock_chars_def), + "ph.indicator must be provided") + + # Test missing ph.data + expect_error(chi_chars_injury(ph.indicator = "hos1901000_v1", + ph.data = NULL, + myinstructions = mock_instructions, + chars.defs = mock_chars_def), + "ph.data must be specified") + + # Test indicator not found in instructions + expect_error(chi_chars_injury(ph.indicator = "not_an_indicator", + ph.data = mock_chars, + myinstructions = mock_instructions, + chars.defs = mock_chars_def), + "not found in myinstructions") + + # Test invalid column in instructions + bad_instructions <- copy(mock_instructions) + bad_instructions[1, cat1_varname := "not_a_column"] + + expect_error(chi_chars_injury(ph.indicator = "hos1901000_v1", + ph.data = mock_chars, + myinstructions = bad_instructions, + chars.defs = mock_chars_def), + "don't exist in ph.data") + + # Test invalid def parameter + expect_error(chi_chars_injury(ph.indicator = "hos1901000_v1", + ph.data = mock_chars, + myinstructions = mock_instructions, + chars.defs = mock_chars_def, + def = "invalid_def"), + "must be either 'narrow' or 'broad'") +}) + +# Test function processes fall injury data correctly ---- +test_that("chi_chars_injury processes fall injury data correctly", { + # Run function with default parameters + result <- chi_chars_injury( + ph.indicator = "hos1901000_v1", + ph.data = mock_chars, + myinstructions = mock_instructions[indicator_key == "hos1901000_v1"], + chars.defs = mock_chars_def + ) + + # Check if result has expected structure + expect_true(is.data.table(result)) + expect_equal(names(result), expectedCols) + + # Check that data matches the fall/unintentional criteria + expect_true(sum(result$hospitalizations) > 0) + + # Check age filtering worked correctly + expect_gt(max(as.numeric(result$chi_age)), 50) # is for all ages, so should definitely have some > 50 yrs old + + # Check if have consistent number of rows per age + expect_equal(uniqueN(result[, .N, chi_age]$N), 1) +}) + +# Test function handles age filtering correctly ---- +test_that("chi_chars_injury handles age filtering correctly", { + # Test children-only indicator (hos1901000_v2 has age_end = 17) + result <- chi_chars_injury( + ph.indicator = "hos1901000_v2", + ph.data = mock_chars, + myinstructions = mock_instructions[indicator_key == "hos1901000_v2"], + chars.defs = mock_chars_def + ) + + # Check if result has expected structure + expect_true(is.data.table(result)) + expect_equal(names(result), expectedCols) + + # Check correct age range + expect_true(all(result$chi_age %in% 0:17)) + + # Check if have the same number of observations for each age + expect_equal(uniqueN(result[, .N, chi_age]$N), 1) +}) + +# Test function handles different injury types correctly ---- +test_that("chi_chars_injury handles different injury types correctly", { + # Test self-harm poisoning indicator + result <- chi_chars_injury( + ph.indicator = "hos1902000_v1", + ph.data = mock_chars, + myinstructions = mock_instructions[indicator_key == "hos1902000_v1"], + chars.defs = mock_chars_def + ) + + # Check if result has expected structure + expect_true(is.data.table(result)) + expect_equal(names(result), expectedCols) + + # Check that we have counts > 0 + expect_true(sum(result$hospitalizations) > 0) + + # Check age filtering worked correctly (min age should be 10) + expect_equal(min(as.numeric(result$chi_age)), 10) +}) + +# Test def parameter variation ---- +test_that("chi_chars_injury handles 'def' parameter correctly", { + # Test with narrow definition (default) + narrow_result <- chi_chars_injury( + ph.indicator = "hos1901000_v1", + ph.data = mock_chars, + myinstructions = mock_instructions[indicator_key == "hos1901000_v1"][1], + chars.defs = mock_chars_def, + def = "narrow" + ) + + # Test with broad definition + broad_result <- chi_chars_injury( + ph.indicator = "hos1901000_v1", + ph.data = mock_chars, + myinstructions = mock_instructions[indicator_key == "hos1901000_v1"][1], + chars.defs = mock_chars_def, + def = "broad" + ) + + # Both should have valid structure + expect_true(is.data.table(narrow_result)) + expect_true(is.data.table(broad_result)) + + # Results can be the same in our test data, but function should run without error + expect_true(TRUE) +}) + +# Test that WA state filtering works ---- +test_that("chi_chars_injury handles WA state filtering correctly", { + # Instructions for WA state + wa_instructions <- mock_instructions[tab == "_wastate" & indicator_key == "hos1901000_v1"] + + # Run function + result <- chi_chars_injury( + ph.indicator = "hos1901000_v1", + ph.data = mock_chars, + myinstructions = wa_instructions, + chars.defs = mock_chars_def + ) + + # Check if result has expected structure + expect_true(is.data.table(result)) + expect_equal(names(result), expectedCols) + + # Check that all rows are for WA state + expect_true(all(result$tab == "_wastate")) + + # Check if have the same number of observations for each age + expect_equal(uniqueN(result[, .N, chi_age]$N), 1) +}) + +# Test processing multiple instructions ---- +test_that("chi_chars_injury processes multiple instructions correctly", { + # Use multiple instructions for the same indicator + multiple_instructions <- mock_instructions[indicator_key == "hos1901000_v1"] + + # Run function + result <- chi_chars_injury( + ph.indicator = "hos1901000_v1", + ph.data = mock_chars, + myinstructions = multiple_instructions, + chars.defs = mock_chars_def + ) + + # Check if result has expected structure with multiple rows + expect_true(is.data.table(result)) + expect_equal(names(result), expectedCols) + expect_true(nrow(result) > 0) + + # confirm we processed the all of the instruction sets + expect_equal(uniqueN(result[, .(indicator_key, tab, cat1_varname, cat2_varname)]), + nrow(multiple_instructions)) + + # Check if have the same number of observations for each age + expect_equal(uniqueN(result[, .N, chi_age]$N), 1) +}) + +# Test what happens when instructions filter out all rows---- +test_that("When some instructions filter out all rows, expect it to work but returning warnings", { + warnings <- capture_warnings({ + result <- rbindlist(lapply(c("hos1901000_v1", "hos1901000_v2"), function(indicator) { + chi_chars_injury( + ph.indicator = indicator, + ph.data = copy(mock_chars)[, chi_geo_kc := 'KC'], # Corrupt the data + myinstructions = mock_instructions[indicator_key %in% c("hos1901000_v1", "hos1901000_v2")], + chars.defs = mock_chars_def) + }), fill = TRUE) + }) + + # Check that we got exactly 2 warnings (1 per indicator) + expect_length(warnings, 2) + + # Check content of first warning + expect_match(warnings[1], + "No data found for the following .* instruction\\(s\\) for indicator 'hos1901000_v1'") + + # Check content of second warning + expect_match(warnings[2], + "No data found for the following .* instruction\\(s\\) for indicator 'hos1901000_v2'") + + # Check for any data from _wastate (which should still work) + expect_gt(nrow(result), 0) +}) + +# *Test year restriction for injury data (only 2012+) ---- +test_that("chi_chars_injury correctly handles pre-2012 years", { + # Create instructions with pre-2012 years + early_instructions <- copy(mock_instructions[1]) + early_instructions[, `:=`(start = 2010, end = 2015)] + + # Run the function with pre-2012 years + result <- chi_chars_injury( + ph.indicator = "hos1901000_v1", + ph.data = mock_chars, + myinstructions = early_instructions, + chars.defs = mock_chars_def + ) + + # Check the results + expect_true(is.data.table(result)) + expect_true(nrow(result) > 0) + + # The year range in the result should be 2012-2015, not 2010-2015 + expect_true(all(result$year == "2012-2015")) +}) + +# Test poisoning mechanism handling with drug & non-drug ---- +test_that("chi_chars_injury correctly handles poisoning with ICD-10", { + # Create mock data with both poisoning_drug and poisoning_nondrug + poisoning_test_chars <- copy(mock_chars) + + # Add some specific poisoning_drug and poisoning_nondrug cases + poisoning_test_chars[201:225, `:=`( + mechanism_poisoning_drug = 1, + mechanism_poisoning = 0, + intent_intentional = 1, + injury_intent = "intentional" + )] + + poisoning_test_chars[226:250, `:=`( + mechanism_poisoning_nondrug = 1, + mechanism_poisoning = 0, + intent_intentional = 1, + injury_intent = "intentional" + )] + + # Create a test for poisoning indicator (which may need to collapse drug & non-drug) + result <- chi_chars_injury( + ph.indicator = "hos1902000_v1", + ph.data = poisoning_test_chars, + myinstructions = mock_instructions[indicator_key == "hos1902000_v1"], + chars.defs = mock_chars_def + ) + + # Check the results + expect_true(is.data.table(result)) + expect_true(nrow(result) > 0) + + # The result should include data from the poisoning mechanism + expect_true(sum(result$hospitalizations) > 0) +}) From 20de324949a66a97a688faaee6c1e28489a56d11 Mon Sep 17 00:00:00 2001 From: Danny Colombara Date: Thu, 3 Apr 2025 08:58:09 -0700 Subject: [PATCH 2/9] Tiny tweaks to CHARS functions for devtools::check - .() to list() - declared globals - importFrom utils capture.output --- NAMESPACE | 1 + R/chi_chars_ccs.R | 9 +++++---- R/chi_chars_injury.R | 11 ++++++----- R/globals.R | 9 +++++++++ 4 files changed, 21 insertions(+), 9 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index 7b531cf..365f53f 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -66,6 +66,7 @@ importFrom(stats,na.omit) importFrom(stats,qnorm) importFrom(tidyr,crossing) importFrom(tools,toTitleCase) +importFrom(utils,capture.output) importFrom(utils,tail) importFrom(yaml,read_yaml) importFrom(yaml,yaml.load) diff --git a/R/chi_chars_ccs.R b/R/chi_chars_ccs.R index fc85395..d3835dc 100644 --- a/R/chi_chars_ccs.R +++ b/R/chi_chars_ccs.R @@ -56,6 +56,7 @@ #' #' @import data.table #' @import rads +#' @importFrom utils capture.output #' @export chi_chars_ccs <- function(ph.indicator = NA, ph.data = NULL, @@ -356,12 +357,12 @@ chi_chars_ccs <- function(ph.indicator = NA, # Ensure we have complete data (for all ages withing the specified range) ---- # This creates a template with all possible combinations to avoid gaps when needing to age standardize the results - template <- unique(result[, .(tab, + template <- unique(result[, list(tab, cat1, cat1_varname, cat1_group, cat2, cat2_varname, cat2_group, indicator_key = ph.indicator, year)]) - template <- template[, .(chi_age = seq(age_start, age_end)), by = names(template)] + template <- template[, list(chi_age = seq(age_start, age_end)), by = names(template)] result <- merge(template, result[, chi_age := as.numeric(chi_age)], @@ -370,7 +371,7 @@ chi_chars_ccs <- function(ph.indicator = NA, result[is.na(hospitalizations), hospitalizations := 0] - result <- result[, .(indicator_key, year, chi_age = as.integer(chi_age), hospitalizations, tab, cat1, cat1_varname, cat1_group, cat2, cat2_varname, cat2_group)] + result <- result[, list(indicator_key, year, chi_age = as.integer(chi_age), hospitalizations, tab, cat1, cat1_varname, cat1_group, cat2, cat2_varname, cat2_group)] setorder(result, tab, year, cat1, cat1_varname, cat1_group, cat2, cat2_varname, cat2_group, chi_age, hospitalizations) @@ -401,7 +402,7 @@ chi_chars_ccs <- function(ph.indicator = NA, if (nrow(unused_instructions) > 0) { # Capture the formatted output directly empty_table <- paste( - capture.output( + utils::capture.output( print(unused_instructions, row.names = FALSE, class = FALSE, diff --git a/R/chi_chars_injury.R b/R/chi_chars_injury.R index 9c768eb..768297c 100644 --- a/R/chi_chars_injury.R +++ b/R/chi_chars_injury.R @@ -58,6 +58,7 @@ #' #' @import data.table #' @import rads +#' @importFrom utils capture.output #' @export chi_chars_injury <- function(ph.indicator = NA, ph.data = NULL, @@ -224,7 +225,7 @@ chi_chars_injury <- function(ph.indicator = NA, # For ICD10cm, poisoning is split into drug & non-drug, collapse them if needed if (!is.null(indicator_def$mechanism) && !is.na(indicator_def$mechanism) && indicator_def$mechanism == 'poisoning') { - result <- result[, .(mechanism = 'poisoning', hospitalizations = sum(hospitalizations)), + result <- result[, list(mechanism = 'poisoning', hospitalizations = sum(hospitalizations)), by = setdiff(names(result), c('mechanism', 'hospitalizations'))] } @@ -275,12 +276,12 @@ chi_chars_injury <- function(ph.indicator = NA, # Ensure we have complete data (for all ages within the specified range) ---- # This creates a template with all possible combinations to avoid gaps when needing to age standardize the results - template <- unique(result[, .(tab, + template <- unique(result[, list(tab, cat1, cat1_varname, cat1_group, cat2, cat2_varname, cat2_group, indicator_key = ph.indicator, year)]) - template <- template[, .(chi_age = seq(age_start, age_end)), by = names(template)] + template <- template[, list(chi_age = seq(age_start, age_end)), by = names(template)] result <- merge(template, result[, chi_age := as.numeric(chi_age)], @@ -289,7 +290,7 @@ chi_chars_injury <- function(ph.indicator = NA, result[is.na(hospitalizations), hospitalizations := 0] - result <- result[, .(indicator_key, year, chi_age = as.integer(chi_age), hospitalizations, tab, cat1, cat1_varname, cat1_group, cat2, cat2_varname, cat2_group)] + result <- result[, list(indicator_key, year, chi_age = as.integer(chi_age), hospitalizations, tab, cat1, cat1_varname, cat1_group, cat2, cat2_varname, cat2_group)] setorder(result, tab, year, cat1, cat1_varname, cat1_group, cat2, cat2_varname, cat2_group, chi_age, hospitalizations) @@ -320,7 +321,7 @@ chi_chars_injury <- function(ph.indicator = NA, if (nrow(unused_instructions) > 0) { # Capture the formatted output directly empty_table <- paste( - capture.output( + utils::capture.output( print(unused_instructions, row.names = FALSE, class = FALSE, diff --git a/R/globals.R b/R/globals.R index 72a30e3..2aafb26 100644 --- a/R/globals.R +++ b/R/globals.R @@ -36,8 +36,13 @@ utils::globalVariables(c( "group", "group_by1", "group_by2", + "hospitalizations", + "hospitalizations_icd9", + "hospitalizations_icd10", "hra20_name", + "icd_version", "indicator_key", + "intent", "keepme", "latest_year", "latest_year_count", @@ -49,11 +54,13 @@ utils::globalVariables(c( "latest_yearx", "level", "lower_bound", + "mechanism", "min_age", "max_age", "mykey", "notable", "numerator", + "original_year_range", "overall", "pattern", "pop", @@ -91,6 +98,8 @@ utils::globalVariables(c( "valid_years", "varname", "vebrose", + "wastate", + "year_range", "year.span", "year", "_kingcounty.x", From d22be02f26797986c93065b7b8f281afabf22256 Mon Sep 17 00:00:00 2001 From: Danny Colombara Date: Tue, 8 Apr 2025 14:33:20 -0700 Subject: [PATCH 3/9] rendered new chi_get_cols helpfile no changes to coding, just rendered with devtools::document() --- man/chi_get_cols.Rd | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/man/chi_get_cols.Rd b/man/chi_get_cols.Rd index 8580d92..c1c75d8 100644 --- a/man/chi_get_cols.Rd +++ b/man/chi_get_cols.Rd @@ -4,10 +4,13 @@ \alias{chi_get_cols} \title{Get CHI variable column names} \usage{ -chi_get_cols() +chi_get_cols(metadata = FALSE) +} +\arguments{ +\item{metadata}{returns metadata column names instead of primary data} } \value{ -A character vector of column names +A character vector of column names for the chi data (Default) or metadata } \description{ Returns a character vector of column names defined in the CHI YAML reference file. From 0e7e1e6a4ed532592174be02d13cd1840265bc12 Mon Sep 17 00:00:00 2001 From: Danny Colombara Date: Thu, 10 Apr 2025 15:48:53 -0700 Subject: [PATCH 4/9] FIX warnings for chi_chars_injury previously used the overall max start date for a given indicator_key to identify missing data. This was flawed, because it needs to be row wise. changed max to pmax ensure it is a parallel comparison (i.e., by rows) --- R/chi_chars_injury.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/chi_chars_injury.R b/R/chi_chars_injury.R index 768297c..9b4a418 100644 --- a/R/chi_chars_injury.R +++ b/R/chi_chars_injury.R @@ -307,7 +307,7 @@ chi_chars_injury <- function(ph.indicator = NA, # Use fsetdiff to find instructions that didn't produce results unused_instructions <- fsetdiff( - setcolorder(instructions[, start := max(2012, start)], names(result_combos)), + setcolorder(instructions[, start := pmax(2012, start)], names(result_combos)), result_combos ) From b7d3907b6396d628f80b864f8c6165196d162b82 Mon Sep 17 00:00:00 2001 From: Danny Colombara Date: Thu, 10 Apr 2025 16:35:17 -0700 Subject: [PATCH 5/9] FIXED warning for race3_hispanic cat2_varname instructions have 'race3_hispanic' because it is a distinct variable. It is however reported in the results as 'race3', so, when comparing the results to the instructions to identify missing data, we need need to change back to race3_hispanic (for cat2==Ethnicity) --- R/chi_chars_injury.R | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/R/chi_chars_injury.R b/R/chi_chars_injury.R index 9b4a418..b149696 100644 --- a/R/chi_chars_injury.R +++ b/R/chi_chars_injury.R @@ -302,8 +302,9 @@ chi_chars_injury <- function(ph.indicator = NA, start = as.numeric(substr(year, 1, 4)), end = as.numeric(substr(year, nchar(year) - 3, nchar(year))) )]) - # Handle the special case for race3_hispanic + # Handle the special case for race3_hispanic (format it like the instructions) result_combos[cat1_varname == 'race3' & cat1 == 'Ethnicity', cat1_varname := 'race3_hispanic'] + result_combos[cat2_varname == 'race3' & cat2 == 'Ethnicity', cat2_varname := 'race3_hispanic'] # Use fsetdiff to find instructions that didn't produce results unused_instructions <- fsetdiff( From 9e352cc76cb9bc08fe5b50cc9bb8ed0d68ae22a8 Mon Sep 17 00:00:00 2001 From: Danny Colombara Date: Fri, 11 Apr 2025 15:48:10 -0700 Subject: [PATCH 6/9] Created chi_keep_proper_ages + tests + helpfile - passed all tests - will be used with chi_chars_ccs & chi_chars_injury & possibly other f() --- NAMESPACE | 2 + R/chi_keep_proper_ages.R | 148 ++++++++++++++++++ R/globals.R | 4 + man/chi_keep_proper_ages.Rd | 55 +++++++ tests/testthat/test-chi_keep_proper_ages.R | 172 +++++++++++++++++++++ 5 files changed, 381 insertions(+) create mode 100644 R/chi_keep_proper_ages.R create mode 100644 man/chi_keep_proper_ages.Rd create mode 100644 tests/testthat/test-chi_keep_proper_ages.R diff --git a/NAMESPACE b/NAMESPACE index 365f53f..6248b5c 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -13,6 +13,7 @@ export(chi_generate_tro_shell) export(chi_get_cols) export(chi_get_proper_pop) export(chi_get_yaml) +export(chi_keep_proper_ages) export(chi_qa_tro) export(chi_update_sql) import(data.table) @@ -33,6 +34,7 @@ importFrom(data.table,.GRP) importFrom(data.table,.SD) importFrom(data.table,CJ) importFrom(data.table,`:=`) +importFrom(data.table,as.data.table) importFrom(data.table,between) importFrom(data.table,copy) importFrom(data.table,data.table) diff --git a/R/chi_keep_proper_ages.R b/R/chi_keep_proper_ages.R new file mode 100644 index 0000000..02f1982 --- /dev/null +++ b/R/chi_keep_proper_ages.R @@ -0,0 +1,148 @@ +# chi_keep_proper_ages ---- +#' Keep data for appropriate ages only +#' +#' @description +#' Internal function to filter data based on age groups in cat1_group and cat2_group, +#' preserving only the rows where chi_age falls within the implied age ranges of +#' any age groups in the data. +#' +#' @param ph.data A data.table containing the data to be filtered. Should have the +#' following columns: cat1, cat1_varname, cat1_group, cat2, cat2_varname, cat2_group, +#' indicator_key, year, chi_age, tab -- all of the appropriate classes. +#' +#' @return A filtered data.table with only the rows where chi_age is compatible +#' with the age ranges implied by age groups. +#' +#' @details +#' When \code{cat# == 'Age'}, the function parses \code{cat#_group} labels (e.g., +#' "<18", "18-24", "75+") to determine minimum and maximum ages, then filters the +#' data accordingly. It handles age groups in both cat1 and cat2 categories. +#' +#' @importFrom data.table is.data.table as.data.table rbindlist setnames data.table +#' +#' @examples +#' library(data.table) +#' +#' # Make test data with all combinations of categories and age +#' test_data <- CJ( +#' cat1 = "Gender", +#' cat1_varname = "chi_sex", +#' cat1_group = c("Female", "Male"), +#' cat2 = "Age", +#' cat2_varname = "yage4", +#' cat2_group = c("0-4", "5-9", "10-14"), +#' chi_age = 1:20, +#' unique = TRUE +#' ) +#' +#' # Add extra required columns +#' test_data[, `:=`( +#' indicator_key = "indicator1", +#' year = "2023", +#' tab = "mytab", +#' count = 1 +#' )] +#' +#' # Example usage +#' filtered_data <- chi_keep_proper_ages(test_data) +#' filtered_data[cat2_group == "10-14", unique(chi_age)] +#' +#' @export +chi_keep_proper_ages <- function(ph.data = NULL) { + # Validate inputs + if (is.null(ph.data)) { + stop("\n\U1F6D1 ph.data must be provided") + } + + if (!data.table::is.data.table(ph.data)) { + ph.data <- data.table::as.data.table(ph.data) + } + + # Check for required columns + required_columns <- c("cat2_group", "cat1_group", "indicator_key", "year", + "chi_age", "tab", "cat1", "cat1_varname", + "cat2", "cat2_varname") + + missing_columns <- required_columns[!required_columns %in% names(ph.data)] + if (length(missing_columns) > 0) { + stop(paste0("\n\U1F6D1 ph.data is missing required columns: ", + paste(missing_columns, collapse = ", "))) + } + + # Check expected column classes + expected_classes <- list( + cat1 = c("character", "factor"), + cat1_varname = c("character", "factor"), + cat1_group = c("character", "factor"), + cat2 = c("character", "factor"), + cat2_varname = c("character", "factor"), + cat2_group = c("character", "factor"), + indicator_key = c("character", "factor"), + year = "character", + chi_age = c("numeric", "integer"), + tab = c("character", "factor") + ) + + for (col_name in names(expected_classes)) { + expected <- expected_classes[[col_name]] + valid_class <- FALSE + + # Check if column inherits from any of the expected classes + for (expected_class in expected) { + if (inherits(ph.data[[col_name]], expected_class)) { + valid_class <- TRUE + break # once a valid class is established, no need to keep checking for that col_name + } + } + + if (isFALSE(valid_class)) { + expected_str <- paste(expected, collapse = " or ") + actual_class <- class(ph.data[[col_name]])[1] # Get primary class + stop(paste0("\n\U1F6D1 Column '", col_name, "' should be of class '", + expected_str, "', but is '", actual_class, "'")) + } + } + + # Make function used in lapply below to get min and max age from age group + parse_age_group <- function(group) { + + group <- gsub(" ", "", group) + if (grepl("^<[0-9]+$", group)) { + return(list(c(NA, as.numeric(sub("<", "", group)) - 1))) + } else if (grepl("^[0-9]+-[0-9]+$", group)) { + return(list(as.numeric(unlist(strsplit(group, "-"))))) + } else if (grepl("^[0-9]+\\+$", group)) { + return(list(c(as.numeric(sub("\\+", "", group)), NA))) + } else { + return(list(c(NA, NA))) + } + } + + + # Create reference table of age groups and corresponding min_age and max_age + all_groups <- unique(c(ph.data$cat1_group[ph.data$cat1 == "Age"], + ph.data$cat2_group[ph.data$cat2 == "Age"])) + all_bounds <- rbindlist(lapply(all_groups, function(mygroup) { + bounds <- parse_age_group(mygroup)[[1]] + data.table(age_group = mygroup, min_age = bounds[1], max_age = bounds[2]) + })) + + # Merge cat1 bounds + setnames(all_bounds, c("age_group", "min_age", "max_age"), c("cat1_group", "min1", "max1")) + ph.data <- merge(ph.data, all_bounds, by = "cat1_group", all.x = TRUE, sort = FALSE) + + # Merge cat2 bounds + setnames(all_bounds, c("cat1_group", "min1", "max1"), c("cat2_group", "min2", "max2")) + ph.data <- merge(ph.data, all_bounds, by = "cat2_group", all.x = TRUE, sort = FALSE) + + # Create filter to identify rows to drop + ph.data[, keepme := TRUE] + + ph.data[cat1 == "Age", keepme := keepme & + (is.na(min1) | chi_age >= min1) & (is.na(max1) | chi_age <= max1)] + ph.data[cat2 == "Age", keepme := keepme & + (is.na(min2) | chi_age >= min2) & (is.na(max2) | chi_age <= max2)] + + # Clean up and return + return(ph.data[keepme == TRUE][, c("keepme", "min1", "max1", "min2", "max2") := NULL]) +} diff --git a/R/globals.R b/R/globals.R index 2aafb26..e51db41 100644 --- a/R/globals.R +++ b/R/globals.R @@ -55,7 +55,11 @@ utils::globalVariables(c( "level", "lower_bound", "mechanism", + "min1", + "min2", "min_age", + "max1", + "max2", "max_age", "mykey", "notable", diff --git a/man/chi_keep_proper_ages.Rd b/man/chi_keep_proper_ages.Rd new file mode 100644 index 0000000..49d44f3 --- /dev/null +++ b/man/chi_keep_proper_ages.Rd @@ -0,0 +1,55 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/chi_keep_proper_ages.R +\name{chi_keep_proper_ages} +\alias{chi_keep_proper_ages} +\title{Keep data for appropriate ages only} +\usage{ +chi_keep_proper_ages(ph.data = NULL) +} +\arguments{ +\item{ph.data}{A data.table containing the data to be filtered. Should have the +following columns: cat1, cat1_varname, cat1_group, cat2, cat2_varname, cat2_group, +indicator_key, year, chi_age, tab -- all of the appropriate classes.} +} +\value{ +A filtered data.table with only the rows where chi_age is compatible +with the age ranges implied by age groups. +} +\description{ +Internal function to filter data based on age groups in cat1_group and cat2_group, +preserving only the rows where chi_age falls within the implied age ranges of +any age groups in the data. +} +\details{ +When \code{cat# == 'Age'}, the function parses \code{cat#_group} labels (e.g., +"<18", "18-24", "75+") to determine minimum and maximum ages, then filters the +data accordingly. It handles age groups in both cat1 and cat2 categories. +} +\examples{ +library(data.table) + +# Make test data with all combinations of categories and age +test_data <- CJ( + cat1 = "Gender", + cat1_varname = "chi_sex", + cat1_group = c("Female", "Male"), + cat2 = "Age", + cat2_varname = "yage4", + cat2_group = c("0-4", "5-9", "10-14"), + chi_age = 1:20, + unique = TRUE +) + +# Add extra required columns +test_data[, `:=`( + indicator_key = "indicator1", + year = "2023", + tab = "mytab", + count = 1 +)] + +# Example usage +filtered_data <- chi_keep_proper_ages(test_data) +filtered_data[cat2_group == "10-14", unique(chi_age)] + +} diff --git a/tests/testthat/test-chi_keep_proper_ages.R b/tests/testthat/test-chi_keep_proper_ages.R new file mode 100644 index 0000000..f5b88d3 --- /dev/null +++ b/tests/testthat/test-chi_keep_proper_ages.R @@ -0,0 +1,172 @@ +# Helper function to create test data (not in helper.R because very specific to this function) +create_test_data <- function() { + cat1_age6 <- data.table(cat1 = 'Age', cat1_varname = 'age6', + cat1_group = c('<18', '18-24', '25-44', '45-64', '65-74', '75+'), + mykey = 1) + cat2_sex <- data.table(cat2 = 'Gender', cat2_varname = 'chi_sex', + cat2_group = c('Female', 'Male'), mykey = 1) + cat1_sex <- setnames(copy(cat2_sex), gsub('cat2', 'cat1', names(cat2_sex))) + cat2_yage4 <- data.table(cat2 = 'Age', cat2_varname = 'yage4', + cat2_group = c('0-4', '10-14', '15-17', '5-9'), mykey = 1) + cat1_geo <- data.table(cat1 = 'King County', cat1_varname = 'chi_geo_kc', + cat1_group = 'King County', mykey = 1) + chi_ages <- data.table(chi_age = 0:100, mykey = 1) + + test.counts <- rbind( + merge(cat1_age6, cat2_sex, allow.cartesian = TRUE), + merge(cat1_sex, cat2_yage4, allow.cartesian = TRUE), + merge(cat1_geo, cat2_sex, allow.cartesian = TRUE)) + + test.counts <- merge(test.counts, chi_ages, allow.cartesian = TRUE)[, mykey := NULL] + + test.counts[, `:=` (indicator_key = 'indicator1', + year = '2020', + tab = 'mytab', + count = 1)] + + return(test.counts) +} + +test_that("Input validation works correctly", { + # Test NULL input + expect_error(chi_keep_proper_ages(NULL), "ph.data must be provided") + + # Test missing columns + incomplete_data <- data.table(cat1 = "Age", cat1_group = "<18") + expect_error(chi_keep_proper_ages(incomplete_data), "ph.data is missing required columns") + + # Test incorrect column classes + bad_class_data <- create_test_data() + bad_class_data[, chi_age := as.character(chi_age)] + expect_error(chi_keep_proper_ages(bad_class_data), "should be of class 'numeric or integer'") +}) + +test_that("Age filtering works correctly for cat1 Age groups", { + # Create test data + test.data <- create_test_data() + + # Filter data + filtered_data <- chi_keep_proper_ages(test.data) + + # Test age group '<18' + age_group_rows <- filtered_data[cat1 == "Age" & cat1_group == "<18"] + expect_true(all(age_group_rows$chi_age < 18)) + + # Test age group '18-24' + age_group_rows <- filtered_data[cat1 == "Age" & cat1_group == "18-24"] + expect_true(all(age_group_rows$chi_age >= 18 & age_group_rows$chi_age <= 24)) + + # Test age group '25-44' + age_group_rows <- filtered_data[cat1 == "Age" & cat1_group == "25-44"] + expect_true(all(age_group_rows$chi_age >= 25 & age_group_rows$chi_age <= 44)) + + # Test age group '45-64' + age_group_rows <- filtered_data[cat1 == "Age" & cat1_group == "45-64"] + expect_true(all(age_group_rows$chi_age >= 45 & age_group_rows$chi_age <= 64)) + + # Test age group '65-74' + age_group_rows <- filtered_data[cat1 == "Age" & cat1_group == "65-74"] + expect_true(all(age_group_rows$chi_age >= 65 & age_group_rows$chi_age <= 74)) + + # Test age group '75+' + age_group_rows <- filtered_data[cat1 == "Age" & cat1_group == "75+"] + expect_true(all(age_group_rows$chi_age >= 75)) +}) + +test_that("Age filtering works correctly for cat2 Age groups", { + # Create test data + test.data <- create_test_data() + + # Filter data + filtered_data <- chi_keep_proper_ages(test.data) + + # Test age group '0-4' + age_group_rows <- filtered_data[cat2 == "Age" & cat2_group == "0-4"] + expect_true(all(age_group_rows$chi_age >= 0 & age_group_rows$chi_age <= 4)) + + # Test age group '5-9' + age_group_rows <- filtered_data[cat2 == "Age" & cat2_group == "5-9"] + expect_true(all(age_group_rows$chi_age >= 5 & age_group_rows$chi_age <= 9)) + + # Test age group '10-14' + age_group_rows <- filtered_data[cat2 == "Age" & cat2_group == "10-14"] + expect_true(all(age_group_rows$chi_age >= 10 & age_group_rows$chi_age <= 14)) + + # Test age group '15-17' + age_group_rows <- filtered_data[cat2 == "Age" & cat2_group == "15-17"] + expect_true(all(age_group_rows$chi_age >= 15 & age_group_rows$chi_age <= 17)) +}) + +test_that("Non-Age categories are not filtered", { + # Create test data + test.data <- create_test_data() + + # Filter data + filtered_data <- chi_keep_proper_ages(test.data) + + # Test for geographic category (completely unrelated to Age) + geo_rows_original <- test.data[cat1 == "King County"] + geo_rows_filtered <- filtered_data[cat1 == "King County"] + + # For completely non-Age categories, all rows should remain + # (note: filtering only applies when cat1 or cat2 is "Age") + expect_equal(nrow(geo_rows_original), nrow(geo_rows_filtered)) + + # Check that all unique chi_age values remain for King County rows + geo_original_ages <- sort(unique(geo_rows_original$chi_age)) + geo_filtered_ages <- sort(unique(geo_rows_filtered$chi_age)) + + expect_equal(geo_original_ages, geo_filtered_ages) + + # Also check that Gender categories not paired with Age are preserved + gender_only_original <- test.data[cat1 == "Gender" & cat2 != "Age"] + gender_only_filtered <- filtered_data[cat1 == "Gender" & cat2 != "Age"] + + expect_equal(nrow(gender_only_original), nrow(gender_only_filtered)) +}) + +test_that("Count sums decrease after filtering", { + # Create test data + test.data <- create_test_data() + + # Sum of counts before filtering + original_count_sum <- sum(test.data$count) + + # Filter data + filtered_data <- chi_keep_proper_ages(test.data) + + # Sum of counts after filtering + filtered_count_sum <- sum(filtered_data$count) + + # There should be fewer rows after filtering + expect_true(nrow(filtered_data) < nrow(test.data)) + + # Sum of counts should decrease + expect_true(filtered_count_sum < original_count_sum) + + # Calculate exact expected count + # For each age group, count rows that should be filtered out + expected_filtered_rows <- test.data[ + (cat1 == "Age" & cat1_group == "<18" & chi_age >= 18) | + (cat1 == "Age" & cat1_group == "18-24" & (chi_age < 18 | chi_age > 24)) | + (cat1 == "Age" & cat1_group == "25-44" & (chi_age < 25 | chi_age > 44)) | + (cat1 == "Age" & cat1_group == "45-64" & (chi_age < 45 | chi_age > 64)) | + (cat1 == "Age" & cat1_group == "65-74" & (chi_age < 65 | chi_age > 74)) | + (cat1 == "Age" & cat1_group == "75+" & chi_age < 75) | + (cat2 == "Age" & cat2_group == "0-4" & (chi_age < 0 | chi_age > 4)) | + (cat2 == "Age" & cat2_group == "5-9" & (chi_age < 5 | chi_age > 9)) | + (cat2 == "Age" & cat2_group == "10-14" & (chi_age < 10 | chi_age > 14)) | + (cat2 == "Age" & cat2_group == "15-17" & (chi_age < 15 | chi_age > 17)) + ] + + # Count rows that should remain + rows_with_age_categories <- test.data[(cat1 == "Age") | (cat2 == "Age")] + non_age_categories_rows <- test.data[(cat1 != "Age") & (cat2 != "Age")] + + expected_age_rows_after_filtering <- nrow(rows_with_age_categories) - nrow(expected_filtered_rows) + expected_total_rows <- expected_age_rows_after_filtering + nrow(non_age_categories_rows) + + # Check if we have the expected number of rows after filtering + expect_equal(nrow(filtered_data), expected_total_rows) +}) + From ea9ff1ae056cb8ec291432e9b75a56b5fc99956a Mon Sep 17 00:00:00 2001 From: Danny Colombara Date: Fri, 11 Apr 2025 15:57:00 -0700 Subject: [PATCH 7/9] chi_chars_ccs & chi_chars_injury proper ages - both now use chi_keep_proper_ages to keep only relevant chi_age values --- R/chi_chars_ccs.R | 3 +++ R/chi_chars_injury.R | 3 +++ 2 files changed, 6 insertions(+) diff --git a/R/chi_chars_ccs.R b/R/chi_chars_ccs.R index d3835dc..d575805 100644 --- a/R/chi_chars_ccs.R +++ b/R/chi_chars_ccs.R @@ -419,6 +419,9 @@ chi_chars_ccs <- function(ph.indicator = NA, } + # Drop irrelevant ages ---- + result <- chi_keep_proper_ages(result) + # Return data.table ---- return(result) } diff --git a/R/chi_chars_injury.R b/R/chi_chars_injury.R index b149696..722b079 100644 --- a/R/chi_chars_injury.R +++ b/R/chi_chars_injury.R @@ -338,6 +338,9 @@ chi_chars_injury <- function(ph.indicator = NA, )) } + # Drop irrelevant ages ---- + result <- chi_keep_proper_ages(result) + # Return data.table ---- return(result) } From 092f83b15e49ff46d1b38467abef3ac266e94ea9 Mon Sep 17 00:00:00 2001 From: Danny Colombara Date: Mon, 14 Apr 2025 09:19:13 -0700 Subject: [PATCH 8/9] Fix single year & properly order columns - '2022-2022' is now '2022' - enure final output columns have desired order --- R/chi_chars_ccs.R | 6 ++++++ R/chi_chars_injury.R | 6 ++++++ 2 files changed, 12 insertions(+) diff --git a/R/chi_chars_ccs.R b/R/chi_chars_ccs.R index d575805..f6290a7 100644 --- a/R/chi_chars_ccs.R +++ b/R/chi_chars_ccs.R @@ -422,6 +422,12 @@ chi_chars_ccs <- function(ph.indicator = NA, # Drop irrelevant ages ---- result <- chi_keep_proper_ages(result) + # Tidy year (e.g., '2025-2025' becomes just '2025') ---- + result[, year := gsub("^(\\d{4})-(\\1)$", "\\1", year)] # \\1 is a back reference to what was found before hyphen + + # Order columns ---- + setcolorder(result, c("indicator_key", "year", "chi_age", "hospitalizations", "tab", "cat1", "cat1_varname", "cat1_group", "cat2", "cat2_varname", "cat2_group")) + # Return data.table ---- return(result) } diff --git a/R/chi_chars_injury.R b/R/chi_chars_injury.R index 722b079..a136d0d 100644 --- a/R/chi_chars_injury.R +++ b/R/chi_chars_injury.R @@ -341,6 +341,12 @@ chi_chars_injury <- function(ph.indicator = NA, # Drop irrelevant ages ---- result <- chi_keep_proper_ages(result) + # Tidy year (e.g., '2025-2025' becomes just '2025') ---- + result[, year := gsub("^(\\d{4})-(\\1)$", "\\1", year)] # \\1 is a back reference to what was found before hyphen + + # Order columns ---- + setcolorder(result, c("indicator_key", "year", "chi_age", "hospitalizations", "tab", "cat1", "cat1_varname", "cat1_group", "cat2", "cat2_varname", "cat2_group")) + # Return data.table ---- return(result) } From 1cddf4ed300875db33789c6ca41d498fa36cf860 Mon Sep 17 00:00:00 2001 From: Danny Colombara Date: Mon, 14 Apr 2025 09:21:22 -0700 Subject: [PATCH 9/9] return all data when cat1 & cat2 are not age - filtering by age doesn't make sense if cat1 | cat2 are not Age, so return entire unfiltered dataste --- R/chi_keep_proper_ages.R | 3 +++ 1 file changed, 3 insertions(+) diff --git a/R/chi_keep_proper_ages.R b/R/chi_keep_proper_ages.R index 02f1982..3e39880 100644 --- a/R/chi_keep_proper_ages.R +++ b/R/chi_keep_proper_ages.R @@ -122,6 +122,9 @@ chi_keep_proper_ages <- function(ph.data = NULL) { # Create reference table of age groups and corresponding min_age and max_age all_groups <- unique(c(ph.data$cat1_group[ph.data$cat1 == "Age"], ph.data$cat2_group[ph.data$cat2 == "Age"])) + + if(length(all_groups) == 0){return(ph.data)} # no need to continue if cat1 or cat2 are not 'Age' + all_bounds <- rbindlist(lapply(all_groups, function(mygroup) { bounds <- parse_age_group(mygroup)[[1]] data.table(age_group = mygroup, min_age = bounds[1], max_age = bounds[2])