diff --git a/NAMESPACE b/NAMESPACE index de2cc8a..6248b5c 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,6 +1,8 @@ # Generated by roxygen2: do not edit by hand export(chi_calc) +export(chi_chars_ccs) +export(chi_chars_injury) export(chi_compare_estimates) export(chi_count_by_age) export(chi_drop_illogical_ages) @@ -11,12 +13,15 @@ export(chi_generate_tro_shell) export(chi_get_cols) export(chi_get_proper_pop) export(chi_get_yaml) +export(chi_keep_proper_ages) export(chi_qa_tro) export(chi_update_sql) +import(data.table) import(dtsurvey) import(future) import(future.apply) import(progressr) +import(rads) importFrom(DBI,Id) importFrom(DBI,dbConnect) importFrom(DBI,dbDisconnect) @@ -29,6 +34,7 @@ importFrom(data.table,.GRP) importFrom(data.table,.SD) importFrom(data.table,CJ) importFrom(data.table,`:=`) +importFrom(data.table,as.data.table) importFrom(data.table,between) importFrom(data.table,copy) importFrom(data.table,data.table) @@ -62,6 +68,7 @@ importFrom(stats,na.omit) importFrom(stats,qnorm) importFrom(tidyr,crossing) importFrom(tools,toTitleCase) +importFrom(utils,capture.output) importFrom(utils,tail) importFrom(yaml,read_yaml) importFrom(yaml,yaml.load) diff --git a/R/chi_chars_ccs.R b/R/chi_chars_ccs.R new file mode 100644 index 0000000..f6290a7 --- /dev/null +++ b/R/chi_chars_ccs.R @@ -0,0 +1,433 @@ +# chi_chars_ccs ---- +#' Generate CHI CHARS counts by CCS category +#' +#' @description +#' Generate hospitalization counts from Comprehensive Hospital Abstract Reporting +#' System (CHARS) data based on Clinical Classification Software (CCS) categories +#' for CHI. This function processes instructions for specific indicators and +#' summarizes CHARS data accordingly. It automatically handles the ICD-9 to ICD-10 +#' transition that occurred in 2016. +#' +#' @param ph.indicator A character string of length 1. The indicator key to process, +#' which must exist in the chars.defs data table. +#' @param ph.data A data.table containing the CHARS data to be processed. +#' @param myinstructions A data.table containing processing instructions for each indicator. +#' Default is the output from chi_generate_tro_shell(). +#' @param chars.defs A data.table containing definitions for each indicator. It +#' should have the following columns: `indicator_name`, `indicator_key`, `intent`, +#' `mechanism`, `superlevel`, `broad`, `midlevel`, `detailed`, `age_start`, and +#' `age_end`. +#' +#' @return A data.table containing the summarized CHARS hospitalization data for the +#' specified indicator, stratified by the requested demographic variables. +#' +#' @details +#' The function processes multiple instructions for the same indicator sequentially. +#' +#' The function automatically detects whether data spans the ICD-9 to ICD-10 transition in 2016 +#' and processes each part with the appropriate ICD version. Results are then combined seamlessly. +#' +#' @examples +#' \dontrun{ +#' # Example of how to run with future_lapply for memory efficiency +#' library(future) +#' library(future.apply) +#' +#' plan(multisession, workers = future::availableCores() - 1) +#' +#' countsCCS <- rbindlist(future_lapply(VectorOfIndicators, function(indicator) { +#' chi_chars_ccs( +#' ph.indicator = indicator, +#' ph.data = chars, +#' myinstructions = myinstructions, +#' chars.defs = chars.defs) +#' }, future.seed = TRUE)) +#' +#' plan(sequential) +#' +#' } +#' +#' @seealso +#' \code{\link[rads]{get_data_chars}}, which provides creates ph.data +#' +#' \code{\link[rads]{chars_icd_ccs_count}}, which is the engine used by this function +#' +#' \code{\link{chi_generate_tro_shell}}, which creates myinstructions +#' +#' @import data.table +#' @import rads +#' @importFrom utils capture.output +#' @export +chi_chars_ccs <- function(ph.indicator = NA, + ph.data = NULL, + myinstructions = NULL, + chars.defs = NULL) { + # Input validation ----- + if (is.na(ph.indicator)) { + stop("\n\U1F6D1 ph.indicator must be provided.") + } + + for (arg_name in c("ph.data", "myinstructions", "chars.defs")) { + arg_value <- get(arg_name) + if (is.null(arg_value)) { + stop(paste0("\n\U1F6D1 ", arg_name, " must be specified.")) + } + if (is.data.frame(arg_value)) { + setDT(arg_value) + } else if (!is.data.frame(arg_value)) { + stop(paste0("\n\U1F6D1 ", arg_name, " must be a data.table or data.frame.")) + } + } + + # Ensure indicator exists + instructions <- myinstructions[indicator_key == ph.indicator] + + if (nrow(instructions) == 0) { + stop(paste0("\n\U1F6D1 ph.indicator '", ph.indicator, "' not found in myinstructions.")) + } + + if (!ph.indicator %in% chars.defs$indicator_key) { + stop(paste0("\n\U1F6D1 ph.indicator '", ph.indicator, "' not found in chars.defs.")) + } + + + # Ensure cat1_varname and cat2_varname exist in ph.data + unique_cat_vars <- unique(c( + instructions$cat1_varname[!is.na(instructions$cat1_varname)], + instructions$cat2_varname[!is.na(instructions$cat2_varname)] + )) + + missing_cols <- unique_cat_vars[!unique_cat_vars %in% names(ph.data)] + if (length(missing_cols) > 0) { + stop(paste0("\n\U1F47F The following columns referenced in instructions don't exist in ph.data: ", + paste(missing_cols, collapse = ", "))) + } + + # Extract indicator parameters + indicator_def <- chars.defs[indicator_key == ph.indicator] + age_start <- indicator_def$age_start + age_end <- indicator_def$age_end + + # Verify that this is not an injury indicator (those should use chi_chars_injury) + has_mechanism <- "mechanism" %in% names(indicator_def) && !is.na(indicator_def$mechanism) + has_intent <- "intent" %in% names(indicator_def) && !is.na(indicator_def$intent) + + if (has_mechanism || has_intent) { + stop(paste0("\n\U1F6D1 Indicator '", ph.indicator, "' appears to be an injury indicator. ", + "You must using chi_chars_injury() instead.")) + } + + # Check for required parameters + if (!all(c("superlevel", "broad", "midlevel", 'detailed') %in% names(indicator_def))) { + stop("\n\U1F6D1 chars.defs must contain columns 'superlevel', 'broad', 'midlevel', and 'detailed' for CCS indicators.") + } + + # Import CM reference table ---- + message("Loading ICD-CM reference table (this will happen only once per ph.indicator)") + if (any(instructions$start < 2016, na.rm = TRUE)){ICDCM9_table <- rads::chars_icd_ccs(icdcm_version = 9)} + ICDCM10_table <- rads::chars_icd_ccs(icdcm_version = 10) + + # Helper function to process ICD data (will be passed to lapply) ---- + process_icd_data <- function(icdcm_version, + current_start, + current_end, + current_row, + age_start, + age_end, + ph_data, + ICDCM_table, + indicator_def, + indicator_key) { + # Determine geographical filter based on tab ---- + is_wastate <- current_row$tab == "_wastate" + + # Filter data for appropriate years and geography ---- + if (is_wastate) { + data_subset <- data.table::copy(ph.data[wastate == 'Washington State' & + chi_year >= current_start & + chi_year <= current_end & + chi_age >= age_start & + chi_age <= age_end]) + } else { + data_subset <- data.table::copy(ph.data[chi_geo_kc == 'King County' & + chi_year >= current_start & + chi_year <= current_end & + chi_age >= age_start & + chi_age <= age_end]) + } + + if (nrow(data_subset) == 0) { + return(NULL) + } + + # Add year information ---- + data_subset[, year_range := paste0(current_start, "-", current_end)] + + # Get CCS table based on ICD version ---- + ICDCM_table<- if (icdcm_version == 9) ICDCM9_table else ICDCM10_table + + # Generate arguments for chars_icd_ccs_count ---- + superlevel_arg <- if (!is.na(indicator_def$superlevel)) indicator_def$superlevel else NULL + broad_arg <- if (!is.na(indicator_def$broad)) indicator_def$broad else NULL + midlevel_arg <- if (!is.na(indicator_def$midlevel)) indicator_def$midlevel else NULL + detailed_arg <- if (!is.na(indicator_def$detailed)) indicator_def$detailed else NULL + + # Generate group_by argument ---- + if (is.na(current_row$cat2_varname)) { + group_by_arg <- current_row$cat1_varname + } else { + group_by_arg <- c(current_row$cat1_varname, current_row$cat2_varname) + } + group_by_arg <- c('chi_age', unique(group_by_arg)) + + # Process data with rads::chars_icd_ccs_count---- + result <- rads::chars_icd_ccs_count( + ph.data = data_subset, + icdcm_version = icdcm_version, + CMtable= ICDCM_table, + icdcm = NULL, + superlevel = superlevel_arg, + broad = broad_arg, + midlevel = midlevel_arg, + detailed = detailed_arg, + icdcol = 'diag1', + group_by = group_by_arg, + kingco = FALSE # Already filtered data above, so always FALSE + ) + + # Format the results ---- + if (nrow(result) > 0) { + # Add metadata columns + result[, icd_version := icdcm_version] + result[, original_year_range := paste0(current_row$start, "-", current_row$end)] + result[, tab := current_row$tab] + result[, cat1 := current_row$cat1] + result[, cat2 := current_row$cat2] + result[, cat1_varname := gsub("_hispanic", "", current_row$cat1_varname)] + result[, cat2_varname := gsub("_hispanic", "", current_row$cat2_varname)] + + # Rename category columns + data.table::setnames(result, current_row$cat1_varname, "cat1_group") + + if (!is.na(current_row$cat2_varname)) { + data.table::setnames(result, current_row$cat2_varname, "cat2_group", skip_absent = TRUE) + } + + # Filter and format + result <- result[!is.na(cat1_group)] + result[cat2_varname == 'chi_geo_kc', cat2_group := 'King County'] + result <- result[is.na(cat2_varname) | (!is.na(cat2_varname) & !is.na(cat2_group))] + result[, ph.indicator := ph.indicator] + + } + + return(result) + } + + # Process sequentially ---- + message('\U023F3 Processing calculations... this may take a while') + + # Process each row + results_list <- lapply( + X = seq_len(nrow(instructions)), + FUN = function(i) { + # Progress meter ---- + current_row <- instructions[i] + row_info <- paste( + ph.indicator, + current_row$tab, + current_row$cat1, + current_row$cat1_varname, + current_row$cat2, + current_row$cat2_varname, + current_row$end, + current_row$start, + sep = "|" + ) + message(paste0("myinstructions row ", i, "/", nrow(instructions), ": \n - ", row_info)) + + # Determine if we need ICD-9, ICD-10, or both ---- + current_start <- as.integer(instructions[i, start]) + current_end <- as.integer(instructions[i, end]) + need_icd9 <- current_start < 2016 # Includes any year before 2016 + need_icd10 <- current_end >= 2016 # Includes any year from 2016 onwards + + # Results for this instruction row ---- + row_results <- list() + + # Process with ICD-9 if needed ---- + if (need_icd9) { + message(paste0(" - Processing ICD-9 portion (", current_start, "-", min(current_end, 2015), ")")) + row_results$icd9 <- process_icd_data( + icdcm_version = 9, + current_start = current_start, + current_end = min(current_end, 2015), + current_row = instructions[i,], + age_start = age_start, + age_end = age_end, + ph_data = ph.data, + ICDCM_table= ICDCM9_table, + indicator_def = indicator_def, + indicator_key = ph.indicator + ) + } + + # Process with ICD-10 if needed ---- + if (need_icd10) { + message(paste0(" - Processing ICD-10 portion (", max(current_start, 2016), "-", current_end, ")")) + row_results$icd10 <- process_icd_data( + icdcm_version = 10, + current_start = max(current_start, 2016), + current_end = current_end, + current_row = instructions[i,], + age_start = age_start, + age_end = age_end, + ph_data = ph.data, + ICDCM_table= ICDCM10_table, + indicator_def = indicator_def, + indicator_key = ph.indicator + ) + } + + # If we have both ICD-9 and ICD-10 results, merge them ---- + # When data spans both ICD-9 and ICD-10 periods, we need to merge the results + # This handles the ICD transition in 2016 by combining hospitalization counts + # while preserving all demographic stratifications + if (!is.null(row_results$icd9) && !is.null(row_results$icd10) && + nrow(row_results$icd9) > 0 && nrow(row_results$icd10) > 0) { + + # Identify common columns for merging + merge_cols <- intersect( + names(row_results$icd9)[!names(row_results$icd9) %in% c("hospitalizations", "icd_version", "year_range")], + names(row_results$icd10)[!names(row_results$icd10) %in% c("hospitalizations", "icd_version", "year_range")] + ) + + # Merge data + combined <- merge( + row_results$icd9, + row_results$icd10, + by = merge_cols, + suffixes = c("_icd9", "_icd10"), + all = TRUE + ) + + # Sum hospitalizations (handling missing values) + combined[, hospitalizations := sum( + ifelse(is.na(hospitalizations_icd9), 0, hospitalizations_icd9), + ifelse(is.na(hospitalizations_icd10), 0, hospitalizations_icd10) + ), by = merge_cols] + + # Set year to the full original range + combined[, year := original_year_range] + + return(combined) + } else if (!is.null(row_results$icd9) && nrow(row_results$icd9) > 0) { + # Only ICD-9 results available + row_results$icd9[, year := original_year_range] + return(row_results$icd9) + } else if (!is.null(row_results$icd10) && nrow(row_results$icd10) > 0) { + # Only ICD-10 results available + row_results$icd10[, year := original_year_range] + return(row_results$icd10) + } else { + # No results available + return(NULL) + } + } + ) + + # Filter out NULL results + results_list <- results_list[!sapply(results_list, is.null)] + + # Combine results if any exist + if (length(results_list) > 0) { + result <- data.table::rbindlist(results_list, fill = TRUE, use.names = TRUE) + } else { + warning("\u26A0\ufe0f No results found for indicator: ", ph.indicator) + result <- data.table::data.table() + } + + # Remove temporary columns if they exist + cols_to_remove <- intersect(names(result), + c('icd_version_icd9', 'icd_version_icd10', 'hospitalizations_icd9', + 'hospitalizations_icd10', 'icd_version', 'original_year_range')) + if (length(cols_to_remove) > 0) { + result[, (cols_to_remove) := NULL] + } + + # Ensure we have complete data (for all ages withing the specified range) ---- + # This creates a template with all possible combinations to avoid gaps when needing to age standardize the results + template <- unique(result[, list(tab, + cat1, cat1_varname, cat1_group, + cat2, cat2_varname, cat2_group, + indicator_key = ph.indicator, year)]) + + template <- template[, list(chi_age = seq(age_start, age_end)), by = names(template)] + + result <- merge(template, + result[, chi_age := as.numeric(chi_age)], + by = intersect(names(template), names(result)), + all = TRUE) + + result[is.na(hospitalizations), hospitalizations := 0] + + result <- result[, list(indicator_key, year, chi_age = as.integer(chi_age), hospitalizations, tab, cat1, cat1_varname, cat1_group, cat2, cat2_varname, cat2_group)] + + setorder(result, tab, year, cat1, cat1_varname, cat1_group, cat2, cat2_varname, cat2_group, chi_age, hospitalizations) + + # Identify instructions that caused all data to be filtered out ---- + # this helps diagnose data quality issues, either in myinstructions or in ph.data + if (nrow(result) > 0) { + # Get combinations actually used in results + result_combos <- unique(result[, list(indicator_key, tab, cat1, cat1_varname, cat2, cat2_varname, + start = as.numeric(substr(year, 1, 4)), + end = as.numeric(substr(year, nchar(year) - 3, nchar(year))) )]) + + # Handle the special case for race3_hispanic + result_combos[cat1_varname == 'race3' & cat1 == 'Ethnicity', cat1_varname := 'race3_hispanic'] + + # Use fsetdiff to find instructions that didn't produce results + unused_instructions <- fsetdiff( + setcolorder(instructions, names(result_combos)), + result_combos + ) + + # Sort for warning display + setorder(unused_instructions, indicator_key, tab, cat1, cat1_varname, cat2, cat2_varname, start, end) + } else { + unused_instructions <- copy(instructions) + } + + # Generate warnings and attach to result + if (nrow(unused_instructions) > 0) { + # Capture the formatted output directly + empty_table <- paste( + utils::capture.output( + print(unused_instructions, + row.names = FALSE, + class = FALSE, + printClassOfColumns = FALSE) + ), + collapse = "\n" + ) + + warning(paste0( + "\n\u26A0\ufe0f No data found for the following ", nrow(unused_instructions), + " instruction(s) for indicator '", ph.indicator, "':\n", + empty_table + )) + + } + + # Drop irrelevant ages ---- + result <- chi_keep_proper_ages(result) + + # Tidy year (e.g., '2025-2025' becomes just '2025') ---- + result[, year := gsub("^(\\d{4})-(\\1)$", "\\1", year)] # \\1 is a back reference to what was found before hyphen + + # Order columns ---- + setcolorder(result, c("indicator_key", "year", "chi_age", "hospitalizations", "tab", "cat1", "cat1_varname", "cat1_group", "cat2", "cat2_varname", "cat2_group")) + + # Return data.table ---- + return(result) +} diff --git a/R/chi_chars_injury.R b/R/chi_chars_injury.R new file mode 100644 index 0000000..a136d0d --- /dev/null +++ b/R/chi_chars_injury.R @@ -0,0 +1,352 @@ +# chi_chars_injury ---- +#' Generate CHI CHARS counts for injury indicators +#' +#' @description +#' Generate hospitalization counts from Comprehensive Hospital Abstract Reporting +#' System (CHARS) data for injury indicators including falls, poisoning, +#' self-harm, and other injury mechanisms for CHI. This function processes +#' instructions for specific indicators and summarizes CHARS data accordingly. +#' +#' @param ph.indicator A character string of length 1. The indicator key to process, +#' which must exist in the chars.defs data table. +#' @param ph.data A data.table containing the CHARS data to be processed. +#' @param myinstructions A data.table containing processing instructions for each indicator. +#' Default is the output from \code{\link{chi_generate_tro_shell}}. +#' @param chars.defs A data.table containing definitions for each indicator. It +#' should have the following columns: `indicator_name`, `indicator_key`, `intent`, +#' `mechanism`, `age_start`, and `age_end`. +#' @param def A character string indicating which injury definition to use. +#' Default is \code{def = 'narrow'}. +#' +#' @return A data.table containing the summarized CHARS injury hospitalization data for the +#' specified indicator, stratified by the requested demographic variables. +#' +#' @details +#' This function processes instructions for a specific indicator sequentially. +#' +#' Note that injury data is only available for 2012 and later years. Unlike chi_chars_ccs(), +#' this function doesn't need to handle the ICD-9 to ICD-10 transition explicitly as the +#' mechanism and intent columns have been standardized across coding systems. +#' +#' @examples +#' \dontrun{ +#' # Example of how to run with future_lapply for memory efficiency +#' library(future) +#' library(future.apply) +#' +#' plan(multisession, workers = future::availableCores() - 1) +#' +#' countsINJURY <- rbindlist(future_lapply(VectorOfIndicators, function(indicator) { +#' chi_chars_injury( +#' ph.indicator = indicator, +#' ph.data = chars, +#' myinstructions = myinstructions, +#' chars.defs = chars.defs, +#' def = 'narrow') +#' }, future.seed = TRUE)) +#' +#' plan(sequential) +#' +#' } +#' +#' @seealso +#' \code{\link[rads]{get_data_chars}}, which provides creates ph.data +#' +#' \code{\link[rads]{chars_injury_matrix_count}}, which is the engine used by this function +#' +#' \code{\link{chi_generate_tro_shell}}, which creates myinstructions +#' +#' @import data.table +#' @import rads +#' @importFrom utils capture.output +#' @export +chi_chars_injury <- function(ph.indicator = NA, + ph.data = NULL, + myinstructions = NULL, + chars.defs = NULL, + def = 'narrow') { + # Input validation ----- + if (is.na(ph.indicator)) { + stop("\n\U1F6D1 ph.indicator must be provided.") + } + + for (arg_name in c("ph.data", "myinstructions", "chars.defs")) { + arg_value <- get(arg_name) + if (is.null(arg_value)) { + stop(paste0("\n\U1F6D1 ", arg_name, " must be specified.")) + } + if (is.data.frame(arg_value)) { + setDT(arg_value) + } else if (!is.data.frame(arg_value)) { + stop(paste0("\n\U1F6D1 ", arg_name, " must be a data.table or data.frame.")) + } + } + + # Ensure indicator exists + instructions <- myinstructions[indicator_key == ph.indicator] + + if (nrow(instructions) == 0) { + stop(paste0("\n\U1F6D1 ph.indicator '", ph.indicator, "' not found in myinstructions.")) + } + + if (!ph.indicator %in% chars.defs$indicator_key) { + stop(paste0("\n\U1F6D1 ph.indicator '", ph.indicator, "' not found in chars.defs.")) + } + + + # Ensure cat1_varname and cat2_varname exist in ph.data + unique_cat_vars <- unique(c( + instructions$cat1_varname[!is.na(instructions$cat1_varname)], + instructions$cat2_varname[!is.na(instructions$cat2_varname)] + )) + + missing_cols <- unique_cat_vars[!unique_cat_vars %in% names(ph.data)] + if (length(missing_cols) > 0) { + stop(paste0("\n\U1F47F The following columns referenced in instructions don't exist in ph.data: ", + paste(missing_cols, collapse = ", "))) + } + + # Extract indicator parameters + indicator_def <- chars.defs[indicator_key == ph.indicator] + age_start <- indicator_def$age_start + age_end <- indicator_def$age_end + if (!is.numeric(age_start) || !is.numeric(age_end)) { + stop("\n\U1F6D1 age_start and age_end must be numeric.") + } + + # Verify that this is an injury indicator (those should use chi_chars_injury) + has_mechanism <- "mechanism" %in% names(indicator_def) && !is.na(indicator_def$mechanism) + has_intent <- "intent" %in% names(indicator_def) && !is.na(indicator_def$intent) + + if (!has_mechanism && !has_intent) { + stop(paste0("\n\U1F6D1 Indicator '", ph.indicator, "' does not appear to be an injury indicator. ", + "You must use chi_chars_ccs() instead.")) + } + + # Check for required parameters + if (!all(c("intent", "mechanism") %in% names(indicator_def))) { + stop("\n\U1F6D1 chars.defs must contain columns 'intent' and 'mechanism' for injury indicators.") + } + + # Validate injury-specific parameters + if (!is.character(def) || length(def) != 1 || !def %in% c('narrow', 'broad')) { + stop("\n\U1F6D1 'def' must be either 'narrow' or 'broad'.") + } + + # Process sequentially ---- + message('\U023F3 Processing calculations... this may take a while') + + # Process each row + results_list <- lapply( + X = seq_len(nrow(instructions)), + FUN = function(i) { + # Progress meter + current_row <- instructions[i] + row_info <- paste( + ph.indicator, + current_row$tab, + current_row$cat1, + current_row$cat1_varname, + current_row$cat2, + current_row$cat2_varname, + current_row$end, + current_row$start, + sep = "|" + ) + message(paste0("myinstructions row ", i, "/", nrow(instructions), ": \n - ", row_info)) + + # Get current instruction parameters + current_start <- as.integer(current_row$start) + current_end <- as.integer(current_row$end) + + # For injury data, check if the year range is valid (only available from 2012+) + if (current_start < 2012) { + message(paste0(" - \u26A0\ufe0f Warning: Injury data only available from 2012. Adjusting start year from ", + current_start, " to 2012.")) + current_start <- 2012 + } + + # If start year is now after end year, skip this row + if (current_start > current_end) { + message(paste0(" - \u26A0\ufe0f Skipping: After year adjustment, start year ", current_start, + " is after end year ", current_end)) + return(NULL) + } + + # Determine geographical filter based on tab + is_wastate <- current_row$tab == "_wastate" + + # Filter data for appropriate years, geography, and age + if (is_wastate) { + data_subset <- data.table::copy(ph.data[wastate == 'Washington State' & + chi_year >= current_start & + chi_year <= current_end & + chi_age >= age_start & + chi_age <= age_end]) + } else { + data_subset <- data.table::copy(ph.data[chi_geo_kc == 'King County' & + chi_year >= current_start & + chi_year <= current_end & + chi_age >= age_start & + chi_age <= age_end]) + } + + # Return NULL if no data after filtering + if (nrow(data_subset) == 0) { + return(NULL) + } + + # Add year information + data_subset[, year_range := paste0(current_start, "-", current_end)] + + # Generate group_by argument + if (is.na(current_row$cat2_varname)) { + group_by_arg <- current_row$cat1_varname + } else { + group_by_arg <- c(current_row$cat1_varname, current_row$cat2_varname) + } + group_by_arg <- c('chi_age', unique(group_by_arg)) + + # Process data with rads::chars_injury_matrix_count + result <- rads::chars_injury_matrix_count( + ph.data = data_subset, + intent = indicator_def$intent, + mechanism = indicator_def$mechanism, + group_by = group_by_arg, + def = def, + kingco = FALSE # Already filtered data above + ) + + # Return NULL if no results + if (nrow(result) == 0) { + return(NULL) + } + + # For ICD10cm, poisoning is split into drug & non-drug, collapse them if needed + if (!is.null(indicator_def$mechanism) && !is.na(indicator_def$mechanism) && + indicator_def$mechanism == 'poisoning') { + result <- result[, list(mechanism = 'poisoning', hospitalizations = sum(hospitalizations)), + by = setdiff(names(result), c('mechanism', 'hospitalizations'))] + } + + # Remove mechanism and intent columns + if ("mechanism" %in% names(result)) { + result[, mechanism := NULL] + } + if ("intent" %in% names(result)) { + result[, intent := NULL] + } + + # Format the results + # Add metadata columns + result[, year := paste0(current_start, "-", current_end)] + result[, tab := current_row$tab] + result[, cat1 := current_row$cat1] + result[, cat2 := current_row$cat2] + result[, cat1_varname := gsub("_hispanic", "", current_row$cat1_varname)] + result[, cat2_varname := gsub("_hispanic", "", current_row$cat2_varname)] + + # Rename category columns + data.table::setnames(result, current_row$cat1_varname, "cat1_group") + + if (!is.na(current_row$cat2_varname)) { + data.table::setnames(result, current_row$cat2_varname, "cat2_group", skip_absent = TRUE) + } + + # Filter and format + result <- result[!is.na(cat1_group)] # Drop when group not identified + result[cat2_varname == 'chi_geo_kc', cat2_group := 'King County'] + result <- result[is.na(cat2_varname) | (!is.na(cat2_varname) & !is.na(cat2_group))] + result[, ph.indicator := ph.indicator] + + return(result) + } + ) + + # Filter out NULL results + results_list <- results_list[!sapply(results_list, is.null)] + + # Combine results if any exist + if (length(results_list) > 0) { + result <- data.table::rbindlist(results_list, fill = TRUE, use.names = TRUE) + } else { + warning("\u26A0\ufe0f No results found for indicator: ", ph.indicator) + result <- data.table::data.table() + } + + # Ensure we have complete data (for all ages within the specified range) ---- + # This creates a template with all possible combinations to avoid gaps when needing to age standardize the results + template <- unique(result[, list(tab, + cat1, cat1_varname, cat1_group, + cat2, cat2_varname, cat2_group, + indicator_key = ph.indicator, year)]) + + template <- template[, list(chi_age = seq(age_start, age_end)), by = names(template)] + + result <- merge(template, + result[, chi_age := as.numeric(chi_age)], + by = intersect(names(template), names(result)), + all = TRUE) + + result[is.na(hospitalizations), hospitalizations := 0] + + result <- result[, list(indicator_key, year, chi_age = as.integer(chi_age), hospitalizations, tab, cat1, cat1_varname, cat1_group, cat2, cat2_varname, cat2_group)] + + setorder(result, tab, year, cat1, cat1_varname, cat1_group, cat2, cat2_varname, cat2_group, chi_age, hospitalizations) + + # Identify instructions that caused all data to be filtered out ---- + # this helps diagnose data quality issues, either in myinstructions or in ph.data + if (nrow(result) > 0) { + # Get combinations actually used in results + result_combos <- unique(result[, list(indicator_key, tab, cat1, cat1_varname, cat2, cat2_varname, + start = as.numeric(substr(year, 1, 4)), + end = as.numeric(substr(year, nchar(year) - 3, nchar(year))) )]) + + # Handle the special case for race3_hispanic (format it like the instructions) + result_combos[cat1_varname == 'race3' & cat1 == 'Ethnicity', cat1_varname := 'race3_hispanic'] + result_combos[cat2_varname == 'race3' & cat2 == 'Ethnicity', cat2_varname := 'race3_hispanic'] + + # Use fsetdiff to find instructions that didn't produce results + unused_instructions <- fsetdiff( + setcolorder(instructions[, start := pmax(2012, start)], names(result_combos)), + result_combos + ) + + # Sort for warning display + setorder(unused_instructions, indicator_key, tab, cat1, cat1_varname, cat2, cat2_varname, start, end) + } else { + unused_instructions <- copy(instructions) + } + + # Generate warnings and attach to result + if (nrow(unused_instructions) > 0) { + # Capture the formatted output directly + empty_table <- paste( + utils::capture.output( + print(unused_instructions, + row.names = FALSE, + class = FALSE, + printClassOfColumns = FALSE) + ), + collapse = "\n" + ) + + warning(paste0( + "\n\u26A0\ufe0f No data found for the following ", nrow(unused_instructions), + " instruction(s) for indicator '", ph.indicator, "':\n", + empty_table + )) + } + + # Drop irrelevant ages ---- + result <- chi_keep_proper_ages(result) + + # Tidy year (e.g., '2025-2025' becomes just '2025') ---- + result[, year := gsub("^(\\d{4})-(\\1)$", "\\1", year)] # \\1 is a back reference to what was found before hyphen + + # Order columns ---- + setcolorder(result, c("indicator_key", "year", "chi_age", "hospitalizations", "tab", "cat1", "cat1_varname", "cat1_group", "cat2", "cat2_varname", "cat2_group")) + + # Return data.table ---- + return(result) +} diff --git a/R/chi_keep_proper_ages.R b/R/chi_keep_proper_ages.R new file mode 100644 index 0000000..3e39880 --- /dev/null +++ b/R/chi_keep_proper_ages.R @@ -0,0 +1,151 @@ +# chi_keep_proper_ages ---- +#' Keep data for appropriate ages only +#' +#' @description +#' Internal function to filter data based on age groups in cat1_group and cat2_group, +#' preserving only the rows where chi_age falls within the implied age ranges of +#' any age groups in the data. +#' +#' @param ph.data A data.table containing the data to be filtered. Should have the +#' following columns: cat1, cat1_varname, cat1_group, cat2, cat2_varname, cat2_group, +#' indicator_key, year, chi_age, tab -- all of the appropriate classes. +#' +#' @return A filtered data.table with only the rows where chi_age is compatible +#' with the age ranges implied by age groups. +#' +#' @details +#' When \code{cat# == 'Age'}, the function parses \code{cat#_group} labels (e.g., +#' "<18", "18-24", "75+") to determine minimum and maximum ages, then filters the +#' data accordingly. It handles age groups in both cat1 and cat2 categories. +#' +#' @importFrom data.table is.data.table as.data.table rbindlist setnames data.table +#' +#' @examples +#' library(data.table) +#' +#' # Make test data with all combinations of categories and age +#' test_data <- CJ( +#' cat1 = "Gender", +#' cat1_varname = "chi_sex", +#' cat1_group = c("Female", "Male"), +#' cat2 = "Age", +#' cat2_varname = "yage4", +#' cat2_group = c("0-4", "5-9", "10-14"), +#' chi_age = 1:20, +#' unique = TRUE +#' ) +#' +#' # Add extra required columns +#' test_data[, `:=`( +#' indicator_key = "indicator1", +#' year = "2023", +#' tab = "mytab", +#' count = 1 +#' )] +#' +#' # Example usage +#' filtered_data <- chi_keep_proper_ages(test_data) +#' filtered_data[cat2_group == "10-14", unique(chi_age)] +#' +#' @export +chi_keep_proper_ages <- function(ph.data = NULL) { + # Validate inputs + if (is.null(ph.data)) { + stop("\n\U1F6D1 ph.data must be provided") + } + + if (!data.table::is.data.table(ph.data)) { + ph.data <- data.table::as.data.table(ph.data) + } + + # Check for required columns + required_columns <- c("cat2_group", "cat1_group", "indicator_key", "year", + "chi_age", "tab", "cat1", "cat1_varname", + "cat2", "cat2_varname") + + missing_columns <- required_columns[!required_columns %in% names(ph.data)] + if (length(missing_columns) > 0) { + stop(paste0("\n\U1F6D1 ph.data is missing required columns: ", + paste(missing_columns, collapse = ", "))) + } + + # Check expected column classes + expected_classes <- list( + cat1 = c("character", "factor"), + cat1_varname = c("character", "factor"), + cat1_group = c("character", "factor"), + cat2 = c("character", "factor"), + cat2_varname = c("character", "factor"), + cat2_group = c("character", "factor"), + indicator_key = c("character", "factor"), + year = "character", + chi_age = c("numeric", "integer"), + tab = c("character", "factor") + ) + + for (col_name in names(expected_classes)) { + expected <- expected_classes[[col_name]] + valid_class <- FALSE + + # Check if column inherits from any of the expected classes + for (expected_class in expected) { + if (inherits(ph.data[[col_name]], expected_class)) { + valid_class <- TRUE + break # once a valid class is established, no need to keep checking for that col_name + } + } + + if (isFALSE(valid_class)) { + expected_str <- paste(expected, collapse = " or ") + actual_class <- class(ph.data[[col_name]])[1] # Get primary class + stop(paste0("\n\U1F6D1 Column '", col_name, "' should be of class '", + expected_str, "', but is '", actual_class, "'")) + } + } + + # Make function used in lapply below to get min and max age from age group + parse_age_group <- function(group) { + + group <- gsub(" ", "", group) + if (grepl("^<[0-9]+$", group)) { + return(list(c(NA, as.numeric(sub("<", "", group)) - 1))) + } else if (grepl("^[0-9]+-[0-9]+$", group)) { + return(list(as.numeric(unlist(strsplit(group, "-"))))) + } else if (grepl("^[0-9]+\\+$", group)) { + return(list(c(as.numeric(sub("\\+", "", group)), NA))) + } else { + return(list(c(NA, NA))) + } + } + + + # Create reference table of age groups and corresponding min_age and max_age + all_groups <- unique(c(ph.data$cat1_group[ph.data$cat1 == "Age"], + ph.data$cat2_group[ph.data$cat2 == "Age"])) + + if(length(all_groups) == 0){return(ph.data)} # no need to continue if cat1 or cat2 are not 'Age' + + all_bounds <- rbindlist(lapply(all_groups, function(mygroup) { + bounds <- parse_age_group(mygroup)[[1]] + data.table(age_group = mygroup, min_age = bounds[1], max_age = bounds[2]) + })) + + # Merge cat1 bounds + setnames(all_bounds, c("age_group", "min_age", "max_age"), c("cat1_group", "min1", "max1")) + ph.data <- merge(ph.data, all_bounds, by = "cat1_group", all.x = TRUE, sort = FALSE) + + # Merge cat2 bounds + setnames(all_bounds, c("cat1_group", "min1", "max1"), c("cat2_group", "min2", "max2")) + ph.data <- merge(ph.data, all_bounds, by = "cat2_group", all.x = TRUE, sort = FALSE) + + # Create filter to identify rows to drop + ph.data[, keepme := TRUE] + + ph.data[cat1 == "Age", keepme := keepme & + (is.na(min1) | chi_age >= min1) & (is.na(max1) | chi_age <= max1)] + ph.data[cat2 == "Age", keepme := keepme & + (is.na(min2) | chi_age >= min2) & (is.na(max2) | chi_age <= max2)] + + # Clean up and return + return(ph.data[keepme == TRUE][, c("keepme", "min1", "max1", "min2", "max2") := NULL]) +} diff --git a/R/globals.R b/R/globals.R index 72a30e3..e51db41 100644 --- a/R/globals.R +++ b/R/globals.R @@ -36,8 +36,13 @@ utils::globalVariables(c( "group", "group_by1", "group_by2", + "hospitalizations", + "hospitalizations_icd9", + "hospitalizations_icd10", "hra20_name", + "icd_version", "indicator_key", + "intent", "keepme", "latest_year", "latest_year_count", @@ -49,11 +54,17 @@ utils::globalVariables(c( "latest_yearx", "level", "lower_bound", + "mechanism", + "min1", + "min2", "min_age", + "max1", + "max2", "max_age", "mykey", "notable", "numerator", + "original_year_range", "overall", "pattern", "pop", @@ -91,6 +102,8 @@ utils::globalVariables(c( "valid_years", "varname", "vebrose", + "wastate", + "year_range", "year.span", "year", "_kingcounty.x", diff --git a/man/chi_chars_ccs.Rd b/man/chi_chars_ccs.Rd new file mode 100644 index 0000000..21095f7 --- /dev/null +++ b/man/chi_chars_ccs.Rd @@ -0,0 +1,72 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/chi_chars_ccs.R +\name{chi_chars_ccs} +\alias{chi_chars_ccs} +\title{Generate CHI CHARS counts by CCS category} +\usage{ +chi_chars_ccs( + ph.indicator = NA, + ph.data = NULL, + myinstructions = NULL, + chars.defs = NULL +) +} +\arguments{ +\item{ph.indicator}{A character string of length 1. The indicator key to process, +which must exist in the chars.defs data table.} + +\item{ph.data}{A data.table containing the CHARS data to be processed.} + +\item{myinstructions}{A data.table containing processing instructions for each indicator. +Default is the output from chi_generate_tro_shell().} + +\item{chars.defs}{A data.table containing definitions for each indicator. It +should have the following columns: `indicator_name`, `indicator_key`, `intent`, +`mechanism`, `superlevel`, `broad`, `midlevel`, `detailed`, `age_start`, and +`age_end`.} +} +\value{ +A data.table containing the summarized CHARS hospitalization data for the + specified indicator, stratified by the requested demographic variables. +} +\description{ +Generate hospitalization counts from Comprehensive Hospital Abstract Reporting +System (CHARS) data based on Clinical Classification Software (CCS) categories +for CHI. This function processes instructions for specific indicators and +summarizes CHARS data accordingly. It automatically handles the ICD-9 to ICD-10 +transition that occurred in 2016. +} +\details{ +The function processes multiple instructions for the same indicator sequentially. + +The function automatically detects whether data spans the ICD-9 to ICD-10 transition in 2016 +and processes each part with the appropriate ICD version. Results are then combined seamlessly. +} +\examples{ +\dontrun{ +# Example of how to run with future_lapply for memory efficiency +library(future) +library(future.apply) + +plan(multisession, workers = future::availableCores() - 1) + +countsCCS <- rbindlist(future_lapply(VectorOfIndicators, function(indicator) { + chi_chars_ccs( + ph.indicator = indicator, + ph.data = chars, + myinstructions = myinstructions, + chars.defs = chars.defs) +}, future.seed = TRUE)) + +plan(sequential) + +} + +} +\seealso{ +\code{\link[rads]{get_data_chars}}, which provides creates ph.data + +\code{\link[rads]{chars_icd_ccs_count}}, which is the engine used by this function + +\code{\link{chi_generate_tro_shell}}, which creates myinstructions +} diff --git a/man/chi_chars_injury.Rd b/man/chi_chars_injury.Rd new file mode 100644 index 0000000..a0a2fed --- /dev/null +++ b/man/chi_chars_injury.Rd @@ -0,0 +1,76 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/chi_chars_injury.R +\name{chi_chars_injury} +\alias{chi_chars_injury} +\title{Generate CHI CHARS counts for injury indicators} +\usage{ +chi_chars_injury( + ph.indicator = NA, + ph.data = NULL, + myinstructions = NULL, + chars.defs = NULL, + def = "narrow" +) +} +\arguments{ +\item{ph.indicator}{A character string of length 1. The indicator key to process, +which must exist in the chars.defs data table.} + +\item{ph.data}{A data.table containing the CHARS data to be processed.} + +\item{myinstructions}{A data.table containing processing instructions for each indicator. +Default is the output from \code{\link{chi_generate_tro_shell}}.} + +\item{chars.defs}{A data.table containing definitions for each indicator. It +should have the following columns: `indicator_name`, `indicator_key`, `intent`, +`mechanism`, `age_start`, and `age_end`.} + +\item{def}{A character string indicating which injury definition to use. +Default is \code{def = 'narrow'}.} +} +\value{ +A data.table containing the summarized CHARS injury hospitalization data for the + specified indicator, stratified by the requested demographic variables. +} +\description{ +Generate hospitalization counts from Comprehensive Hospital Abstract Reporting +System (CHARS) data for injury indicators including falls, poisoning, +self-harm, and other injury mechanisms for CHI. This function processes +instructions for specific indicators and summarizes CHARS data accordingly. +} +\details{ +This function processes instructions for a specific indicator sequentially. + +Note that injury data is only available for 2012 and later years. Unlike chi_chars_ccs(), +this function doesn't need to handle the ICD-9 to ICD-10 transition explicitly as the +mechanism and intent columns have been standardized across coding systems. +} +\examples{ +\dontrun{ +# Example of how to run with future_lapply for memory efficiency +library(future) +library(future.apply) + +plan(multisession, workers = future::availableCores() - 1) + +countsINJURY <- rbindlist(future_lapply(VectorOfIndicators, function(indicator) { + chi_chars_injury( + ph.indicator = indicator, + ph.data = chars, + myinstructions = myinstructions, + chars.defs = chars.defs, + def = 'narrow') +}, future.seed = TRUE)) + +plan(sequential) + +} + +} +\seealso{ +\code{\link[rads]{get_data_chars}}, which provides creates ph.data + +\code{\link[rads]{chars_injury_matrix_count}}, which is the engine used by this function + +\code{\link{chi_generate_tro_shell}}, which creates myinstructions +} diff --git a/man/chi_get_cols.Rd b/man/chi_get_cols.Rd index 8580d92..c1c75d8 100644 --- a/man/chi_get_cols.Rd +++ b/man/chi_get_cols.Rd @@ -4,10 +4,13 @@ \alias{chi_get_cols} \title{Get CHI variable column names} \usage{ -chi_get_cols() +chi_get_cols(metadata = FALSE) +} +\arguments{ +\item{metadata}{returns metadata column names instead of primary data} } \value{ -A character vector of column names +A character vector of column names for the chi data (Default) or metadata } \description{ Returns a character vector of column names defined in the CHI YAML reference file. diff --git a/man/chi_keep_proper_ages.Rd b/man/chi_keep_proper_ages.Rd new file mode 100644 index 0000000..49d44f3 --- /dev/null +++ b/man/chi_keep_proper_ages.Rd @@ -0,0 +1,55 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/chi_keep_proper_ages.R +\name{chi_keep_proper_ages} +\alias{chi_keep_proper_ages} +\title{Keep data for appropriate ages only} +\usage{ +chi_keep_proper_ages(ph.data = NULL) +} +\arguments{ +\item{ph.data}{A data.table containing the data to be filtered. Should have the +following columns: cat1, cat1_varname, cat1_group, cat2, cat2_varname, cat2_group, +indicator_key, year, chi_age, tab -- all of the appropriate classes.} +} +\value{ +A filtered data.table with only the rows where chi_age is compatible +with the age ranges implied by age groups. +} +\description{ +Internal function to filter data based on age groups in cat1_group and cat2_group, +preserving only the rows where chi_age falls within the implied age ranges of +any age groups in the data. +} +\details{ +When \code{cat# == 'Age'}, the function parses \code{cat#_group} labels (e.g., +"<18", "18-24", "75+") to determine minimum and maximum ages, then filters the +data accordingly. It handles age groups in both cat1 and cat2 categories. +} +\examples{ +library(data.table) + +# Make test data with all combinations of categories and age +test_data <- CJ( + cat1 = "Gender", + cat1_varname = "chi_sex", + cat1_group = c("Female", "Male"), + cat2 = "Age", + cat2_varname = "yage4", + cat2_group = c("0-4", "5-9", "10-14"), + chi_age = 1:20, + unique = TRUE +) + +# Add extra required columns +test_data[, `:=`( + indicator_key = "indicator1", + year = "2023", + tab = "mytab", + count = 1 +)] + +# Example usage +filtered_data <- chi_keep_proper_ages(test_data) +filtered_data[cat2_group == "10-14", unique(chi_age)] + +} diff --git a/tests/testthat/test-chi_chars_ccs.R b/tests/testthat/test-chi_chars_ccs.R new file mode 100644 index 0000000..3ea6aee --- /dev/null +++ b/tests/testthat/test-chi_chars_ccs.R @@ -0,0 +1,387 @@ +library(testthat) +library(data.table) +library(rads) + +# Create mock_chars data ---- + set.seed(98104) + + # ICD codes for asthma and non-asthma conditions + icd9_asthma <- c("49300", "49301", "49302", "49310", "49311", "49312", "49320", + "49321", "49322", "49380", "49381", "49382", "49390", "49391", "49392") + + icd10_asthma <- c("J45", "J452", "J4520", "J4521", "J4522", "J453", "J4530", "J4531", + "J4532", "J454", "J4540", "J4541", "J4542", "J455", "J4550", "J4551", + "J4552", "J459", "J4590", "J45901", "J45902", "J45909", "J4599", + "J45990", "J45991", "J45998") + + icd9_non_asthma <- c("E9855", "36190", "V1508", "76425", "36642", "66061", "V7040", + "V7710", "V4530", "V0660", "76830", "85406") + + icd10_non_asthma <- c("S80241D", "S52134B", "S62256S", "S60922S", "V629", "K9420", + "S59241G", "S52042R", "J8417", "S83136D", "M06851", "T363X3D") + + # Create 1000 sample CHARS records + n_samples <- 1000 + half_samples <- n_samples / 2 + + # Create ICD-9 era data (pre-2016) + icd9_data <- data.table( + seq_no = 1:half_samples, # Unique identifier + chi_year = sample(2013:2015, half_samples, replace = TRUE), + chi_age = sample(0:99, half_samples, replace = TRUE), + chi_geo_kc = rep("King County", half_samples), + wastate = rep("Washington State", half_samples), + race3_hispanic = sample(c("Hispanic", "Non-Hispanic"), half_samples, replace = TRUE), + chi_geo_region = sample(c("Seattle", "South", "East", "North"), half_samples, replace = TRUE), + chi_sex = sample(c("Female", "Male"), half_samples, replace = TRUE) + ) + + # Ensure at least 100 asthma cases + asthma_cases <- 100 + non_asthma_cases <- half_samples - asthma_cases + + icd9_data[1:asthma_cases, diag1 := sample(icd9_asthma, asthma_cases, replace = TRUE)] + icd9_data[(asthma_cases+1):half_samples, diag1 := sample(icd9_non_asthma, non_asthma_cases, replace = TRUE)] + + # Create ICD-10 era data (2016 and after) + icd10_data <- data.table( + seq_no = (half_samples+1):n_samples, # Continue sequence numbers + chi_year = sample(2016:2022, half_samples, replace = TRUE), + chi_age = sample(0:99, half_samples, replace = TRUE), + chi_geo_kc = rep("King County", half_samples), + wastate = rep("Washington State", half_samples), + race3_hispanic = sample(c("Hispanic", "Non-Hispanic"), half_samples, replace = TRUE), + chi_geo_region = sample(c("Seattle", "South", "East", "North"), half_samples, replace = TRUE), + chi_sex = sample(c("Female", "Male"), half_samples, replace = TRUE) + ) + + # Ensure at least 100 asthma cases + icd10_data[1:asthma_cases, diag1 := sample(icd10_asthma, asthma_cases, replace = TRUE)] + icd10_data[(asthma_cases+1):half_samples, diag1 := sample(icd10_non_asthma, non_asthma_cases, replace = TRUE)] + + # Combine the data + mock_chars <- rbindlist(list(icd9_data, icd10_data)) + +# Create mock_instructions ---- + mock_instructions <- data.table( + indicator_key = rep(c("hos1803000_v1", "hos1803000_v2"), 3), + tab = rep(c("trends", "_wastate"), each = 3), + cat1 = rep("Ethnicity", 6), + cat1_varname = rep("race3_hispanic", 6), + cat2 = c(NA_character_, NA_character_, "Sex", "Sex", NA_character_, NA_character_), + cat2_varname = c(NA_character_, NA_character_, "chi_sex", "chi_sex", NA_character_, NA_character_), + end = c(2017, 2022, 2017, 2022, 2015, 2015), + start = c(2013, 2018, 2014, 2016, 2013, 2013) + ) + +# Create mock_chars_def ---- + mock_chars_def <- data.table( + indicator_name = c("Asthma hospitalizations (all ages)", "Asthma hospitalizations (children)"), + indicator_key = c("hos1803000_v1", "hos1803000_v2"), + intent = c(NA_character_, NA_character_), + mechanism = c(NA_character_, NA_character_), + superlevel = c(NA_character_, NA_character_), + broad = c("RESP", NA_character_), + midlevel = c(NA_character_, "Asthma"), + detailed = c(NA_character_, NA_character_), + age_start = c(0, 0), + age_end = c(120, 17) + ) + +# Create function for mock ccs_table ---- + # Function to create mock CCS reference tables + create_mock_ccs_table <- function(icdcm_version) { + if (icdcm_version == 9) { + icd_codes <- c("49300", "49301", "49302", "49310", "49311", "49312", "49320", + "49321", "49322", "49380", "49381", "49382", "49390", "49391", "49392", + "E9855", "36190", "V1508", "76425", "36642", "66061", "V7040", + "V7710", "V4530", "V0660", "76830", "85406") + + # Ensure all asthma codes are properly classified + asthma_codes <- c("49300", "49301", "49302", "49310", "49311", "49312", "49320", + "49321", "49322", "49380", "49381", "49382", "49390", "49391", "49392") + } else { # ICD-10 + icd_codes <- c("J45", "J452", "J4520", "J4521", "J4522", "J453", "J4530", "J4531", + "J4532", "J454", "J4540", "J4541", "J4542", "J455", "J4550", "J4551", + "J4552", "J459", "J4590", "J45901", "J45902", "J45909", "J4599", + "J45990", "J45991", "J45998", + "S80241D", "S52134B", "S62256S", "S60922S", "V629", "K9420", + "S59241G", "S52042R", "J8417", "S83136D", "M06851", "T363X3D") + + # Ensure all asthma codes are properly classified + asthma_codes <- c("J45", "J452", "J4520", "J4521", "J4522", "J453", "J4530", "J4531", + "J4532", "J454", "J4540", "J4541", "J4542", "J455", "J4550", "J4551", + "J4552", "J459", "J4590", "J45901", "J45902", "J45909", "J4599", + "J45990", "J45991", "J45998") + } + + # Create the data table + ccs_table <- data.table( + icdcm_code = icd_codes, + icdcm = paste0("Description for ", icd_codes), + superlevel = NA_character_, + broad = NA_character_, + midlevel = NA_character_, + detailed = NA_character_, + icdcm_version = icdcm_version + ) + + # Assign RESP to all asthma codes + ccs_table[icdcm_code %in% asthma_codes, broad := "RESP"] + + # Assign Asthma to all asthma codes + ccs_table[icdcm_code %in% asthma_codes, midlevel := "Asthma"] + + return(ccs_table) + } + +# Create vector of expected column order ---- + expectedCols <- c('indicator_key', 'year', 'chi_age', 'hospitalizations', 'tab', 'cat1', 'cat1_varname', 'cat1_group', 'cat2', 'cat2_varname', 'cat2_group') + +# Test validation ---- + test_that("chi_chars_ccs validates inputs correctly", { + # Test missing ph.indicator + expect_error(chi_chars_ccs(ph.indicator = NA, + ph.data = mock_chars, + myinstructions = mock_instructions, + chars.defs = mock_chars_def), + "ph.indicator must be provided") + + # Test missing ph.data + expect_error(chi_chars_ccs(ph.indicator = "hos1803000_v1", + ph.data = NULL, + myinstructions = mock_instructions, + chars.defs = mock_chars_def), + "ph.data must be specified") + + # Test indicator not found in instructions + expect_error(chi_chars_ccs(ph.indicator = "not_an_indicator", + ph.data = mock_chars, + myinstructions = mock_instructions, + chars.defs = mock_chars_def), + "not found in myinstructions") + + # Test invalid column in instructions + bad_instructions <- copy(mock_instructions) + bad_instructions[1, cat1_varname := "not_a_column"] + + expect_error(chi_chars_ccs(ph.indicator = "hos1803000_v1", + ph.data = mock_chars, + myinstructions = bad_instructions, + chars.defs = mock_chars_def), + "don't exist in ph.data") + }) + +# Test function handles ICD-9 data correctly ---- + test_that("chi_chars_ccs processes ICD-9 data correctly", { + # Filter instructions to only include pre-2016 data + icd9_instructions <- mock_instructions[end < 2016] + + # Run function + result <- chi_chars_ccs( + ph.indicator = "hos1803000_v1", + ph.data = mock_chars, + myinstructions = icd9_instructions, + chars.defs = mock_chars_def + ) + + # Check if result has expected structure + expect_true(is.data.table(result)) + expect_equal(names(result), expectedCols) + + # Check that all data is from ICD-9 era (pre-2016) + expect_true(all(grepl("201[3-5]", result$year))) + + # Check hospitalization count is as expected + expect_equal(sum(result$hospitalizations), 100) + + # Check age filtering worked correctly + expect_gt(max(as.numeric(result$chi_age)), 50) # is for all ages, so should definitely have some > 50 yrs old + + # Check correct indicator is used + expect_true(all(result$indicator_key == "hos1803000_v1")) + + # Check if have the same number of observations for each age + expect_equal(uniqueN(result[, .N, chi_age]$N), 1) + }) + +# Test function handles ICD-10 data correctly ---- + test_that("chi_chars_ccs processes ICD-10 data correctly", { + # Filter instructions to only include post-2016 data + icd10_instructions <- mock_instructions[start >= 2016] + + # Run function + result <- chi_chars_ccs( + ph.indicator = "hos1803000_v2", + ph.data = mock_chars, + myinstructions = icd10_instructions, + chars.defs = mock_chars_def + ) + + # Check if result has expected structure + expect_true(is.data.table(result)) + expect_equal(names(result), expectedCols) + + # Check that all data is from ICD-10 era (2016+) + expect_true(all(grepl("201[6-9]|202[0-2]", result$year))) + + # Check hospitalization count is as expected + expect_lt(sum(result$hospitalizations), 100) # All ICD10 should be 100, so limiting to up to 17 should be less + + # Check age filtering worked correctly + expect_equal(max(as.numeric(result$chi_age)), 17) # for children, so max is 17 yrs old + + # Check correct indicator is used + expect_true(all(result$indicator_key == "hos1803000_v2")) + + # Check if have the same number of observations for each age + expect_equal(uniqueN(result[, .N, chi_age]$N), 1) + }) + +# Test function handles mixed ICD-9/ICD-10 data correctly ---- + test_that("chi_chars_ccs processes mixed ICD-9/ICD-10 data correctly", { + # Filter data to include instructions that span the 2016 transition + mixed_instructions <- mock_instructions[start < 2016 & end >= 2016] + + # Run function + result <- chi_chars_ccs( + ph.indicator = "hos1803000_v1", + ph.data = mock_chars, + myinstructions = mixed_instructions, + chars.defs = mock_chars_def + ) + + # Check if result has expected structure + expect_true(is.data.table(result)) + expect_equal(names(result), expectedCols) + + # Verify that year ranges are formatted correctly + expect_true(all(grepl("^\\d{4}-\\d{4}$", result$year))) + + # Verify that the range spans the transition + expect_true(any(grepl("201[3-5]-201[6-9]", result$year))) + + # Verify we have all cat1_group values + expect_equal(unique(c("Hispanic", "Non-Hispanic")), unique(result$cat1_group) ) + + # Verify we have the proper year spans + expect_equal(sort(unique(result$year)), + sort(unique(mixed_instructions[, year := paste0(start, '-', end)]$year))) + + # Check if have the same number of observations for each age + expect_equal(uniqueN(result[, .N, chi_age]$N), 1) + }) + +# Test that instructions with different indicator variables work ---- + test_that("chi_chars_ccs handles different indicator variables correctly", { + # Instructions with two category variables + cat2_instructions <- mock_instructions[!is.na(cat2_varname)] + + # Run function + result <- rbindlist(lapply(c("hos1803000_v1", "hos1803000_v2"), function(indicator) { + chi_chars_ccs( + ph.indicator = indicator, + ph.data = mock_chars, + myinstructions = cat2_instructions, + chars.defs = mock_chars_def) + }), fill = TRUE) + + # Check if result has expected structure + expect_true(is.data.table(result)) + expect_equal(names(result), expectedCols) + + # Check that both category values are correctly filled + expect_true(all(!is.na(result$cat1_group))) + expect_true(all(!is.na(result$cat2_group))) + + # Check all ph.indicator (indicator_keys) exist + expect_equal(sort(unique(cat2_instructions$indicator_key)), + sort(unique(result$indicator_key))) + + # Check cat1_group and cat2_group are all there + expect_equal( + setorder(unique(result[, .(cat1_group, cat2_group)]), cat1_group, cat2_group), + data.table(cat1_group = rep(c("Hispanic", "Non-Hispanic"), each = 2), cat2_group = rep(c("Female", "Male"), times = 2)) + ) + + # Check if have the same number of observations for each age (within an indicator) + expect_equal(uniqueN(result[indicator_key == "hos1803000_v1", .N, chi_age]$N), 1) + expect_equal(uniqueN(result[indicator_key == "hos1803000_v2", .N, chi_age]$N), 1) + }) + +# Test that WA state filtering works ---- + test_that("chi_chars_ccs handles WA state filtering correctly", { + # Instructions for WA state + wa_instructions <- mock_instructions[tab == "_wastate"] + + # Run function + result <- chi_chars_ccs( + ph.indicator = "hos1803000_v1", + ph.data = mock_chars, + myinstructions = wa_instructions, + chars.defs = mock_chars_def + ) + + # Check if result has expected structure + expect_true(is.data.table(result)) + expect_equal(names(result), expectedCols) + + # Check that all rows are for WA state + expect_true(all(result$tab == "_wastate")) + + # Check if have the same number of observations for each age + expect_equal(uniqueN(result[, .N, chi_age]$N), 1) + }) + +# Test child-specific age filtering ---- + test_that("chi_chars_ccs handles age filtering correctly", { + # Test children-only indicator (hos1803000_v2 has age_end = 17) + result <- chi_chars_ccs( + ph.indicator = "hos1803000_v2", + ph.data = mock_chars, + myinstructions = mock_instructions[indicator_key == "hos1803000_v2"], + chars.defs = mock_chars_def + ) + + # Check if result has expected structure + expect_true(is.data.table(result)) + expect_equal(names(result), expectedCols) + + # check correct age range + expect_true(all(result$chi_age %in% 0:17)) + + # Check if have the same number of observations for each age + expect_equal(uniqueN(result[, .N, chi_age]$N), 1) + }) + +# Test what happens when instructions filter out all rows---- + test_that("When some instructions filter out all rows, expect it to work but returning warnings", { + warnings <- capture_warnings({ + result <- rbindlist(lapply(c("hos1803000_v1", "hos1803000_v2"), function(indicator) { + chi_chars_ccs( + ph.indicator = indicator, + ph.data = copy(mock_chars)[, chi_geo_kc := 'KC'], + myinstructions = mock_instructions, + chars.defs = mock_chars_def) + }), fill = TRUE) + }) + + # Check that we got exactly 2 warnings + expect_length(warnings, 2) + + # Check content of first warning + expect_match(warnings[1], + "No data found for the following 2 instruction\\(s\\) for indicator 'hos1803000_v1'") + + # Check content of second warning + expect_match(warnings[2], + "No data found for the following 1 instruction\\(s\\) for indicator 'hos1803000_v2'") + + expect_equal( # expect three of the instructions 'worked' because _wastate was not corrupted + 3, + uniqueN(result[, list(indicator_key, tab, cat1, cat1_varname, cat2, cat2_varname, + start = as.numeric(substr(year, 1, 4)), + end = as.numeric(substr(year, nchar(year) - 3, nchar(year))) )]) + ) + + }) diff --git a/tests/testthat/test-chi_chars_injury.R b/tests/testthat/test-chi_chars_injury.R new file mode 100644 index 0000000..6f51b70 --- /dev/null +++ b/tests/testthat/test-chi_chars_injury.R @@ -0,0 +1,403 @@ +library(testthat) +library(data.table) +library(rads) + +# Create mock_chars data for injury ---- +set.seed(98104) + +# Create 1000 sample CHARS records +n_samples <- 1000 + +# Define all injury mechanisms and intents as needed by chars_injury_matrix_count +mechanisms <- c("any", "bites_stings", "cut_pierce", "drowning", "fall", "fire", + "fire_burn", "firearm", "machinery", "motor_vehicle_nontraffic", + "mvt_motorcyclist", "mvt_occupant", "mvt_other", "mvt_pedal_cyclist", + "mvt_pedestrian", "mvt_traffic", "mvt_unspecified", "natural", + "natural_environmental", "other", "other_land_transport", "other_spec", + "other_specified", "other_transport", "overexertion", "pedal_cyclist", + "pedestrian", "poisoning", "poisoning_drug", "poisoning_nondrug", "struck", + "struck_by_against", "suffocation", "transport_other", "unspecified") + +intents <- c("any", "unintentional", "intentional", "assault", "legal", "undetermined") + +# Initialize the data.table with demographic columns +mock_chars <- data.table( + seq_no = 1:n_samples, # Unique identifier + injury_nature_narrow = sample(c(TRUE, FALSE), n_samples, replace = TRUE, prob = c(0.8, 0.2)), + injury_nature_broad = NA, + chi_year = sample(2012:2022, n_samples, replace = TRUE), + chi_age = sample(0:99, n_samples, replace = TRUE), + chi_geo_kc = rep("King County", n_samples), + wastate = rep("Washington State", n_samples), + race3_hispanic = sample(c("Hispanic", "Non-Hispanic"), n_samples, replace = TRUE), + chi_geo_region = sample(c("Seattle", "South", "East", "North"), n_samples, replace = TRUE), + chi_sex = sample(c("Female", "Male"), n_samples, replace = TRUE) +) +mock_chars[, injury_nature_broad := !injury_nature_narrow] # Define broad based on narrow + + +# Add mechanism columns (all 0 initially) +for (mech in mechanisms) { + mock_chars[, paste0("mechanism_", mech) := 0] +} + +# Add intent columns (all 0 initially) +for (int in intents) { + mock_chars[, paste0("intent_", int) := 0] +} + +# First, set our fixed test cases +# Set exactly 100 fall injuries with unintentional intent (rows 1-100) +mock_chars[1:100, `:=`( + mechanism_fall = 1, + intent_unintentional = 1, + injury_intent = "unintentional" +)] + +# Set exactly 50 poisoning self-harm cases (rows 101-150) +mock_chars[101:150, `:=`( + mechanism_poisoning = 1, + intent_intentional = 1, + injury_intent = "intentional" +)] + +# Now randomly assign mechanism and intent to the remaining records (151-1000) +remaining_rows <- 151:n_samples +for (i in remaining_rows) { + # Select random mechanism and intent + mech <- sample(mechanisms, 1) + int <- sample(intents, 1) + + # Set the selected mechanism and intent to 1 + mock_chars[i, paste0("mechanism_", mech) := 1] + mock_chars[i, paste0("intent_", int) := 1] + + # Set the injury_intent string value based on which intent_* column is 1 + # Skip "any" since it's not a real intent category for injury_intent + if (int != "any") { + mock_chars[i, injury_intent := int] + } else { + # If "any" was selected, choose one of the real intents + real_int <- sample(setdiff(intents, "any"), 1) + mock_chars[i, injury_intent := real_int] + mock_chars[i, paste0("intent_", real_int) := 1] + } +} + +# Create mock_instructions ---- +mock_instructions <- data.table( + indicator_key = rep(c("hos1901000_v1", "hos1901000_v2"), 3), + tab = rep(c("trends", "_wastate"), each = 3), + cat1 = rep("Ethnicity", 6), + cat1_varname = rep("race3_hispanic", 6), + cat2 = c(NA_character_, NA_character_, "Sex", "Sex", NA_character_, NA_character_), + cat2_varname = c(NA_character_, NA_character_, "chi_sex", "chi_sex", NA_character_, NA_character_), + end = c(2017, 2022, 2017, 2022, 2015, 2015), + start = c(2012, 2018, 2014, 2016, 2012, 2012) +) + +# Create mock_chars_def ---- +mock_chars_def <- data.table( + indicator_name = c("Fall injuries (all ages)", "Fall injuries (children)"), + indicator_key = c("hos1901000_v1", "hos1901000_v2"), + intent = c("unintentional", "unintentional"), + mechanism = c("fall", "fall"), + age_start = c(0, 0), + age_end = c(120, 17) +) + +# Add poisoning self-harm indicator +mock_chars_def <- rbind(mock_chars_def, data.table( + indicator_name = "Self-harm poisoning", + indicator_key = "hos1902000_v1", + intent = "intentional", # Changed from intentional_self_harm to match intent column names + mechanism = "poisoning", + age_start = 10, + age_end = 120 +)) + +# Add row to instructions for self-harm +mock_instructions <- rbind(mock_instructions, data.table( + indicator_key = "hos1902000_v1", + tab = "trends", + cat1 = "Ethnicity", + cat1_varname = "race3_hispanic", + cat2 = "Sex", + cat2_varname = "chi_sex", + end = 2022, + start = 2012 +)) + +# Create vector of expected column order ---- +expectedCols <- c('indicator_key', 'year', 'chi_age', 'hospitalizations', 'tab', 'cat1', 'cat1_varname', 'cat1_group', 'cat2', 'cat2_varname', 'cat2_group') + +# Test validation ---- +test_that("chi_chars_injury validates inputs correctly", { + # Test missing ph.indicator + expect_error(chi_chars_injury(ph.indicator = NA, + ph.data = mock_chars, + myinstructions = mock_instructions, + chars.defs = mock_chars_def), + "ph.indicator must be provided") + + # Test missing ph.data + expect_error(chi_chars_injury(ph.indicator = "hos1901000_v1", + ph.data = NULL, + myinstructions = mock_instructions, + chars.defs = mock_chars_def), + "ph.data must be specified") + + # Test indicator not found in instructions + expect_error(chi_chars_injury(ph.indicator = "not_an_indicator", + ph.data = mock_chars, + myinstructions = mock_instructions, + chars.defs = mock_chars_def), + "not found in myinstructions") + + # Test invalid column in instructions + bad_instructions <- copy(mock_instructions) + bad_instructions[1, cat1_varname := "not_a_column"] + + expect_error(chi_chars_injury(ph.indicator = "hos1901000_v1", + ph.data = mock_chars, + myinstructions = bad_instructions, + chars.defs = mock_chars_def), + "don't exist in ph.data") + + # Test invalid def parameter + expect_error(chi_chars_injury(ph.indicator = "hos1901000_v1", + ph.data = mock_chars, + myinstructions = mock_instructions, + chars.defs = mock_chars_def, + def = "invalid_def"), + "must be either 'narrow' or 'broad'") +}) + +# Test function processes fall injury data correctly ---- +test_that("chi_chars_injury processes fall injury data correctly", { + # Run function with default parameters + result <- chi_chars_injury( + ph.indicator = "hos1901000_v1", + ph.data = mock_chars, + myinstructions = mock_instructions[indicator_key == "hos1901000_v1"], + chars.defs = mock_chars_def + ) + + # Check if result has expected structure + expect_true(is.data.table(result)) + expect_equal(names(result), expectedCols) + + # Check that data matches the fall/unintentional criteria + expect_true(sum(result$hospitalizations) > 0) + + # Check age filtering worked correctly + expect_gt(max(as.numeric(result$chi_age)), 50) # is for all ages, so should definitely have some > 50 yrs old + + # Check if have consistent number of rows per age + expect_equal(uniqueN(result[, .N, chi_age]$N), 1) +}) + +# Test function handles age filtering correctly ---- +test_that("chi_chars_injury handles age filtering correctly", { + # Test children-only indicator (hos1901000_v2 has age_end = 17) + result <- chi_chars_injury( + ph.indicator = "hos1901000_v2", + ph.data = mock_chars, + myinstructions = mock_instructions[indicator_key == "hos1901000_v2"], + chars.defs = mock_chars_def + ) + + # Check if result has expected structure + expect_true(is.data.table(result)) + expect_equal(names(result), expectedCols) + + # Check correct age range + expect_true(all(result$chi_age %in% 0:17)) + + # Check if have the same number of observations for each age + expect_equal(uniqueN(result[, .N, chi_age]$N), 1) +}) + +# Test function handles different injury types correctly ---- +test_that("chi_chars_injury handles different injury types correctly", { + # Test self-harm poisoning indicator + result <- chi_chars_injury( + ph.indicator = "hos1902000_v1", + ph.data = mock_chars, + myinstructions = mock_instructions[indicator_key == "hos1902000_v1"], + chars.defs = mock_chars_def + ) + + # Check if result has expected structure + expect_true(is.data.table(result)) + expect_equal(names(result), expectedCols) + + # Check that we have counts > 0 + expect_true(sum(result$hospitalizations) > 0) + + # Check age filtering worked correctly (min age should be 10) + expect_equal(min(as.numeric(result$chi_age)), 10) +}) + +# Test def parameter variation ---- +test_that("chi_chars_injury handles 'def' parameter correctly", { + # Test with narrow definition (default) + narrow_result <- chi_chars_injury( + ph.indicator = "hos1901000_v1", + ph.data = mock_chars, + myinstructions = mock_instructions[indicator_key == "hos1901000_v1"][1], + chars.defs = mock_chars_def, + def = "narrow" + ) + + # Test with broad definition + broad_result <- chi_chars_injury( + ph.indicator = "hos1901000_v1", + ph.data = mock_chars, + myinstructions = mock_instructions[indicator_key == "hos1901000_v1"][1], + chars.defs = mock_chars_def, + def = "broad" + ) + + # Both should have valid structure + expect_true(is.data.table(narrow_result)) + expect_true(is.data.table(broad_result)) + + # Results can be the same in our test data, but function should run without error + expect_true(TRUE) +}) + +# Test that WA state filtering works ---- +test_that("chi_chars_injury handles WA state filtering correctly", { + # Instructions for WA state + wa_instructions <- mock_instructions[tab == "_wastate" & indicator_key == "hos1901000_v1"] + + # Run function + result <- chi_chars_injury( + ph.indicator = "hos1901000_v1", + ph.data = mock_chars, + myinstructions = wa_instructions, + chars.defs = mock_chars_def + ) + + # Check if result has expected structure + expect_true(is.data.table(result)) + expect_equal(names(result), expectedCols) + + # Check that all rows are for WA state + expect_true(all(result$tab == "_wastate")) + + # Check if have the same number of observations for each age + expect_equal(uniqueN(result[, .N, chi_age]$N), 1) +}) + +# Test processing multiple instructions ---- +test_that("chi_chars_injury processes multiple instructions correctly", { + # Use multiple instructions for the same indicator + multiple_instructions <- mock_instructions[indicator_key == "hos1901000_v1"] + + # Run function + result <- chi_chars_injury( + ph.indicator = "hos1901000_v1", + ph.data = mock_chars, + myinstructions = multiple_instructions, + chars.defs = mock_chars_def + ) + + # Check if result has expected structure with multiple rows + expect_true(is.data.table(result)) + expect_equal(names(result), expectedCols) + expect_true(nrow(result) > 0) + + # confirm we processed the all of the instruction sets + expect_equal(uniqueN(result[, .(indicator_key, tab, cat1_varname, cat2_varname)]), + nrow(multiple_instructions)) + + # Check if have the same number of observations for each age + expect_equal(uniqueN(result[, .N, chi_age]$N), 1) +}) + +# Test what happens when instructions filter out all rows---- +test_that("When some instructions filter out all rows, expect it to work but returning warnings", { + warnings <- capture_warnings({ + result <- rbindlist(lapply(c("hos1901000_v1", "hos1901000_v2"), function(indicator) { + chi_chars_injury( + ph.indicator = indicator, + ph.data = copy(mock_chars)[, chi_geo_kc := 'KC'], # Corrupt the data + myinstructions = mock_instructions[indicator_key %in% c("hos1901000_v1", "hos1901000_v2")], + chars.defs = mock_chars_def) + }), fill = TRUE) + }) + + # Check that we got exactly 2 warnings (1 per indicator) + expect_length(warnings, 2) + + # Check content of first warning + expect_match(warnings[1], + "No data found for the following .* instruction\\(s\\) for indicator 'hos1901000_v1'") + + # Check content of second warning + expect_match(warnings[2], + "No data found for the following .* instruction\\(s\\) for indicator 'hos1901000_v2'") + + # Check for any data from _wastate (which should still work) + expect_gt(nrow(result), 0) +}) + +# *Test year restriction for injury data (only 2012+) ---- +test_that("chi_chars_injury correctly handles pre-2012 years", { + # Create instructions with pre-2012 years + early_instructions <- copy(mock_instructions[1]) + early_instructions[, `:=`(start = 2010, end = 2015)] + + # Run the function with pre-2012 years + result <- chi_chars_injury( + ph.indicator = "hos1901000_v1", + ph.data = mock_chars, + myinstructions = early_instructions, + chars.defs = mock_chars_def + ) + + # Check the results + expect_true(is.data.table(result)) + expect_true(nrow(result) > 0) + + # The year range in the result should be 2012-2015, not 2010-2015 + expect_true(all(result$year == "2012-2015")) +}) + +# Test poisoning mechanism handling with drug & non-drug ---- +test_that("chi_chars_injury correctly handles poisoning with ICD-10", { + # Create mock data with both poisoning_drug and poisoning_nondrug + poisoning_test_chars <- copy(mock_chars) + + # Add some specific poisoning_drug and poisoning_nondrug cases + poisoning_test_chars[201:225, `:=`( + mechanism_poisoning_drug = 1, + mechanism_poisoning = 0, + intent_intentional = 1, + injury_intent = "intentional" + )] + + poisoning_test_chars[226:250, `:=`( + mechanism_poisoning_nondrug = 1, + mechanism_poisoning = 0, + intent_intentional = 1, + injury_intent = "intentional" + )] + + # Create a test for poisoning indicator (which may need to collapse drug & non-drug) + result <- chi_chars_injury( + ph.indicator = "hos1902000_v1", + ph.data = poisoning_test_chars, + myinstructions = mock_instructions[indicator_key == "hos1902000_v1"], + chars.defs = mock_chars_def + ) + + # Check the results + expect_true(is.data.table(result)) + expect_true(nrow(result) > 0) + + # The result should include data from the poisoning mechanism + expect_true(sum(result$hospitalizations) > 0) +}) diff --git a/tests/testthat/test-chi_keep_proper_ages.R b/tests/testthat/test-chi_keep_proper_ages.R new file mode 100644 index 0000000..f5b88d3 --- /dev/null +++ b/tests/testthat/test-chi_keep_proper_ages.R @@ -0,0 +1,172 @@ +# Helper function to create test data (not in helper.R because very specific to this function) +create_test_data <- function() { + cat1_age6 <- data.table(cat1 = 'Age', cat1_varname = 'age6', + cat1_group = c('<18', '18-24', '25-44', '45-64', '65-74', '75+'), + mykey = 1) + cat2_sex <- data.table(cat2 = 'Gender', cat2_varname = 'chi_sex', + cat2_group = c('Female', 'Male'), mykey = 1) + cat1_sex <- setnames(copy(cat2_sex), gsub('cat2', 'cat1', names(cat2_sex))) + cat2_yage4 <- data.table(cat2 = 'Age', cat2_varname = 'yage4', + cat2_group = c('0-4', '10-14', '15-17', '5-9'), mykey = 1) + cat1_geo <- data.table(cat1 = 'King County', cat1_varname = 'chi_geo_kc', + cat1_group = 'King County', mykey = 1) + chi_ages <- data.table(chi_age = 0:100, mykey = 1) + + test.counts <- rbind( + merge(cat1_age6, cat2_sex, allow.cartesian = TRUE), + merge(cat1_sex, cat2_yage4, allow.cartesian = TRUE), + merge(cat1_geo, cat2_sex, allow.cartesian = TRUE)) + + test.counts <- merge(test.counts, chi_ages, allow.cartesian = TRUE)[, mykey := NULL] + + test.counts[, `:=` (indicator_key = 'indicator1', + year = '2020', + tab = 'mytab', + count = 1)] + + return(test.counts) +} + +test_that("Input validation works correctly", { + # Test NULL input + expect_error(chi_keep_proper_ages(NULL), "ph.data must be provided") + + # Test missing columns + incomplete_data <- data.table(cat1 = "Age", cat1_group = "<18") + expect_error(chi_keep_proper_ages(incomplete_data), "ph.data is missing required columns") + + # Test incorrect column classes + bad_class_data <- create_test_data() + bad_class_data[, chi_age := as.character(chi_age)] + expect_error(chi_keep_proper_ages(bad_class_data), "should be of class 'numeric or integer'") +}) + +test_that("Age filtering works correctly for cat1 Age groups", { + # Create test data + test.data <- create_test_data() + + # Filter data + filtered_data <- chi_keep_proper_ages(test.data) + + # Test age group '<18' + age_group_rows <- filtered_data[cat1 == "Age" & cat1_group == "<18"] + expect_true(all(age_group_rows$chi_age < 18)) + + # Test age group '18-24' + age_group_rows <- filtered_data[cat1 == "Age" & cat1_group == "18-24"] + expect_true(all(age_group_rows$chi_age >= 18 & age_group_rows$chi_age <= 24)) + + # Test age group '25-44' + age_group_rows <- filtered_data[cat1 == "Age" & cat1_group == "25-44"] + expect_true(all(age_group_rows$chi_age >= 25 & age_group_rows$chi_age <= 44)) + + # Test age group '45-64' + age_group_rows <- filtered_data[cat1 == "Age" & cat1_group == "45-64"] + expect_true(all(age_group_rows$chi_age >= 45 & age_group_rows$chi_age <= 64)) + + # Test age group '65-74' + age_group_rows <- filtered_data[cat1 == "Age" & cat1_group == "65-74"] + expect_true(all(age_group_rows$chi_age >= 65 & age_group_rows$chi_age <= 74)) + + # Test age group '75+' + age_group_rows <- filtered_data[cat1 == "Age" & cat1_group == "75+"] + expect_true(all(age_group_rows$chi_age >= 75)) +}) + +test_that("Age filtering works correctly for cat2 Age groups", { + # Create test data + test.data <- create_test_data() + + # Filter data + filtered_data <- chi_keep_proper_ages(test.data) + + # Test age group '0-4' + age_group_rows <- filtered_data[cat2 == "Age" & cat2_group == "0-4"] + expect_true(all(age_group_rows$chi_age >= 0 & age_group_rows$chi_age <= 4)) + + # Test age group '5-9' + age_group_rows <- filtered_data[cat2 == "Age" & cat2_group == "5-9"] + expect_true(all(age_group_rows$chi_age >= 5 & age_group_rows$chi_age <= 9)) + + # Test age group '10-14' + age_group_rows <- filtered_data[cat2 == "Age" & cat2_group == "10-14"] + expect_true(all(age_group_rows$chi_age >= 10 & age_group_rows$chi_age <= 14)) + + # Test age group '15-17' + age_group_rows <- filtered_data[cat2 == "Age" & cat2_group == "15-17"] + expect_true(all(age_group_rows$chi_age >= 15 & age_group_rows$chi_age <= 17)) +}) + +test_that("Non-Age categories are not filtered", { + # Create test data + test.data <- create_test_data() + + # Filter data + filtered_data <- chi_keep_proper_ages(test.data) + + # Test for geographic category (completely unrelated to Age) + geo_rows_original <- test.data[cat1 == "King County"] + geo_rows_filtered <- filtered_data[cat1 == "King County"] + + # For completely non-Age categories, all rows should remain + # (note: filtering only applies when cat1 or cat2 is "Age") + expect_equal(nrow(geo_rows_original), nrow(geo_rows_filtered)) + + # Check that all unique chi_age values remain for King County rows + geo_original_ages <- sort(unique(geo_rows_original$chi_age)) + geo_filtered_ages <- sort(unique(geo_rows_filtered$chi_age)) + + expect_equal(geo_original_ages, geo_filtered_ages) + + # Also check that Gender categories not paired with Age are preserved + gender_only_original <- test.data[cat1 == "Gender" & cat2 != "Age"] + gender_only_filtered <- filtered_data[cat1 == "Gender" & cat2 != "Age"] + + expect_equal(nrow(gender_only_original), nrow(gender_only_filtered)) +}) + +test_that("Count sums decrease after filtering", { + # Create test data + test.data <- create_test_data() + + # Sum of counts before filtering + original_count_sum <- sum(test.data$count) + + # Filter data + filtered_data <- chi_keep_proper_ages(test.data) + + # Sum of counts after filtering + filtered_count_sum <- sum(filtered_data$count) + + # There should be fewer rows after filtering + expect_true(nrow(filtered_data) < nrow(test.data)) + + # Sum of counts should decrease + expect_true(filtered_count_sum < original_count_sum) + + # Calculate exact expected count + # For each age group, count rows that should be filtered out + expected_filtered_rows <- test.data[ + (cat1 == "Age" & cat1_group == "<18" & chi_age >= 18) | + (cat1 == "Age" & cat1_group == "18-24" & (chi_age < 18 | chi_age > 24)) | + (cat1 == "Age" & cat1_group == "25-44" & (chi_age < 25 | chi_age > 44)) | + (cat1 == "Age" & cat1_group == "45-64" & (chi_age < 45 | chi_age > 64)) | + (cat1 == "Age" & cat1_group == "65-74" & (chi_age < 65 | chi_age > 74)) | + (cat1 == "Age" & cat1_group == "75+" & chi_age < 75) | + (cat2 == "Age" & cat2_group == "0-4" & (chi_age < 0 | chi_age > 4)) | + (cat2 == "Age" & cat2_group == "5-9" & (chi_age < 5 | chi_age > 9)) | + (cat2 == "Age" & cat2_group == "10-14" & (chi_age < 10 | chi_age > 14)) | + (cat2 == "Age" & cat2_group == "15-17" & (chi_age < 15 | chi_age > 17)) + ] + + # Count rows that should remain + rows_with_age_categories <- test.data[(cat1 == "Age") | (cat2 == "Age")] + non_age_categories_rows <- test.data[(cat1 != "Age") & (cat2 != "Age")] + + expected_age_rows_after_filtering <- nrow(rows_with_age_categories) - nrow(expected_filtered_rows) + expected_total_rows <- expected_age_rows_after_filtering + nrow(non_age_categories_rows) + + # Check if we have the expected number of rows after filtering + expect_equal(nrow(filtered_data), expected_total_rows) +}) +