From 0faa90281ed7f81c898d14e8a1db6a05b8a78e8d Mon Sep 17 00:00:00 2001 From: Danny Colombara Date: Mon, 24 Feb 2025 17:13:05 -0800 Subject: [PATCH 01/12] Fixed contradictory warnings / errors with race3 - before created race3_hispanic when needed but stopped if there was race3_hispanic. It was non-sensical. Now gives informative messages --- R/chi_calc.R | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/R/chi_calc.R b/R/chi_calc.R index 338d343..1b869c6 100644 --- a/R/chi_calc.R +++ b/R/chi_calc.R @@ -1,7 +1,7 @@ #' Calculate CHI Estimates #' #' @description -#' Generates CHI estimates from input data according to provided instructions. +#' Generates CHI estimates from input data according to provided instructions. #' Handles both proportions and rates, with options for suppression of small numbers. #' #' @param ph.data data.frame or data.table. Input data containing analytic read data. @@ -110,15 +110,23 @@ chi_calc <- function(ph.data = NULL, # Check to make sure all variables needed exist in the data ---- neededbyvars <- unique(na.omit(c(ph.instructions$cat1_varname, ph.instructions$cat2_varname))) - if("race3" %in% neededbyvars & !"race3_hispanic" %in% neededbyvars){neededbyvars <- c(neededbyvars, 'race3_hispanic')} # By definition, Hispanic cannot be contained within race3 + + # Handle the race3/race3_hispanic relationship + if("race3" %in% neededbyvars & !"race3_hispanic" %in% neededbyvars) { + neededbyvars <- c(neededbyvars, 'race3_hispanic') + message("\U00002139 Note: Adding 'race3_hispanic' as a required variable because 'race3' is present. By definition, race3 requires separate Hispanic ethnicity information.") + } + + if(!"race3" %in% neededbyvars & "race3_hispanic" %in% neededbyvars) { + neededbyvars <- c(neededbyvars, 'race3') + message("\U00002139 Note: Adding 'race3' as a required variable because 'race3_hispanic' is present. These two variables work together to represent race/ethnicity.") + } neededvars <- unique(na.omit(c(ph.instructions$indicator_key, neededbyvars))) missingvars <- setdiff(neededvars, names(ph.data)) if(length(missingvars) > 0 ){ - stop(paste0("\n\U2620 ph.data is missing the following columns that are specified in ph.instructions: ", paste0(missingvars, collapse = ', '), ". ", - "\nIf `race3_hispanic` is listed, that is because, by definition, `race3` cannot have a Hispanic ethnicity in the same variable. So, two ", - "\nvariables (`race3` & `race3_hispanic`) will be processed and in the output, it will be called `race3`")) + stop(paste0("\n\U2620 ph.data is missing the following columns that are required: ", paste0(missingvars, collapse = ', '))) } else{message("\U0001f642 All specified variables exist in ph.data")} # Check to make sure all byvariables have the CHI specified encoding ---- From e4a1d745c110053e9b5f1a4ddb4d247bbf97780c Mon Sep 17 00:00:00 2001 From: Danny Colombara Date: Tue, 25 Feb 2025 12:25:06 -0800 Subject: [PATCH 02/12] added `non_chi_byvars` parameter to chi_calc - chi_calc checks to make sure that all CHI cat1_varname and cat2_varname are coded as would be expected based upon rads.data:: misc_chi_byvars. However, there are times we have non-CHI variables that are created alongside CHI (for example for BSK, COO, etc.). This allows the function to maintain strict standards for CHI variables but with flexibility if we are producing non-CHI estimates --- R/chi_calc.R | 53 +++++++++++++++++++++++++++++++++++++++---------- man/chi_calc.Rd | 7 +++++-- 2 files changed, 47 insertions(+), 13 deletions(-) diff --git a/R/chi_calc.R b/R/chi_calc.R index 1b869c6..b43852d 100644 --- a/R/chi_calc.R +++ b/R/chi_calc.R @@ -14,6 +14,7 @@ #' @param suppress_high numeric. Upper bound for suppression. Default: \code{9}. #' @param source_name character. Name of data source. Default: \code{NULL}. #' @param source_date Date. Date ph.data was created. Default: \code{NULL}. +#' @param non_chi_byvars character vector. Variable names to exclude from CHI byvar encoding validation. Default: \code{NULL}. #' #' @return A data.table containing CHI estimates with the following columns: #' \itemize{ @@ -65,7 +66,8 @@ chi_calc <- function(ph.data = NULL, suppress_low = 0, suppress_high = 9, source_name = NULL, - source_date = NULL){ + source_date = NULL, + non_chi_byvars = NULL){ # Input validation ---- if (is.null(ph.data)) stop("\n\U1F6D1 ph.data must be provided") if (!is.data.frame(ph.data)) stop("\n\U1F6D1 ph.data must be a data.frame or data.table") @@ -102,6 +104,10 @@ chi_calc <- function(ph.data = NULL, if (is.null(source_date)) stop("\n\U1F6D1 source_date must be provided") if (!inherits(source_date, "Date")) stop("\n\U1F6D1 source_date must be a be a Date object") + # Validate non_chi_byvars + if(!is.null(non_chi_byvars)) { + if(!is.character(non_chi_byvars)) stop("\n\U1F6D1 non_chi_byvars must be a character vector") + } # Create 'Overall' if needed for crosstabs ---- if(!'overall' %in% names(ph.data)){ @@ -130,16 +136,41 @@ chi_calc <- function(ph.data = NULL, } else{message("\U0001f642 All specified variables exist in ph.data")} # Check to make sure all byvariables have the CHI specified encoding ---- - stdbyvars <- rads.data::misc_chi_byvars[varname %in% unique(na.omit(c(ph.instructions$cat1_varname, ph.instructions$cat2_varname)))][, list(varname, group, keepme, reference = 1)] - stdbyvars[group %in% c("Hispanic", 'Non-Hispanic') & varname == 'race3', varname := 'race3_hispanic'] # necessary because race3 & Hispanic must be two distinct variables in raw data - phbyvars <- rbindlist(lapply( - X=as.list(neededbyvars), - FUN = function(X){data.table::data.table(varname = X, group = unique(na.omit(ph.data[[X]])), ph.data = 1)})) - compbyvars <- merge(stdbyvars, phbyvars, by = c('varname', 'group'), all = T) - if(nrow(compbyvars[is.na(reference)| is.na(ph.data)]) > 0){ - print(compbyvars[is.na(reference)| is.na(ph.data)]) - stop("\n\U2620 the table above shows the varname/group combinations that do not align between the reference table and your ph.data.") - } else {message("\U0001f642 All specified cat1_group and cat2_group values align with the reference standard.")} + # Filter out non-CHI variables if specified + chi_byvars <- neededbyvars + if(!is.null(non_chi_byvars)) { + chi_byvars <- setdiff(neededbyvars, non_chi_byvars) + if(length(setdiff(non_chi_byvars, neededbyvars)) > 0) { + message("\U00002139 Note: Some specified non_chi_byvars are not used in the analysis: ", + paste0(setdiff(non_chi_byvars, neededbyvars), collapse = ", ")) + } + if(length(setdiff(neededbyvars, chi_byvars)) > 0) { + message("\U00002139 Note: The following variables will be excluded from CHI encoding validation: ", + paste0(setdiff(neededbyvars, chi_byvars), collapse = ", ")) + } + } + + # Only validate CHI variables + stdbyvars <- rads.data::misc_chi_byvars[varname %in% unique(na.omit(c(ph.instructions$cat1_varname, ph.instructions$cat2_varname)))] + stdbyvars <- stdbyvars[!varname %in% non_chi_byvars][, list(varname, group, keepme, reference = 1)] + stdbyvars[group %in% c("Hispanic", 'Non-Hispanic') & varname == 'race3', varname := 'race3_hispanic'] # necessary because race3 & Hispanic must be two distinct variables in raw data + + phbyvars <- rbindlist(lapply( + X=as.list(chi_byvars), + FUN = function(X){data.table::data.table(varname = X, group = unique(na.omit(ph.data[[X]])), ph.data = 1)})) + + # Skip validation if there are no CHI variables to validate after excluding non_chi_byvars + if(nrow(phbyvars) > 0 && nrow(stdbyvars) > 0) { + compbyvars <- merge(stdbyvars, phbyvars, by = c('varname', 'group'), all = T) + if(nrow(compbyvars[is.na(reference) | is.na(ph.data)]) > 0){ + print(compbyvars[is.na(reference) | is.na(ph.data)]) + stop("\n\U2620 the table above shows the varname/group combinations that do not align between the reference table and your ph.data.") + } else {message("\U0001f642 All specified cat1_group and cat2_group values align with the reference standard.")} + } else if(length(chi_byvars) > 0) { + message("\U00002139 Note: No CHI variables to validate after excluding non_chi_byvars.") + } else { + message("\U00002139 Note: No variables to validate for CHI encoding.") + } # Use rads::calc to generate estimates for each row of ph.instructions ---- message("\U023F3 Be patient! The function is generating estimates for each row of ph.instructions.") diff --git a/man/chi_calc.Rd b/man/chi_calc.Rd index 60c69e5..2cb954a 100644 --- a/man/chi_calc.Rd +++ b/man/chi_calc.Rd @@ -14,7 +14,8 @@ chi_calc( suppress_low = 0, suppress_high = 9, source_name = NULL, - source_date = NULL + source_date = NULL, + non_chi_byvars = NULL ) } \arguments{ @@ -37,6 +38,8 @@ chi_calc( \item{source_name}{character. Name of data source. Default: \code{NULL}.} \item{source_date}{Date. Date ph.data was created. Default: \code{NULL}.} + +\item{non_chi_byvars}{character vector. Variable names to exclude from CHI byvar encoding validation. Default: \code{NULL}.} } \value{ A data.table containing CHI estimates with the following columns: @@ -66,7 +69,7 @@ A data.table containing CHI estimates with the following columns: } } \description{ -Generates CHI estimates from input data according to provided instructions. +Generates CHI estimates from input data according to provided instructions. Handles both proportions and rates, with options for suppression of small numbers. } \seealso{ From c93375b2951301e7d45cc91f3d39e1e3ea7fff23 Mon Sep 17 00:00:00 2001 From: Danny Colombara Date: Tue, 25 Feb 2025 14:13:16 -0800 Subject: [PATCH 03/12] Made calculation of trends conditional - previously gave empty rows for trends when there were no real trends to calculate --- R/chi_generate_tro_shell.R | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/R/chi_generate_tro_shell.R b/R/chi_generate_tro_shell.R index 6c55703..33c7f2a 100644 --- a/R/chi_generate_tro_shell.R +++ b/R/chi_generate_tro_shell.R @@ -102,7 +102,9 @@ chi_generate_tro_shell <- function(ph.analysis_set, # split trends from other tabs because processed for multiple years template.trends <- template[tab=='trends'] - template <- template[tab != 'trends'] + if(nrow(template.trends) > 0){ + template <- template[tab != 'trends'] + } # add years to template (non-trend) template[, end := end.year] @@ -111,14 +113,16 @@ chi_generate_tro_shell <- function(ph.analysis_set, template[tab == '_kingcounty'][, tab := 'metadata'][, start := end.year]) # add years to template (trends) - trend.years <- chi_process_trends(indicator_key = unique(template$indicator_key), - trend.span = trend.span, - end.year = end.year, - trend.periods = trend.periods) - template.trends <- merge(template.trends, trend.years, by = 'indicator_key', all = T, allow.cartesian = T) - - # append trends template to main template - template <- rbind(template, template.trends) + if(nrow(template.trends) > 0){ + trend.years <- chi_process_trends(indicator_key = unique(template$indicator_key), + trend.span = trend.span, + end.year = end.year, + trend.periods = trend.periods) + template.trends <- merge(template.trends, trend.years, by = 'indicator_key', all = T, allow.cartesian = T) + + # append trends template to main template + template <- rbind(template, template.trends) + } return(template) } From 58bea519ea7c5ed6d841a35d2d4088a1f390e5f0 Mon Sep 17 00:00:00 2001 From: Danny Colombara Date: Tue, 25 Feb 2025 14:34:26 -0800 Subject: [PATCH 04/12] generate trends template for specific indicators - previously if there were any indicators that had trends, then rows for trends would be created for all variables. Now it will only create trend rows for specific variables that should have trends calculated --- R/chi_generate_tro_shell.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/chi_generate_tro_shell.R b/R/chi_generate_tro_shell.R index 33c7f2a..be6aa35 100644 --- a/R/chi_generate_tro_shell.R +++ b/R/chi_generate_tro_shell.R @@ -114,7 +114,7 @@ chi_generate_tro_shell <- function(ph.analysis_set, # add years to template (trends) if(nrow(template.trends) > 0){ - trend.years <- chi_process_trends(indicator_key = unique(template$indicator_key), + trend.years <- chi_process_trends(indicator_key = intersect(unique(template$indicator_key), unique(template.trends$indicator_key)), trend.span = trend.span, end.year = end.year, trend.periods = trend.periods) From 17900015b4ae672c010ad53e7ec9829f319220db Mon Sep 17 00:00:00 2001 From: Danny Colombara Date: Tue, 25 Feb 2025 16:19:43 -0800 Subject: [PATCH 05/12] Added Wilson Score for CI & level to output - Wilson score gives better CI when 0% or 100% - Needed to add level because when using factor variables in calc, will get estimates for each level and need a way to filter out undesired levels from CHI output --- NAMESPACE | 3 +-- R/chi_calc.R | 33 +++++++++++++++++++++++++++++---- R/globals.R | 2 ++ man/chi_calc.Rd | 1 + 4 files changed, 33 insertions(+), 6 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index 862c77c..3badaeb 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -48,14 +48,13 @@ importFrom(glue,glue) importFrom(glue,glue_sql) importFrom(odbc,odbc) importFrom(rads,calc) -importFrom(rads,chi_cols) -importFrom(rads,compare_estimate) importFrom(rads,round2) importFrom(rads,string_clean) importFrom(rads,substrRight) importFrom(rads,suppress) importFrom(rads,tsql_validate_field_types) importFrom(stats,na.omit) +importFrom(stats,qnorm) importFrom(tidyr,crossing) importFrom(tools,toTitleCase) importFrom(utils,tail) diff --git a/R/chi_calc.R b/R/chi_calc.R index b43852d..48d2834 100644 --- a/R/chi_calc.R +++ b/R/chi_calc.R @@ -20,6 +20,7 @@ #' \itemize{ #' \item{\code{data_source}} Data source (e.g., acs, brfss, etc.) #' \item{\code{indicator_key}} Unique indicator key +#' \item{\code{level}} factor level of the indicator_key (e.g., Breech vs Cephalic vs other for fetal_pres in birth data) #' \item{\code{tab}} Type of analysis (e.g., demgroups, _kingcounty, etc.) #' \item{\code{year}} Year(s) of data #' \item{\code{cat1}} Describes data field (e.g., Gender, Ethnicity, etc.) @@ -49,10 +50,10 @@ #' #' \code{\link{chi_generate_metadata}} for creating metadata from results #' -#' @importFrom data.table setDT copy setnames := setorder set .SD data.table -#' @importFrom rads calc compare_estimate suppress chi_cols round2 +#' @importFrom data.table setDT copy setnames := setorder set .SD data.table setcolorder +#' @importFrom rads calc suppress round2 #' @importFrom future.apply future_lapply -#' @importFrom stats na.omit +#' @importFrom stats na.omit qnorm #' @import progressr #' @export #' @@ -250,6 +251,10 @@ chi_calc <- function(ph.data = NULL, c("mean", "mean_lower", "mean_upper", "mean_se"), c("result", "lower_bound", "upper_bound", "se")) + if(!"level" %in% names(tempest)) { + tempest[, level := NA_character_] + } + # set correct data types for TSQL database tempest[, denominator := as.numeric(denominator)] tempest[, numerator := as.numeric(numerator)] @@ -267,6 +272,23 @@ chi_calc <- function(ph.data = NULL, # drop when cat2_group is missing but cat2 is not missing ---- tempCHIest <- tempCHIest[!(is.na(cat2_group) & !is.na(cat2))] + # Apply Wilson Score method for confidence intervals when result is 0% or 100% ---- + # This handles cases where standard methods fail at the extremes by providing more appropriate + # bounds that don't exceed the logical limits while maintaining the specified confidence level + + # Calculate z-value based on the provided confidence interval + z_value <- qnorm(1-0.5*(1-ci)) + + # Lower bound using Wilson Score method + tempCHIest[result %in% c(0, 1) & se == 0 & denominator != 0, + lower_bound := (2 * numerator + z_value^2 - z_value * sqrt(z_value^2 + 4 * numerator * (1 - numerator/denominator))) / + (2 * (denominator + z_value^2))] + + # Upper bound using Wilson Score method + tempCHIest[result %in% c(0, 1) & se == 0 & denominator != 0, + upper_bound := (2 * numerator + z_value^2 + z_value * sqrt(z_value^2 + 4 * numerator * (1 - numerator/denominator))) / + (2 * (denominator + z_value^2))] + # drop if cat1_group | cat2_group had `keepme == "No"` in the reference table ---- dropme <- unique(stdbyvars[keepme == 'No'][, reference := NULL]) tempCHIest <- merge(tempCHIest, @@ -346,11 +368,14 @@ chi_calc <- function(ph.data = NULL, # Keep and order standard CHI columns ---- - tempCHIest <- tempCHIest[, chi_get_cols(), with = F] + all_cols <- c(chi_get_cols(), "level") + all_cols <- unique(all_cols) # In case level is already included + tempCHIest <- tempCHIest[, all_cols, with = F] tempCHIest <- tempCHIest[, cat1 := factor(cat1, levels = c("King County", sort(setdiff(unique(tempCHIest$cat1), "King County"))) )] tempCHIest <- tempCHIest[, tab := factor(tab, levels = c(c("_kingcounty","demgroups", "trends"), sort(setdiff(unique(tempCHIest$tab), c("_kingcounty","demgroups", "trends")))) )] setorder(tempCHIest, indicator_key, tab, -year, cat1, cat1_group, cat2, cat2_group) + setcolorder(tempCHIest, c('data_source', 'indicator_key', 'level')) # return the CHI table ---- return(tempCHIest) diff --git a/R/globals.R b/R/globals.R index e9614f6..d646335 100644 --- a/R/globals.R +++ b/R/globals.R @@ -46,6 +46,7 @@ utils::globalVariables(c( "latest_year_result", "latest_year_resultx", "latest_yearx", + "level", "lower_bound", "notable", "numerator", @@ -69,6 +70,7 @@ utils::globalVariables(c( "run_date", "run_datex", "s2t_fraction", + "se", "source_id", "span", "start", diff --git a/man/chi_calc.Rd b/man/chi_calc.Rd index 2cb954a..30b2783 100644 --- a/man/chi_calc.Rd +++ b/man/chi_calc.Rd @@ -46,6 +46,7 @@ A data.table containing CHI estimates with the following columns: \itemize{ \item{\code{data_source}} Data source (e.g., acs, brfss, etc.) \item{\code{indicator_key}} Unique indicator key + \item{\code{level}} factor level of the indicator_key (e.g., Breech vs Cephalic vs other for fetal_pres in birth data) \item{\code{tab}} Type of analysis (e.g., demgroups, _kingcounty, etc.) \item{\code{year}} Year(s) of data \item{\code{cat1}} Describes data field (e.g., Gender, Ethnicity, etc.) From 918ea26397c2eeccaba88415a39adad97aa4928d Mon Sep 17 00:00:00 2001 From: Danny Colombara Date: Thu, 27 Feb 2025 12:11:02 -0800 Subject: [PATCH 06/12] Updated QA function for ACS & RSE issues - ACS does not need to have cat1_varname, cat2_varname, numerator, and denominator - RSE is legitimately NA (actually undefined) when numerator is zero and therefore the result would be zero, which puts zero in the denominator for calculating RSE --- R/chi_qa_tro.R | 34 ++++++++++++++++++++++++++++------ 1 file changed, 28 insertions(+), 6 deletions(-) diff --git a/R/chi_qa_tro.R b/R/chi_qa_tro.R index 0529102..328f6b0 100644 --- a/R/chi_qa_tro.R +++ b/R/chi_qa_tro.R @@ -187,12 +187,26 @@ chi_qa_tro <- function(CHIestimates, } } - for(mycol in c("result", "lower_bound", "upper_bound", "se", "rse", "numerator", "denominator", "chi", "source_date", "run_date")){ + if(isFALSE(acs)) + for(mycol in c("numerator", "denominator")){ if(nrow(CHIestimates[is.na(get(mycol)) & is.na(suppression)]) > 0){ status <- 0 warning(paste0("\U00026A0 Warning: '", mycol, "' is missing in at least one row of the CHI data.")) } } + + for(mycol in c("result", "lower_bound", "upper_bound", "se", "chi", "source_date", "run_date")){ + if(nrow(CHIestimates[is.na(get(mycol)) & is.na(suppression)]) > 0){ + status <- 0 + warning(paste0("\U00026A0 Warning: '", mycol, "' is missing in at least one row of the CHI data.")) + } + } + + if(nrow(CHIestimates[is.na(rse) & is.na(suppression) & numerator != 0]) > 0){ + status <- 0 + warning("\U00026A0 Warning: 'rse' is missing in at least one row of the CHI data where numerator is not 0.") + } + for(mycol in names(unlist(chi_get_yaml()$metadata))){ if(nrow(CHImetadata[is.na(get(mycol))]) > 0){ status <- 0 @@ -290,7 +304,7 @@ chi_qa_tro <- function(CHIestimates, message("Checking that RSE is between 0 and 100") } # confirmed with Abby 2/7/2020 that want RSE * 100 - if(nrow(CHIestimates[!rse %between% c(0, 100)]) > 0 ){ + if(nrow(CHIestimates[!is.na(rse) & !rse %between% c(0, 100)]) > 0 ){ status <- 0 if(verbose){ warning(paste("There is at least one row where the RSE (relative standard error) is outside the range of (0, 100].", @@ -314,7 +328,7 @@ chi_qa_tro <- function(CHIestimates, if(verbose){ message("Checking that caution flag exists if RSE >= 30%") } - if(nrow(CHIestimates[rse>=30 & (caution != "!" | is.na(caution)) ]) > 0 ){ + if(nrow(CHIestimates[!is.na(rse) & rse>=30 & (caution != "!" | is.na(caution)) ]) > 0 ){ status <- 0 if(verbose){ warning("There is at least one row where a caution flag ('!') is not used and rse >= 30% or is.na(rse) == T. @@ -385,7 +399,7 @@ chi_qa_tro <- function(CHIestimates, } } } - if(acs==F){ + if(isFALSE(acs)){ if(nrow(CHIestimates[is.na(cat1_varname)]) > 0 ){ status <- 0 if(verbose){ @@ -403,7 +417,7 @@ chi_qa_tro <- function(CHIestimates, } } } - if(acs==F){ + if(isFALSE(acs)){ if(nrow(CHIestimates[tab=="crosstabs" & is.na(cat2_varname)]) > 0 ){ status <- 0 if(verbose){ @@ -416,7 +430,7 @@ chi_qa_tro <- function(CHIestimates, if(verbose){ message("Checking that results are present if row is not suppressed") } - for(var in c("result", "lower_bound", "upper_bound", "se", "rse", "numerator", "denominator")){ + for(var in c("result", "lower_bound", "upper_bound", "se", "numerator", "denominator")){ if(nrow(CHIestimates[suppression != "^" & is.na(get(var))]) > 0 ){ status <- 0 if(verbose){ @@ -426,6 +440,14 @@ chi_qa_tro <- function(CHIestimates, } } + if(nrow(CHIestimates[suppression != "^" & is.na(rse) & numerator != 0]) > 0 ){ + status <- 0 + if(verbose){ + warning(glue::glue("There is at least one row that is not suppressed, where 'rse' is missing and numerator is not 0. + Please fill in the missing value before rerunning chi_qa_tro()")) + } + } + ## Ensure cat1/cat2 values meet CHI standards ---- if(verbose) { message("Checking that category combinations align with CHI standards") From d8e4e269c5e275f7f2e2be75ddbff4cd6a74d0f3 Mon Sep 17 00:00:00 2001 From: Danny Colombara Date: Fri, 28 Feb 2025 17:46:13 -0800 Subject: [PATCH 07/12] Bug fixes in chi_calc - properly deal with times when numerator == 0 - Wilson socre no longer depends on se == 0, but only that result is 0 or 1 - caution flag now noted when numerator == 0 (in addition to RSE > 30) --- R/chi_calc.R | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/R/chi_calc.R b/R/chi_calc.R index 48d2834..052ad13 100644 --- a/R/chi_calc.R +++ b/R/chi_calc.R @@ -266,6 +266,16 @@ chi_calc <- function(ph.data = NULL, # Tidy results ---- + # When there are no observation in a row, rads::calc() gives NA for numerator, but want zero ---- + tempCHIest[is.na(numerator), `:=` ( + result = 0, + se = 0, + lower_bound = 0, + upper_bound = 0, + numerator = 0, + rse = NA # undefined because mean will be zero + )] + # drop when cat1_group is missing (e.g., cat1 == 'Regions' and region is NA) ---- tempCHIest <- tempCHIest[!is.na(cat1_group)] @@ -280,12 +290,12 @@ chi_calc <- function(ph.data = NULL, z_value <- qnorm(1-0.5*(1-ci)) # Lower bound using Wilson Score method - tempCHIest[result %in% c(0, 1) & se == 0 & denominator != 0, + tempCHIest[result %in% c(0, 1) & denominator > 10, lower_bound := (2 * numerator + z_value^2 - z_value * sqrt(z_value^2 + 4 * numerator * (1 - numerator/denominator))) / (2 * (denominator + z_value^2))] # Upper bound using Wilson Score method - tempCHIest[result %in% c(0, 1) & se == 0 & denominator != 0, + tempCHIest[result %in% c(0, 1) & denominator > 10, upper_bound := (2 * numerator + z_value^2 + z_value * sqrt(z_value^2 + 4 * numerator * (1 - numerator/denominator))) / (2 * (denominator + z_value^2))] @@ -362,7 +372,7 @@ chi_calc <- function(ph.data = NULL, secondary_exclude = cat1_varname != 'race3') } else {tempCHIest[, suppression := NA_character_]} - tempCHIest[rse>=30, caution := "!"] + tempCHIest[rse>=30 | numerator == 0, caution := "!"] tempCHIest[, c('cat2', 'cat2_group', 'cat2_varname') := lapply(.SD, as.character), .SDcols = c('cat2', 'cat2_group', 'cat2_varname')] From b678b629a984eed43ff73874abbbafbdb6a58bb6 Mon Sep 17 00:00:00 2001 From: Danny Colombara Date: Fri, 28 Feb 2025 17:48:41 -0800 Subject: [PATCH 08/12] CHANGED chi_sql_update to chi_update_sql - now follows standard naming convention in this pacackage: chi _ verb _ noun --- NAMESPACE | 2 +- R/chi_compare_estimates.R | 2 +- R/{chi_sql_update.R => chi_update_sql.R} | 4 +- man/chi_compare_estimates.Rd | 2 +- man/{chi_sql_update.Rd => chi_update_sql.Rd} | 10 ++-- quarto_docs/Calculating_Prevalences.qmd | 49 ++++++++++++++------ tests/testthat/test-chi_sql_update.R | 12 ++--- 7 files changed, 51 insertions(+), 30 deletions(-) rename R/{chi_sql_update.R => chi_update_sql.R} (99%) rename man/{chi_sql_update.Rd => chi_update_sql.Rd} (92%) diff --git a/NAMESPACE b/NAMESPACE index 3badaeb..e0ff381 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -11,7 +11,7 @@ export(chi_generate_tro_shell) export(chi_get_cols) export(chi_get_yaml) export(chi_qa_tro) -export(chi_sql_update) +export(chi_update_sql) import(dtsurvey) import(future) import(future.apply) diff --git a/R/chi_compare_estimates.R b/R/chi_compare_estimates.R index 13cc186..d784505 100644 --- a/R/chi_compare_estimates.R +++ b/R/chi_compare_estimates.R @@ -35,7 +35,7 @@ #' @seealso #' \code{\link{chi_qa_tro}} for validating individual datasets #' -#' \code{\link{chi_sql_update}} for uploading validated results +#' \code{\link{chi_update_sql}} for uploading validated results #' #' @return data.table ordered by absolute difference, containing: #' - Difference metrics (absolute_diff, relative_diff) diff --git a/R/chi_sql_update.R b/R/chi_update_sql.R similarity index 99% rename from R/chi_sql_update.R rename to R/chi_update_sql.R index 1d5bd4d..d34571c 100644 --- a/R/chi_sql_update.R +++ b/R/chi_update_sql.R @@ -25,7 +25,7 @@ #' @examples #' \dontrun{ #' # Update development database -#' chi_sql_update( +#' chi_update_sql( #' CHIestimates = final_estimates, #' CHImetadata = final_metadata, #' table_name = "birth", @@ -52,7 +52,7 @@ #' #' @export #' -chi_sql_update <- function(CHIestimates = NULL, +chi_update_sql <- function(CHIestimates = NULL, CHImetadata = NULL, table_name = NULL, server = 'development', # options include c('development', 'production') diff --git a/man/chi_compare_estimates.Rd b/man/chi_compare_estimates.Rd index 1eb4830..896f800 100644 --- a/man/chi_compare_estimates.Rd +++ b/man/chi_compare_estimates.Rd @@ -59,5 +59,5 @@ comparison <- chi_compare_estimates( \seealso{ \code{\link{chi_qa_tro}} for validating individual datasets -\code{\link{chi_sql_update}} for uploading validated results +\code{\link{chi_update_sql}} for uploading validated results } diff --git a/man/chi_sql_update.Rd b/man/chi_update_sql.Rd similarity index 92% rename from man/chi_sql_update.Rd rename to man/chi_update_sql.Rd index 7ec1e5e..f134dc7 100644 --- a/man/chi_sql_update.Rd +++ b/man/chi_update_sql.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/chi_sql_update.R -\name{chi_sql_update} -\alias{chi_sql_update} +% Please edit documentation in R/chi_update_sql.R +\name{chi_update_sql} +\alias{chi_update_sql} \title{CHI SQL Update} \usage{ -chi_sql_update( +chi_update_sql( CHIestimates = NULL, CHImetadata = NULL, table_name = NULL, @@ -45,7 +45,7 @@ SharePoint > DPH-KCCross-SectorData > Documents > References > SQL > SQL Server \examples{ \dontrun{ # Update development database -chi_sql_update( +chi_update_sql( CHIestimates = final_estimates, CHImetadata = final_metadata, table_name = "birth", diff --git a/quarto_docs/Calculating_Prevalences.qmd b/quarto_docs/Calculating_Prevalences.qmd index 6cf6856..db37ad1 100644 --- a/quarto_docs/Calculating_Prevalences.qmd +++ b/quarto_docs/Calculating_Prevalences.qmd @@ -217,7 +217,27 @@ Here are two rows from `myestimates` that will show the structure and contents o pretty_kable(head(myestimates[sample(1:.N, 2)])) ``` -# Tidying Race/Ethnicity Categories +## Restoring Sequential Processing + +After completing the parallel processing job, it's important to restore the system returns to its normal state. + +```{r future-revert} +future::plan(future::sequential) +``` + +# Tidying `chi_calc()` output + +## Drop the `level` column + +The `chi_calc()` function returns a column called `level`, which contains the specific factor level for categorical variables. For example, if `indicator_key == 'fetal_pres'`, the function would return separate rows for each possible presentation: a row with `level == 'Breech'`, another with `level == 'Cephalic'`, and another with `level == 'Other'`. In these cases, we would use the `level` column to filter for our factor level of interest. + +In our current example, since `bw_norm` is not a categorical variable, the `level` column contains only `NA` values and can be safely dropped. + +```{r tidy-level} +myestimates[, level := NULL] +``` + +## Race/Ethnicity Due to the way that APDE decided to display race3 (Hispanic as ethnicity) and race4 (Hispanic as race), the calculation and preparation of these variables can be complex. We need to manipulate the results to align them with CHI standards. @@ -227,26 +247,27 @@ For future reference, all CHI standards can be found in `rads.data::misc_chi_byv #| warning: false #| message: false -# Update race4 categories -myestimates[tab %in% c('demgroups', 'crosstabs') & cat1_varname == 'race4', +# For race4 categories (both cat1 and cat2) +myestimates[tab %in% c('demgroups', 'crosstabs') & cat1_varname == 'race4', cat1 := "Birthing person's race"] -myestimates[tab %in% c('demgroups', 'crosstabs') & cat2_varname == 'race4', +myestimates[tab %in% c('demgroups', 'crosstabs') & cat2_varname == 'race4', cat2 := "Birthing person's race"] -# Update race3 categories - separate race and ethnicity -myestimates[tab %in% c('demgroups', 'crosstabs') & cat1_varname == 'race3' & cat1_group != 'Hispanic', +# For race3 categories - default to race, override for Hispanic ethnicity +myestimates[tab %in% c('demgroups', 'crosstabs') & cat1_varname == 'race3', cat1 := "Birthing person's race"] -myestimates[tab %in% c('demgroups', 'crosstabs') & cat1_varname == 'race3' & cat1_group == 'Hispanic', +myestimates[tab %in% c('demgroups', 'crosstabs') & cat1_varname == 'race3' & cat1_group == 'Hispanic', cat1 := "Birthing person's ethnicity"] -myestimates[tab %in% c('demgroups', 'crosstabs') & cat2_varname == 'race3' & cat2_group != 'Hispanic', + +myestimates[tab %in% c('demgroups', 'crosstabs') & cat2_varname == 'race3', cat2 := "Birthing person's race"] -myestimates[tab %in% c('demgroups', 'crosstabs') & cat2_varname == 'race3' & cat2_group == 'Hispanic', +myestimates[tab %in% c('demgroups', 'crosstabs') & cat2_varname == 'race3' & cat2_group == 'Hispanic', cat2 := "Birthing person's ethnicity"] -# Update trend data labels -myestimates[tab == 'trends' & cat1_varname %in% c('race3', 'race4'), +# Update trend data labels (both race3 and race4 at once) +myestimates[tab == 'trends' & cat1_varname %in% c('race3', 'race4'), cat1 := "Birthing person's race/ethnicity"] -myestimates[tab == 'trends' & cat2_varname %in% c('race3', 'race4'), +myestimates[tab == 'trends' & cat2_varname %in% c('race3', 'race4'), cat2 := "Birthing person's race/ethnicity"] ``` @@ -375,7 +396,7 @@ We need to save our results and metadata to the development SQL Server. Later, o ```{r save-sql} #| warning: false -chi_sql_update(CHIestimates = myestimates, +chi_update_sql(CHIestimates = myestimates, CHImetadata = mymetadata, table_name = 'junk', # replace with data source name, e.g., 'birth', 'brfss', etc. server = 'development', @@ -464,7 +485,7 @@ New functions you used: - `chi_generate_metadata()` to create an updated metadata table - `chi_qa_tro()` to perform quality assurance checks - `chi_compare_estimates()` to identify notable differences compared to previous estimates -- `chi_sql_update()` to save estimates and metadata to SQL Server +- `chi_update_sql()` to save estimates and metadata to SQL Server If you encounter issues or believe you've found a bug, please submit a GitHub issue at [Issues ยท PHSKC-APDE/apde.chi.tools](https://github.com/PHSKC-APDE/apde.chi.tools/issues). diff --git a/tests/testthat/test-chi_sql_update.R b/tests/testthat/test-chi_sql_update.R index 0ce77ad..3962127 100644 --- a/tests/testthat/test-chi_sql_update.R +++ b/tests/testthat/test-chi_sql_update.R @@ -1,9 +1,9 @@ -# Tests for chi_sql_update -test_that("chi_sql_update validates inputs", { +# Tests for chi_update_sql +test_that("chi_update_sql validates inputs", { test_data <- setup_test_data() expect_warning( - chi_sql_update( + chi_update_sql( CHIestimates = test_data$my.estimate, CHImetadata = test_data$my.metadata, table_name = 'JustTesting', @@ -13,11 +13,11 @@ test_that("chi_sql_update validates inputs", { "Validation may be flawed for the following variables because they are 100% missing" ) - expect_error(chi_sql_update(), + expect_error(chi_update_sql(), "The results table to push to SQL \\(CHIestimates\\) is missing") - expect_error(suppressWarnings(chi_sql_update(CHIestimates = test_data$my.estimate)), + expect_error(suppressWarnings(chi_update_sql(CHIestimates = test_data$my.estimate)), "The metadata table to push to SQL \\(CHImetadata\\) is missing") - expect_error(suppressWarnings(chi_sql_update(CHIestimates = test_data$my.estimate, + expect_error(suppressWarnings(chi_update_sql(CHIestimates = test_data$my.estimate, CHImetadata = test_data$my.metadata)), "The table_name argument is missing") }) From 72bde55ac5a7cf099eb172bdfc73bd5717575714 Mon Sep 17 00:00:00 2001 From: Danny Colombara Date: Fri, 28 Feb 2025 17:50:41 -0800 Subject: [PATCH 09/12] Improve chi_generate_metadata - now allows a specific indicator to be missing a specific bit of information. For example, in ACS medinc variable does not have latest_year_count --- R/chi_generate_metadata.R | 71 ++++++++++++++++++++++++++---------- man/chi_generate_metadata.Rd | 14 ++++++- 2 files changed, 64 insertions(+), 21 deletions(-) diff --git a/R/chi_generate_metadata.R b/R/chi_generate_metadata.R index a5e6801..0d0aca6 100644 --- a/R/chi_generate_metadata.R +++ b/R/chi_generate_metadata.R @@ -1,10 +1,13 @@ #' CHI Generate Metadata #' #' @description -#' function to generate metadata table combining existing metadata and latest estimates. +#' Function to generate metadata table combining existing metadata and latest estimates. #' #' @param meta.old Name of a data.table with the prior year's metadata #' @param est.current current year's tableau ready output with completed estimates +#' @param allowed_missing A list of lists specifying allowed missing metadata values. Each inner list +#' should have \code{indicator_key} and \code{column} elements. +#' @param warn_only Logical, if TRUE, only warns about unexpected missing values instead of stopping #' #' @return table of metadata #' @@ -19,30 +22,30 @@ #' @export #' chi_generate_metadata <- function(meta.old = NULL, - est.current = NULL){ + est.current = NULL, + allowed_missing = list( + list(indicator_key = "medinc", column = "latest_year_count") + # Add more exceptions here as needed + ), + warn_only = FALSE){ # Input validation ---- if (is.null(meta.old)) stop("\n\U1F6D1 meta.old must be provided") if (!is.data.frame(meta.old)) stop("\n\U1F6D1 meta.old must be a data.frame or data.table") - if (is.null(est.current)) stop("\n\U1F6D1 est.current must be provided") if (!is.data.frame(est.current)) stop("\n\U1F6D1 est.current must be a data.frame or data.table") - # Convert to data.table if needed ---- if (!is.data.table(meta.old)) setDT(meta.old) if (!is.data.table(est.current)) setDT(est.current) - # get new metadata ---- meta.new <- unique(est.current[tab == "metadata", list(indicator_key, - latest_yearx = as.integer(year), - latest_year_resultx = result, - run_datex = run_date, - latest_year_countx = as.integer(numerator), - latest_year_kc_popx = as.integer(denominator))]) - + latest_yearx = as.integer(year), + latest_year_resultx = result, + run_datex = run_date, + latest_year_countx = as.integer(numerator), + latest_year_kc_popx = as.integer(denominator))]) # merge new metadata onto old metadata ---- meta.new <- merge(meta.old, meta.new, by = c("indicator_key"), all = T) - # update with newest data ---- # only replace old data when there is new data because may stop calculating indicators, in which case, would want to keep old data meta.new[!is.na(latest_yearx), latest_year := as.numeric(latest_yearx)] @@ -51,11 +54,9 @@ chi_generate_metadata <- function(meta.old = NULL, meta.new[!is.na(latest_year_countx), latest_year_count := latest_year_countx] meta.new[!is.na(latest_year_kc_popx), latest_year_kc_pop := latest_year_kc_popx] meta.new[, c("latest_yearx", "latest_year_resultx", "run_datex", "latest_year_countx", "latest_year_kc_popx") := NULL] - # update valid_years ---- meta.new[as.integer(latest_year) > suppressWarnings(as.integer(rads::substrRight(valid_years, 1, 4))), valid_years := suppressWarnings(paste(as.integer(substr(valid_years, 1, 4)):as.integer(latest_year), collapse = " "))] - # Since trends only have 10 years of data, valid_years should be limited to 10 years max ---- meta.new[, valid_years := { allyears <- sort(as.integer(strsplit(valid_years, " ")[[1]])) # convert valid_years to a vector of numbers @@ -66,15 +67,47 @@ chi_generate_metadata <- function(meta.old = NULL, } }, by = indicator_key] - # Ensure there are no missing important metadata cells ---- - missing.per.col <- sapply(meta.new, function(x) sum(is.na(x))) - if(sum(missing.per.col) > 0){ - stop("You are missing at least one critical meta.new[] value.") + # Ensure there are no missing important metadata cells (with exceptions) ---- + unexpected_missing <- data.frame() + + for(col in names(meta.new)) { + na_rows <- which(is.na(meta.new[[col]])) + + if(length(na_rows) > 0) { + for(row in na_rows) { + ind_key <- meta.new$indicator_key[row] + + # Check if this is an allowed exception + is_allowed <- any(sapply(allowed_missing, function(exc) { + exc$indicator_key == ind_key && exc$column == col + })) + + if(!is_allowed) { + unexpected_missing <- rbind(unexpected_missing, data.frame( + indicator_key = ind_key, + column = col, + stringsAsFactors = FALSE + )) + } + } + } + } + + if(nrow(unexpected_missing) > 0) { + msg <- paste0("Unexpected missing metadata values:\n", + paste(apply(unexpected_missing, 1, function(row) { + paste0("Indicator '", row[1], "' is missing value for '", row[2], "'") + }), collapse = "\n")) + + if(warn_only) { + warning(paste0('\U00026A0 ', msg)) + } else { + stop(paste0('\U1F6D1 ', msg)) + } } # order metadata table ---- setorder(meta.new, indicator_key) - # return table ---- return(meta.new) } diff --git a/man/chi_generate_metadata.Rd b/man/chi_generate_metadata.Rd index 65c8a50..e53a5bb 100644 --- a/man/chi_generate_metadata.Rd +++ b/man/chi_generate_metadata.Rd @@ -4,18 +4,28 @@ \alias{chi_generate_metadata} \title{CHI Generate Metadata} \usage{ -chi_generate_metadata(meta.old = NULL, est.current = NULL) +chi_generate_metadata( + meta.old = NULL, + est.current = NULL, + allowed_missing = list(list(indicator_key = "medinc", column = "latest_year_count")), + warn_only = FALSE +) } \arguments{ \item{meta.old}{Name of a data.table with the prior year's metadata} \item{est.current}{current year's tableau ready output with completed estimates} + +\item{allowed_missing}{A list of lists specifying allowed missing metadata values. Each inner list +should have \code{indicator_key} and \code{column} elements.} + +\item{warn_only}{Logical, if TRUE, only warns about unexpected missing values instead of stopping} } \value{ table of metadata } \description{ -function to generate metadata table combining existing metadata and latest estimates. +Function to generate metadata table combining existing metadata and latest estimates. } \seealso{ \code{\link{chi_calc}} for generating estimates From a78bb3730b9a9d0b2bc57be5ca59f93412479474 Mon Sep 17 00:00:00 2001 From: Danny Colombara Date: Fri, 28 Feb 2025 17:52:37 -0800 Subject: [PATCH 10/12] chi_generate_tro_shell etter with race4 - does not allow race4 to be 'Ethnicity' ... another of many tweaks to deal with insanity of race3/race4 --- R/chi_generate_tro_shell.R | 2 ++ 1 file changed, 2 insertions(+) diff --git a/R/chi_generate_tro_shell.R b/R/chi_generate_tro_shell.R index be6aa35..05bc574 100644 --- a/R/chi_generate_tro_shell.R +++ b/R/chi_generate_tro_shell.R @@ -94,6 +94,8 @@ chi_generate_tro_shell <- function(ph.analysis_set, #advisory messages if("x" %in% ph.analysis_set$trends) {message("Note: trends are applied backwards from end.year")} + # Race / ethnicity is a chronic headache with CHI. Need to remove rows for race4 & Ethnicity because should be Race/ethnicity + ph.analysis_set <- ph.analysis_set[!(cat1_varname == 'race4' & cat1 == 'Ethnicity')] # apply the template generating function template <- rbindlist( From e43c3d02d72800abe527e2b5a5724e45c8f6a62d Mon Sep 17 00:00:00 2001 From: Danny Colombara Date: Fri, 28 Feb 2025 17:53:58 -0800 Subject: [PATCH 11/12] more functionality for chi_gerate_analysis_set - previously only could read relevant tables from prod server - Now can use a data.frame/data.table in memory to make an anlysis set --- R/chi_generate_analysis_set.R | 118 +++++++++++------- man/chi_generate_analysis_set.Rd | 52 ++++---- .../testthat/test-chi_generate_analysis_set.R | 5 +- 3 files changed, 105 insertions(+), 70 deletions(-) diff --git a/R/chi_generate_analysis_set.R b/R/chi_generate_analysis_set.R index e92825b..42a2be1 100644 --- a/R/chi_generate_analysis_set.R +++ b/R/chi_generate_analysis_set.R @@ -2,40 +2,45 @@ #' #' @description #' Creates sets of indicator keys that share common analysis patterns within CHI -#' (Community Health Indicators) data. This function reads the most recent production -#' version of CHI estimates for a specified data source and groups indicators that -#' use the same combinations of \code{cat1}, \code{cat1_varname}, and \code{trends} -#' columns. +#' (Community Health Indicators) data. This function can either: +#' +#' 1. Read the most recent production version of CHI estimates from the TSQL Production Server +#' for a specified data source, or +#' +#' 2. Process a provided data.table/data.frame containing CHI data +#' +#' It groups indicators that use the same combinations of \code{cat1}, \code{cat1_varname}, and +#' tab columns (\code{_kingcounty}, \code{_wastate}, \code{demgroups}, \code{crosstabs}, \code{trends}). #' #' @param data_source Character string (length 1) specifying the data source to analyze #' (e.g., \code{'birth'}, \code{'brfss'}, \code{'chars'}, \code{'death'}, \code{'hys'}). This corresponds to a table -#' name in the PHExtractStore database. +#' name in the PHExtractStore database on the TSQL Production Server. Required if \code{CHIestimates} is not provided. +#' @param CHIestimates A data.table or data.frame containing CHI data to analyze. If provided, \code{data_source} +#' is ignored. Must comply with the latest CHI standard, which can be assessed/tested with the \code{chi_qa_tro} function in this package. #' #' @return A data.table containing analysis sets with the following columns: #' \itemize{ #' \item \code{set}: Integer identifying groups of indicators with identical analysis patterns #' \item \code{cat1}: Category name as expected in CHI TRO #' \item \code{cat1_varname}: Category variable name as expected in CHI TRO -#' \item \code{_kingcounty}: \code{'x'} if analysis includes King County tab -#' \item \code{_wastate}: \code{'x'} if analysis includes Washington State tab -#' \item \code{demgroups}: \code{'x'} if analysis includes demographic groups -#' \item \code{crosstabs}: \code{'x'} if analysis includes crosstabulations -#' \item \code{trends}: \code{'x'} if analysis includes trends analysis +#' \item \code{_kingcounty}: \code{'x'} if analysis includes '_kingcounty' in the tab column +#' \item \code{_wastate}: \code{'x'} if analysis includes '_wastate' in the tab column +#' \item \code{demgroups}: \code{'x'} if analysis includes 'demgroups' in the tab column +#' \item \code{crosstabs}: \code{'x'} if analysis includes 'crosstabs' in the tab column +#' \item \code{trends}: \code{'x'} if analysis includes 'trends' in the tab column #' \item \code{set_indicator_keys}: Comma-separated list of indicator keys sharing the pattern #' } #' #' @details #' This function generates a table for \code{\link{chi_generate_tro_shell}}, providing the -#' structure required for generating analysis instructions. It connects to the -#' \code{[PHExtractStore]} database on \code{KCITSQLPRPHIP40} to retrieve the latest production data. +#' structure required for generating analysis instructions. +#' +#' When using \code{data_source}, the function connects to the +#' \code{[PHExtractStore]} database on the TSQL Production Server \code{KCITSQLPRPHIP40} to retrieve the latest production estimates. #' Users need appropriate database credentials - contact your manager if you need access. #' -#' Typically, the analysis will not change from year to year. However, you should -#' compare the output of this function with the 'CHI-Variables_ByDataSource' worksheet -#' in -#' \href{https://kc1.sharepoint.com/teams/DPH-CommunityHealthIndicators/CHIVizes/CHI-Standards-TableauReady\%20Output.xlsx}{ -#' SharePoint > Community Health Indicators > CHI_vizes > CHI-Standards-TableauReady Output.xlsx} -#' and update it as necessary. +#' When using \code{CHIestimates}, the provided estimates are processed directly, bypassing the database connection. +#' The data must comply with the latest CHI standard, which can be assessed/tested with the \code{chi_qa_tro} function. #' #' The output structure directly informs \code{\link{chi_generate_tro_shell}} about which #' indicators should be analyzed together based on their shared patterns of @@ -52,28 +57,46 @@ #' #' @examples #' \dontrun{ -#' # Generate analysis sets for birth data +#' # Generate analysis sets for birth data from the database #' birth_sets <- chi_generate_analysis_set("birth") #' -#' # Generate analysis sets for BRFSS data +#' # Generate analysis sets for BRFSS data from the database #' brfss_sets <- chi_generate_analysis_set("brfss") +#' +#' # Generate analysis sets from an existing data.table +#' my_data <- data.table::fread("my_chi_data.csv") +#' custom_sets <- chi_generate_analysis_set(CHIestimates = my_data) #' } #' -chi_generate_analysis_set <- function(data_source = NULL) { - # Input validation - if (is.null(data_source)) { - stop("\n\U1F6D1 data_source parameter must be provided") +chi_generate_analysis_set <- function(data_source = NULL, + CHIestimates = NULL) { + + # Input validation ---- + if (is.null(data_source) && is.null(CHIestimates)) { + stop("\n\U1F6D1 Either data_source parameter or CHIestimates parameter must be provided") + } + + if (!is.null(data_source) && !is.null(CHIestimates)) { + warning("\n\U26A0 Both data_source and CHIestimates provided. Using CHIestimates and ignoring data_source.") } - if (!is.character(data_source) || length(data_source) != 1) { + + if (!is.null(data_source) && !(is.character(data_source) && length(data_source) == 1)) { stop("\n\U1F6D1 data_source must be a single character string, e.g., 'birth'") } + if (!is.null(CHIestimates) && !inherits(CHIestimates, c("data.table", "data.frame"))) { + stop("\n\U1F6D1 CHIestimates must be a data.table or data.frame") + } + # Get data ---- - # Construct the full table name for error messages + if (!is.null(CHIestimates)) { + CHIestimates <- data.table::setDT(data.table::copy(CHIestimates)) + } else { + # Construct the full table name for error messages full_table_name <- paste0("[APDE].[", data_source, "_results]") server_info <- "KCITSQLPRPHIP40 > PHExtractStore" - # try to make a database connection + # try to make a database connection tryCatch({ # Establish connection cnxn <- odbc::dbConnect( @@ -92,55 +115,56 @@ chi_generate_analysis_set <- function(data_source = NULL) { } }) - - # Check if table exists before attempting query + # Check if table exists before attempting query table_exists <- DBI::dbExistsTable( conn = cnxn, name = paste0(data_source, "_results"), schema = "APDE" ) - # If table exists, load into memory + # If table exists, load into memory if (!table_exists) { stop(paste0("\U1F6D1 You specified data_source = '", data_source, "', which attempted to download ", full_table_name, " from ", server_info, ". This table does not exist.")) } else { - tempdt <- data.table::setDT(DBI::dbGetQuery( - conn = cnxn, - statement = paste0("SELECT * FROM ", full_table_name) - )) } + CHIestimates <- data.table::setDT(DBI::dbGetQuery( + conn = cnxn, + statement = paste0("SELECT * FROM ", full_table_name) + )) + } - # Close database connection + # Close database connection if (exists("cnxn") && !is.null(cnxn)) { DBI::dbDisconnect(cnxn) } + } # Recodes for race3 & race4 ---- # Necessary because they are wonky as heck due to how APDE decided to code/display them - race3_remix1 <- tempdt[(grepl('race/ethnicity$', cat1, ignore.case = T) & cat1_varname == 'race3')] - tempdt <- rbind( - fsetdiff(tempdt, race3_remix1), + race3_remix1 <- CHIestimates[(grepl('race/ethnicity$', cat1, ignore.case = T) & cat1_varname == 'race3')] + CHIestimates <- rbind( + fsetdiff(CHIestimates, race3_remix1), copy(race3_remix1)[, cat1 := gsub('race/ethnicity$', 'race', cat1)][, cat1 := gsub('Race/ethnicity$', 'Race', cat1)], copy(race3_remix1)[, cat1 := gsub('race/ethnicity$', 'ethnicity', cat1)][, cat1 := gsub('Race/ethnicity$', 'Ethnicity', cat1)] ) - race3_remix2 <- tempdt[(grepl('race/ethnicity$', cat2, ignore.case = T) & cat2_varname == 'race3')] - tempdt <- rbind( - fsetdiff(tempdt, race3_remix2), + race3_remix2 <- CHIestimates[(grepl('race/ethnicity$', cat2, ignore.case = T) & cat2_varname == 'race3')] + CHIestimates <- rbind( + fsetdiff(CHIestimates, race3_remix2), copy(race3_remix2)[, cat2 := gsub('race/ethnicity$', 'race', cat2)][, cat2 := gsub('Race/ethnicity$', 'Race', cat2)], copy(race3_remix2)[, cat2 := gsub('race/ethnicity$', 'ethnicity', cat2)][, cat2 := gsub('Race/ethnicity$', 'Ethnicity', cat2)] ) - tempdt[cat1_varname == 'race4', cat1 := gsub('race$', 'race/ethnicity', cat1)] - tempdt[cat1_varname == 'race4', cat1 := gsub('Race$', 'Race/ethnicity', cat1)] + CHIestimates[cat1_varname == 'race4', cat1 := gsub('race$', 'race/ethnicity', cat1)] + CHIestimates[cat1_varname == 'race4', cat1 := gsub('Race$', 'Race/ethnicity', cat1)] - tempdt[cat2_varname == 'race4', cat2 := gsub('race$', 'race/ethnicity', cat2)] - tempdt[cat2_varname == 'race4', cat2 := gsub('Race$', 'Race/ethnicity', cat2)] + CHIestimates[cat2_varname == 'race4', cat2 := gsub('race$', 'race/ethnicity', cat2)] + CHIestimates[cat2_varname == 'race4', cat2 := gsub('Race$', 'Race/ethnicity', cat2)] # Table of categories and tabs per indicator ---- # For cat1 combinations - tab_patterns <- tempdt[, list( + tab_patterns <- CHIestimates[, list( `_kingcounty` = fifelse(any(tab == "_kingcounty"), "x", ""), `_wastate` = fifelse(any(tab == "_wastate"), "x", ""), demgroups = fifelse(any(tab == "demgroups"), "x", ""), @@ -149,7 +173,7 @@ chi_generate_analysis_set <- function(data_source = NULL) { ), by = list(indicator_key, cat1, cat1_varname)] # For cat2 combinations - tab_patterns2 <- tempdt[!is.na(cat2), list( + tab_patterns2 <- CHIestimates[!is.na(cat2), list( `_kingcounty` = fifelse(any(tab == "_kingcounty"), "x", ""), `_wastate` = fifelse(any(tab == "_wastate"), "x", ""), demgroups = fifelse(any(tab == "demgroups"), "x", ""), diff --git a/man/chi_generate_analysis_set.Rd b/man/chi_generate_analysis_set.Rd index e8e45fd..1de8132 100644 --- a/man/chi_generate_analysis_set.Rd +++ b/man/chi_generate_analysis_set.Rd @@ -4,12 +4,15 @@ \alias{chi_generate_analysis_set} \title{Generate Analysis Sets for CHI} \usage{ -chi_generate_analysis_set(data_source = NULL) +chi_generate_analysis_set(data_source = NULL, CHIestimates = NULL) } \arguments{ \item{data_source}{Character string (length 1) specifying the data source to analyze (e.g., \code{'birth'}, \code{'brfss'}, \code{'chars'}, \code{'death'}, \code{'hys'}). This corresponds to a table -name in the PHExtractStore database.} +name in the PHExtractStore database on the TSQL Production Server. Required if \code{CHIestimates} is not provided.} + +\item{CHIestimates}{A data.table or data.frame containing CHI data to analyze. If provided, \code{data_source} +is ignored. Must comply with the latest CHI standard, which can be assessed/tested with the \code{chi_qa_tro} function in this package.} } \value{ A data.table containing analysis sets with the following columns: @@ -17,33 +20,36 @@ A data.table containing analysis sets with the following columns: \item \code{set}: Integer identifying groups of indicators with identical analysis patterns \item \code{cat1}: Category name as expected in CHI TRO \item \code{cat1_varname}: Category variable name as expected in CHI TRO - \item \code{_kingcounty}: \code{'x'} if analysis includes King County tab - \item \code{_wastate}: \code{'x'} if analysis includes Washington State tab - \item \code{demgroups}: \code{'x'} if analysis includes demographic groups - \item \code{crosstabs}: \code{'x'} if analysis includes crosstabulations - \item \code{trends}: \code{'x'} if analysis includes trends analysis + \item \code{_kingcounty}: \code{'x'} if analysis includes '_kingcounty' in the tab column + \item \code{_wastate}: \code{'x'} if analysis includes '_wastate' in the tab column + \item \code{demgroups}: \code{'x'} if analysis includes 'demgroups' in the tab column + \item \code{crosstabs}: \code{'x'} if analysis includes 'crosstabs' in the tab column + \item \code{trends}: \code{'x'} if analysis includes 'trends' in the tab column \item \code{set_indicator_keys}: Comma-separated list of indicator keys sharing the pattern } } \description{ Creates sets of indicator keys that share common analysis patterns within CHI -(Community Health Indicators) data. This function reads the most recent production -version of CHI estimates for a specified data source and groups indicators that -use the same combinations of \code{cat1}, \code{cat1_varname}, and \code{trends} -columns. +(Community Health Indicators) data. This function can either: + +1. Read the most recent production version of CHI estimates from the TSQL Production Server + for a specified data source, or + +2. Process a provided data.table/data.frame containing CHI data + +It groups indicators that use the same combinations of \code{cat1}, \code{cat1_varname}, and +tab columns (\code{_kingcounty}, \code{_wastate}, \code{demgroups}, \code{crosstabs}, \code{trends}). } \details{ This function generates a table for \code{\link{chi_generate_tro_shell}}, providing the -structure required for generating analysis instructions. It connects to the -\code{[PHExtractStore]} database on \code{KCITSQLPRPHIP40} to retrieve the latest production data. +structure required for generating analysis instructions. + +When using \code{data_source}, the function connects to the +\code{[PHExtractStore]} database on the TSQL Production Server \code{KCITSQLPRPHIP40} to retrieve the latest production estimates. Users need appropriate database credentials - contact your manager if you need access. -Typically, the analysis will not change from year to year. However, you should -compare the output of this function with the 'CHI-Variables_ByDataSource' worksheet -in -\href{https://kc1.sharepoint.com/teams/DPH-CommunityHealthIndicators/CHIVizes/CHI-Standards-TableauReady\%20Output.xlsx}{ -SharePoint > Community Health Indicators > CHI_vizes > CHI-Standards-TableauReady Output.xlsx} -and update it as necessary. +When using \code{CHIestimates}, the provided estimates are processed directly, bypassing the database connection. +The data must comply with the latest CHI standard, which can be assessed/tested with the \code{chi_qa_tro} function. The output structure directly informs \code{\link{chi_generate_tro_shell}} about which indicators should be analyzed together based on their shared patterns of @@ -51,11 +57,15 @@ categories and analysis types. } \examples{ \dontrun{ -# Generate analysis sets for birth data +# Generate analysis sets for birth data from the database birth_sets <- chi_generate_analysis_set("birth") -# Generate analysis sets for BRFSS data +# Generate analysis sets for BRFSS data from the database brfss_sets <- chi_generate_analysis_set("brfss") + +# Generate analysis sets from an existing data.table +my_data <- data.table::fread("my_chi_data.csv") +custom_sets <- chi_generate_analysis_set(CHIestimates = my_data) } } diff --git a/tests/testthat/test-chi_generate_analysis_set.R b/tests/testthat/test-chi_generate_analysis_set.R index 011233e..c5a0e95 100644 --- a/tests/testthat/test-chi_generate_analysis_set.R +++ b/tests/testthat/test-chi_generate_analysis_set.R @@ -1,5 +1,6 @@ # Tests for chi_generate_analysis_set test_that("chi_generate_analysis_set validates inputs", { - expect_error(chi_generate_analysis_set(), "data_source parameter must be provided") - expect_error(chi_generate_analysis_set(123), "data_source must be a single character string") + expect_error(chi_generate_analysis_set(), "Either data_source parameter or CHIestimates parameter must be provided") + expect_error(chi_generate_analysis_set(data_source = 123), "data_source must be a single character string") + expect_error(chi_generate_analysis_set(CHIestimates = 123), "CHIestimates must be a data.table or data.frame") }) From b3ce4f207bfa3ba1e7ffcc0432785d6edef9345d Mon Sep 17 00:00:00 2001 From: Danny Colombara Date: Fri, 28 Feb 2025 17:57:22 -0800 Subject: [PATCH 12/12] Improved chi_qa_tro - allow for proper rounding based on whether integer, proportion, or rate - ensures there are no gaps in single years for trends - warns if there appears to be more than one distinct multi year 'year' value - streamlined messaging and warnings depdenent upon verbose argument --- R/chi_qa_tro.R | 598 +++++++++++++++++++++++++++++-------------------- 1 file changed, 357 insertions(+), 241 deletions(-) diff --git a/R/chi_qa_tro.R b/R/chi_qa_tro.R index 328f6b0..17a1db1 100644 --- a/R/chi_qa_tro.R +++ b/R/chi_qa_tro.R @@ -56,68 +56,69 @@ chi_qa_tro <- function(CHIestimates, verbose = TRUE){ status <- 1 + ## Helper functions + report_issue <- function(message) { + status <<- 0 + if(verbose) { + warning(message) + } + return(invisible(NULL)) + } + + report_message <- function(message) { + if(verbose) { + message(message) + } + return(invisible(NULL)) + } + ## Check arguments ---- if(!is.logical(verbose)){ stop('verbose must a logical, i.e., TRUE or FALSE') } - - if(verbose){ - message("Checking that that `acs` is logical") - } + report_message("Checking that that `acs` is logical") if(!is.logical(acs)){ stop('acs must a logical, i.e., TRUE or FALSE') } - - if(verbose){ - message("Checking that both the results and the metadata were provided") + report_message("Checking that both the results and the metadata were provided") + if(is.null(CHIestimates)){ + status <- 0 + if(verbose) { + warning("You must provide the name of a data.frame or data.table that contains the CHI results.") + } } - if(is.null(CHIestimates)){ - status <- 0 - if(verbose) { - warning("You must provide the name of a data.frame or data.table that contains the CHI results.") - } + if(is.null(CHImetadata)){ + status <- 0 + if(verbose){ + warning("You must provide the name of a data.frame or data.table that contains the CHI metadata ") } - if(is.null(CHImetadata)){ - status <- 0 - if(verbose){ - warning("You must provide the name of a data.frame or data.table that contains the CHI metadata ") - } + } + #if both data sets are not provided, abort check + if(status == 0) { + if(verbose){ + stop("Check incomplete. Please correct errors to proceed") } - #if both data sets are not provided, abort check - if(status == 0) { - if(verbose){ - stop("Check incomplete. Please correct errors to proceed") - } - return(status) - } + return(status) + } - CHIestimates <- data.table::setDT(copy(CHIestimates)) - CHImetadata <- data.table::setDT(copy(CHImetadata)) + CHIestimates <- data.table::setDT(copy(CHIestimates)) + CHImetadata <- data.table::setDT(copy(CHImetadata)) ## Check columns ---- - if(verbose) { - message("Checking that all column names are unique") - } + report_message("Checking that all column names are unique") if(length(names(CHIestimates)) != length(unique(names(CHIestimates)))) { - status <- 0 - if(verbose){ - warning("You submitted a dataset where at least two columns have the same name. All names in CHIestimates must be unique.") - } + report_issue("You submitted a dataset where at least two columns have the same name. All names in CHIestimates must be unique.") } + if(length(names(CHImetadata)) != length(unique(names(CHImetadata)))) { - status <- 0 - if(verbose){ - warning("You submitted a metadata table where at least two columns have the same name. All names in CHImetadata must be unique.") - } + report_issue("You submitted a metadata table where at least two columns have the same name. All names in CHImetadata must be unique.") } - if(verbose){ - message("Checking that all necessary columns exist") - } + report_message("Checking that all necessary columns exist") missing.var <- setdiff(chi_get_cols(), names(CHIestimates)) if(length(missing.var) > 0){ status <- 0 @@ -136,9 +137,7 @@ chi_qa_tro <- function(CHIestimates, } } - if(verbose){ - message("Checking for unexpected columns") - } + report_message("Checking for unexpected columns") missing.var <- setdiff(names(CHIestimates), chi_get_cols()) if(length(missing.var) > 0){ status <- 0 @@ -160,25 +159,16 @@ chi_qa_tro <- function(CHIestimates, } ## Confirm variable class ---- - if(verbose){ - message("Checking that variables are of the proper class") - } - - if(verbose){ - message("Validating CHI estimates: ") - } + report_message("Checking that variables are of the proper class") + report_message("Validating CHI estimates: ") rads::tsql_validate_field_types(ph.data = CHIestimates, field_types = unlist(chi_get_yaml()$vars)) # check CHI estimate table - if(verbose){ - message(paste("", "Validating CHI metadata: ", sep = "\n")) - } + report_message(paste("", "Validating CHI metadata: ", sep = "\n")) rads::tsql_validate_field_types(ph.data = CHImetadata, field_types = unlist(chi_get_yaml()$metadata)) # check CHI metadata table if(verbose) message(paste("", "", sep = "\n")) ## Check for missingness ---- - if(verbose){ - message("Checking that critical columns are not missing any values") - } + report_message("Checking that critical columns are not missing any values") for(mycol in c("indicator_key", "year", "data_source", "tab", "cat1", "cat1_group", "run_date")){ if(nrow(CHIestimates[is.na(get(mycol))]) > 0){ @@ -188,12 +178,12 @@ chi_qa_tro <- function(CHIestimates, } if(isFALSE(acs)) - for(mycol in c("numerator", "denominator")){ - if(nrow(CHIestimates[is.na(get(mycol)) & is.na(suppression)]) > 0){ - status <- 0 - warning(paste0("\U00026A0 Warning: '", mycol, "' is missing in at least one row of the CHI data.")) + for(mycol in c("numerator", "denominator")){ + if(nrow(CHIestimates[is.na(get(mycol)) & is.na(suppression)]) > 0){ + status <- 0 + warning(paste0("\U00026A0 Warning: '", mycol, "' is missing in at least one row of the CHI data.")) + } } - } for(mycol in c("result", "lower_bound", "upper_bound", "se", "chi", "source_date", "run_date")){ if(nrow(CHIestimates[is.na(get(mycol)) & is.na(suppression)]) > 0){ @@ -210,7 +200,7 @@ chi_qa_tro <- function(CHIestimates, for(mycol in names(unlist(chi_get_yaml()$metadata))){ if(nrow(CHImetadata[is.na(get(mycol))]) > 0){ status <- 0 - warning(paste0("'", mycol, "' is missing in at least one row but is a critical identifier column in CHI metadata. \n", "Fix the error and run this QA script again.")) + warning(paste0("'", mycol, "' is missing in at least one row but is a critical identifier column in CHI metadata.")) } } @@ -227,23 +217,19 @@ chi_qa_tro <- function(CHIestimates, ## Basic logic checks for estimates ---- - if(verbose){ - message("Checking for infinite values, which cannot be pushed to SQL") - } + report_message("Checking for infinite values, which cannot be pushed to SQL") for(var in c("result", "lower_bound", "upper_bound", "se", "rse", "numerator", "denominator")){ if(nrow(CHIestimates[is.infinite(get(var))]) > 0 ){ status <- 0 if(verbose){ - warning(glue::glue("There is at least one row where is.infinite({var}) == T. + warning(glue::glue("There is at least one row where is.infinite({var}) == T. Please fix this problem before rerunning chi_qa_tro() (e.g., by setting it equal to NA) You can view the problematic data by typing something like: View(CHIestimates[is.infinite({var}), ])")) } } } - if(verbose){ - message("Checking that proportions are between zero and one") - } + report_message("Checking that proportions are between zero and one") CHIestimates <- merge(CHIestimates, CHImetadata[, list(indicator_key, result_type)], by = "indicator_key", all.x = TRUE, all.y = FALSE) # merge on result_type if(nrow(CHIestimates[result_type=="proportion" & !result %between% c(0, 1)]) > 0){ status <- 0 @@ -253,9 +239,7 @@ chi_qa_tro <- function(CHIestimates, } } - if(verbose){ - message("Checking if upper_bound is greater than lower_bound") - } + report_message("Checking if upper_bound is greater than lower_bound") if(nrow(CHIestimates[upper_bound < lower_bound, ])){ status <- 0 if(verbose){ @@ -265,9 +249,7 @@ chi_qa_tro <- function(CHIestimates, } } - if(verbose){ - message("Checking that result is less than or equal to the upper bound") - } + report_message("Checking that result is less than or equal to the upper bound") if(nrow(CHIestimates[!(result <= upper_bound)])){ status <- 0 if(verbose){ @@ -277,9 +259,7 @@ chi_qa_tro <- function(CHIestimates, } } - if(verbose){ - message("Checking that result is greater than or equal to the lower_bound") - } + report_message("Checking that result is greater than or equal to the lower_bound") if(nrow(CHIestimates[!(result >= lower_bound)])){ status <- 0 if(verbose){ @@ -288,34 +268,29 @@ chi_qa_tro <- function(CHIestimates, You can view the problematic data by typing something like: View(CHIestimates[!(result >= lower_bound)])") } } - if(verbose){ - message("Checking that lower_bound is not less than zero") - } + + report_message("Checking that lower_bound is not less than zero") if(nrow(CHIestimates[lower_bound < 0])){ status <- 0 if(verbose){ - warning("There is at least one row where the lower_bound is less than zero (i.e., it is negative). + warning("There is at least one row where the lower_bound is less than zero (i.e., it is negative). Please fix this error prior to rerunning the chi_qa_tro() function. You can view the problematic data by typing something like: View(CHIestimates[lower_bound < 0])") } } - if(verbose){ - message("Checking that RSE is between 0 and 100") - } - # confirmed with Abby 2/7/2020 that want RSE * 100 + report_message("Checking that RSE is between 0 and 100") + # Modified to only check non-NA RSE values if(nrow(CHIestimates[!is.na(rse) & !rse %between% c(0, 100)]) > 0 ){ status <- 0 if(verbose){ warning(paste("There is at least one row where the RSE (relative standard error) is outside the range of (0, 100].", - "This is not necessarily an error, but you should examine the data to make sure it makes sense.", - "You can view the data in question by typing something like: View(CHIestimates[!rse %between% c(0, 100)])", sep = "\n")) + "This is not necessarily an error, but you should examine the data to make sure it makes sense.", + "You can view the data in question by typing something like: View(CHIestimates[!is.na(rse) & !rse %between% c(0, 100)])", sep = "\n")) } } - if(verbose){ - message("Checking that RSE is on scale of 0-100 (i.e., the proportion should have been multiplied by 100)") - } + report_message("Checking that RSE is on scale of 0-100 (i.e., the proportion should have been multiplied by 100)") if(nrow(CHIestimates[!is.na(rse)]) == nrow(CHIestimates[rse <=1])){ status <- 0 if(verbose){ @@ -325,71 +300,19 @@ chi_qa_tro <- function(CHIestimates, } } - if(verbose){ - message("Checking that caution flag exists if RSE >= 30%") - } - if(nrow(CHIestimates[!is.na(rse) & rse>=30 & (caution != "!" | is.na(caution)) ]) > 0 ){ + report_message("Checking that caution flag exists if RSE >= 30%") + # Modified to only check non-NA RSE values + if(nrow(CHIestimates[(numerator == 0 | (!is.na(rse) & rse>=30)) & (caution != "!" | is.na(caution)) ]) > 0 ){ status <- 0 if(verbose){ - warning("There is at least one row where a caution flag ('!') is not used and rse >= 30% or is.na(rse) == T. + warning("There is at least one row where a caution flag ('!') is not used and rse >= 30%. Please fix this error prior to rerunning the chi_qa_tro() function. - You can view the problematic data by typing something like: View(CHIestimates[(rse>=30 | is.na(rse)) & (caution != '!' | is.na(caution))])") - } - } - - if(verbose){ - message("Checking for proper rounding") - } - if(verbose){ - message("Checking that result is rounded to three digits") - } - if(sum(CHIestimates$result != rads::round2(CHIestimates$result, 3), na.rm = T) != 0) { - status <- 0 - if(verbose){ - warning("The 'result' column does not appear to be rounded to 3 digits, as specified in the CHI standards") - } - } - if(verbose){ - message("Checking that lower_bound is rounded to three digits") - } - if(sum(CHIestimates$lower_bound != rads::round2(CHIestimates$lower_bound, 3), na.rm = T) != 0) { - status <- 0 - if(verbose){ - warning("The 'lower_bound' column does not appear to be rounded to 3 digits, as specified in the CHI standards") - } - } - if(verbose){ - message("checking that upper_bound is rounded to three digits") - } - if(sum(CHIestimates$upper_bound != rads::round2(CHIestimates$upper_bound, 3), na.rm = T) != 0) { - status <- 0 - if(verbose){ - warning("The 'upper_bound' column does not appear to be rounded to 3 digits, as specified in the CHI standards") - } - } - if(verbose){ - message("Checking that rse is rounded to three digits") - } - if(sum(CHIestimates$rse != rads::round2(CHIestimates$rse, 3), na.rm = T) != 0) { - status <- 0 - if(verbose){ - warning("The 'rse' column does not appear to be rounded to 3 digits, as specified in the CHI standards") + You can view the problematic data by typing something like: View(CHIestimates[!is.na(rse) & rse>=30 & (caution != '!' | is.na(caution))])") } } - if(verbose){ - message("Checking that se is rounded to four digits") - } - if(sum(CHIestimates$se != rads::round2(CHIestimates$se, 4), na.rm = T) != 0) { - status <- 0 - if(verbose){ - warning("The 'se' column does not appear to be rounded to 3 digits, as specified in the CHI standards") - } - } - if(verbose){ - message("Check that all observations have indicators, categories, tab, and year") - } + report_message("Check that all observations have indicators, categories, tab, and year") for(var in c("indicator_key", "tab", "year", "cat1", "cat1_group", "source_date", "run_date")){ if(nrow(CHIestimates[is.na(get(var))]) > 0 ){ status <- 0 @@ -399,6 +322,7 @@ chi_qa_tro <- function(CHIestimates, } } } + if(isFALSE(acs)){ if(nrow(CHIestimates[is.na(cat1_varname)]) > 0 ){ status <- 0 @@ -408,6 +332,7 @@ chi_qa_tro <- function(CHIestimates, } } } + for(var in c("cat2", "cat2_group")){ if(nrow(CHIestimates[tab=="crosstabs" & is.na(get(var))]) > 0 ){ status <- 0 @@ -417,6 +342,7 @@ chi_qa_tro <- function(CHIestimates, } } } + if(isFALSE(acs)){ if(nrow(CHIestimates[tab=="crosstabs" & is.na(cat2_varname)]) > 0 ){ status <- 0 @@ -427,9 +353,8 @@ chi_qa_tro <- function(CHIestimates, } } - if(verbose){ - message("Checking that results are present if row is not suppressed") - } + report_message("Checking that results are present if row is not suppressed") + # Modified to handle rse separately, allowing NA when numerator is 0 for(var in c("result", "lower_bound", "upper_bound", "se", "numerator", "denominator")){ if(nrow(CHIestimates[suppression != "^" & is.na(get(var))]) > 0 ){ status <- 0 @@ -440,120 +365,312 @@ chi_qa_tro <- function(CHIestimates, } } + # Special check for rse, allowing NA when numerator is 0 if(nrow(CHIestimates[suppression != "^" & is.na(rse) & numerator != 0]) > 0 ){ status <- 0 if(verbose){ warning(glue::glue("There is at least one row that is not suppressed, where 'rse' is missing and numerator is not 0. - Please fill in the missing value before rerunning chi_qa_tro()")) + Please fill in the missing value before rerunning chi_qa_tro()")) } } - ## Ensure cat1/cat2 values meet CHI standards ---- - if(verbose) { - message("Checking that category combinations align with CHI standards") - } + ## Check rounding ---- + report_message("Checking for proper rounding based on result_type") - # Get reference data - ref_combos <- rads.data::misc_chi_byvars[, list(cat, varname, group)] + # Make sure we have result_type information for each observation + if(!"result_type" %in% names(CHIestimates)) { + # Merge only if not already merged above + CHIestimates <- merge(CHIestimates, CHImetadata[, list(indicator_key, result_type)], + by = "indicator_key", all.x = TRUE, all.y = FALSE) + } - # For ACS data, we only check cat and group combinations - if(acs) { - # Check cat1 combinations - chi_cat1_combos <- unique(CHIestimates[!is.na(cat1), list( - cat = cat1, - group = cat1_group - )]) + # Check result rounding + report_message("Checking that result is rounded correctly (3 decimals for proportions, 1 decimal for rates)") + proportions_wrong_rounding <- CHIestimates[result_type == "proportion" & + !is.na(result) & + result != rads::round2(result, 3)] + + rates_wrong_rounding <- CHIestimates[result_type == "rate" & + !is.na(result) & + result != rads::round2(result, 1)] + + integers_wrong_rounding <- CHIestimates[result_type == "integer" & + !is.na(result) & + result != round(result)] + + if(nrow(proportions_wrong_rounding) > 0 || + nrow(rates_wrong_rounding) > 0 || + nrow(integers_wrong_rounding) > 0) { + status <- 0 + if(verbose){ + warning("The 'result' column has incorrect rounding. Proportions should be rounded to 3 decimal places, + rates to 1 decimal place, and integers to 0 decimal places.") + } + } - cat1_invalid <- chi_cat1_combos[!unique(ref_combos[, list(cat, group)]), on = list(cat, group)] + # Check lower_bound rounding + report_message("Checking that lower_bound has same rounding as result") + proportions_wrong_rounding <- CHIestimates[result_type == "proportion" & + !is.na(lower_bound) & + lower_bound != rads::round2(lower_bound, 3)] + + rates_wrong_rounding <- CHIestimates[result_type == "rate" & + !is.na(lower_bound) & + lower_bound != rads::round2(lower_bound, 1)] + + integers_wrong_rounding <- CHIestimates[result_type == "integer" & + !is.na(lower_bound) & + lower_bound != round(lower_bound)] + + if(nrow(proportions_wrong_rounding) > 0 || + nrow(rates_wrong_rounding) > 0 || + nrow(integers_wrong_rounding) > 0) { + status <- 0 + if(verbose){ + warning("The 'lower_bound' column has incorrect rounding. Should match result rounding: proportions to 3 decimal places, + rates to 1 decimal place, and integers to 0 decimal places.") + } + } - if(nrow(cat1_invalid) > 0) { - status <- 0 - if(verbose) { - warning("\U00026A0 \U0001F4E3 WARNING: Found non-standard cat1 combinations:\n", - paste0(" - cat1='", cat1_invalid$cat, "', cat1_group='", - cat1_invalid$group, "'", collapse = "\n"), - "\nThese combinations are not found in rads.data::misc_chi_byvars reference table.") - } + # Check upper_bound rounding + report_message("Checking that upper_bound has same rounding as result") + proportions_wrong_rounding <- CHIestimates[result_type == "proportion" & + !is.na(upper_bound) & + upper_bound != rads::round2(upper_bound, 3)] + + rates_wrong_rounding <- CHIestimates[result_type == "rate" & + !is.na(upper_bound) & + upper_bound != rads::round2(upper_bound, 1)] + + integers_wrong_rounding <- CHIestimates[result_type == "integer" & + !is.na(upper_bound) & + upper_bound != round(upper_bound)] + + if(nrow(proportions_wrong_rounding) > 0 || + nrow(rates_wrong_rounding) > 0 || + nrow(integers_wrong_rounding) > 0) { + status <- 0 + if(verbose){ + warning("The 'upper_bound' column has incorrect rounding. Should match result rounding: proportions to 3 decimal places, + rates to 1 decimal place, and integers to 0 decimal places.") } + } - # Check cat2 combinations - chi_cat2_combos <- unique(CHIestimates[!is.na(cat2), list( - cat = cat2, - group = cat2_group - )]) + # Check se rounding (one more decimal than result) + report_message("Checking that se is rounded to one more decimal than result (4 for proportions, 2 for rates)") + proportions_wrong_rounding <- CHIestimates[result_type == "proportion" & + !is.na(se) & + se != rads::round2(se, 4)] + + rates_wrong_rounding <- CHIestimates[result_type == "rate" & + !is.na(se) & + se != rads::round2(se, 2)] + + integers_wrong_rounding <- CHIestimates[result_type == "integer" & + !is.na(se) & + se != rads::round2(se, 1)] + + if(nrow(proportions_wrong_rounding) > 0 || + nrow(rates_wrong_rounding) > 0 || + nrow(integers_wrong_rounding) > 0) { + status <- 0 + if(verbose){ + warning("The 'se' column has incorrect rounding. Should be one more decimal than result: + 4 decimal places for proportions, 2 for rates, and 1 for integers.") + } + } - cat2_invalid <- chi_cat2_combos[!unique(ref_combos[, list(cat, group)]), on = list(cat, group)] + # Check rse rounding (always 3 decimals) + report_message("Checking that rse is rounded to three digits for all result types") + wrong_rounding <- CHIestimates[!is.na(rse) & rse != rads::round2(rse, 3)] - if(nrow(cat2_invalid) > 0) { - status <- 0 - if(verbose) { - warning("\U00026A0 \U0001F4E3 WARNING: Found non-standard cat2 combinations:\n", - paste0(" - cat2='", cat2_invalid$cat, "', cat2_group='", - cat2_invalid$group, "'", collapse = "\n"), - "\nThese combinations are not found in rads.data::misc_chi_byvars reference table.") - } - } - } else { - # For non-ACS data, check complete combinations including varname - # Check cat1 combinations - chi_cat1_combos <- unique(CHIestimates[!is.na(cat1), list( - cat = cat1, - varname = cat1_varname, - group = cat1_group - )]) - - # wonky tweaks because of annoying structure for Birthing Person's race/ethnicity - chi_cat1_combos[cat %in% c("Birthing person's race/ethnicity", "Birthing person's race"), cat := "[Birthing person's] Race"] - chi_cat1_combos[cat == "Birthing person's ethnicity", cat := "[Birthing person's] Ethnicity"] - chi_cat1_combos[cat == "[Birthing person's] Race" & varname == 'race3' & group == 'Hispanic', cat := "[Birthing person's] Ethnicity"] - chi_cat1_combos[, cat := gsub("Birthing person's eth", "[Birthing person's] Eth", cat)] - - cat1_invalid <- chi_cat1_combos[!ref_combos, on = list(cat, varname, group)] - - if(nrow(cat1_invalid) > 0) { - status <- 0 - if(verbose) { - warning("\U00026A0 \U0001F4E3 WARNING: Found non-standard cat1 combinations:\n", - paste0(" - cat1='", cat1_invalid$cat, "', cat1_varname='", - cat1_invalid$varname, "', cat1_group='", cat1_invalid$group, "'", - collapse = "\n"), - "\nThese combinations are not found in rads.data::misc_chi_byvars reference table.") - } + if(nrow(wrong_rounding) > 0) { + status <- 0 + if(verbose){ + warning("The 'rse' column has incorrect rounding. Should be rounded to 3 decimal places for all result types.") } + } + + ## Ensure cat1/cat2 values meet CHI standards ---- + if(verbose) { + message("Checking that category combinations align with CHI standards") + } - # Check cat2 combinations - chi_cat2_combos <- unique(CHIestimates[!is.na(cat2), list( - cat = cat2, - varname = cat2_varname, - group = cat2_group - )]) + # Get reference data + ref_combos <- rads.data::misc_chi_byvars[, list(cat, varname, group)] - # wonky tweaks because of annoying structure for Birthing Person's race/ethnicity - chi_cat2_combos[cat %in% c("Birthing person's race/ethnicity", "Birthing person's race"), cat := "[Birthing person's] Race"] - chi_cat2_combos[cat == "Birthing person's ethnicity", cat := "[Birthing person's] Ethnicity"] - chi_cat2_combos[cat == "[Birthing person's] Race" & varname == 'race3' & group == 'Hispanic', cat := "[Birthing person's] Ethnicity"] - chi_cat2_combos[, cat := gsub("Birthing person's eth", "[Birthing person's] Eth", cat)] + # For ACS data, we only check cat and group combinations + if(acs) { + # Check cat1 combinations + chi_cat1_combos <- unique(CHIestimates[!is.na(cat1), list( + cat = cat1, + group = cat1_group + )]) - cat2_invalid <- chi_cat2_combos[!ref_combos, on = list(cat, varname, group)] + cat1_invalid <- chi_cat1_combos[!unique(ref_combos[, list(cat, group)]), on = list(cat, group)] - if(nrow(cat2_invalid) > 0) { + if(nrow(cat1_invalid) > 0) { + status <- 0 + if(verbose) { + warning("\U00026A0 \U0001F4E3 WARNING: Found non-standard cat1 combinations:\n", + paste0(" - cat1='", cat1_invalid$cat, "', cat1_group='", + cat1_invalid$group, "'", collapse = "\n"), + "\nThese combinations are not found in rads.data::misc_chi_byvars reference table.") + } + } + + # Check cat2 combinations + chi_cat2_combos <- unique(CHIestimates[!is.na(cat2), list( + cat = cat2, + group = cat2_group + )]) + + cat2_invalid <- chi_cat2_combos[!unique(ref_combos[, list(cat, group)]), on = list(cat, group)] + + if(nrow(cat2_invalid) > 0) { + status <- 0 + if(verbose) { + warning("\U00026A0 \U0001F4E3 WARNING: Found non-standard cat2 combinations:\n", + paste0(" - cat2='", cat2_invalid$cat, "', cat2_group='", + cat2_invalid$group, "'", collapse = "\n"), + "\nThese combinations are not found in rads.data::misc_chi_byvars reference table.") + } + } + } else { + # For non-ACS data, check complete combinations including varname + # Check cat1 combinations + chi_cat1_combos <- unique(CHIestimates[!is.na(cat1), list( + cat = cat1, + varname = cat1_varname, + group = cat1_group + )]) + + # wonky tweaks because of annoying structure for Birthing Person's race/ethnicity + chi_cat1_combos[cat %in% c("Birthing person's race/ethnicity", "Birthing person's race"), cat := "[Birthing person's] Race"] + chi_cat1_combos[cat == "Birthing person's ethnicity", cat := "[Birthing person's] Ethnicity"] + chi_cat1_combos[cat == "[Birthing person's] Race" & varname == 'race3' & group == 'Hispanic', cat := "[Birthing person's] Ethnicity"] + chi_cat1_combos[, cat := gsub("Birthing person's eth", "[Birthing person's] Eth", cat)] + + cat1_invalid <- chi_cat1_combos[!ref_combos, on = list(cat, varname, group)] + + if(nrow(cat1_invalid) > 0) { + status <- 0 + if(verbose) { + warning("\U00026A0 \U0001F4E3 WARNING: Found non-standard cat1 combinations:\n", + paste0(" - cat1='", cat1_invalid$cat, "', cat1_varname='", + cat1_invalid$varname, "', cat1_group='", cat1_invalid$group, "'", + collapse = "\n"), + "\nThese combinations are not found in rads.data::misc_chi_byvars reference table.") + } + } + + # Check cat2 combinations + chi_cat2_combos <- unique(CHIestimates[!is.na(cat2), list( + cat = cat2, + varname = cat2_varname, + group = cat2_group + )]) + + # wonky tweaks because of annoying structure for Birthing Person's race/ethnicity + chi_cat2_combos[cat %in% c("Birthing person's race/ethnicity", "Birthing person's race"), cat := "[Birthing person's] Race"] + chi_cat2_combos[cat == "Birthing person's ethnicity", cat := "[Birthing person's] Ethnicity"] + chi_cat2_combos[cat == "[Birthing person's] Race" & varname == 'race3' & group == 'Hispanic', cat := "[Birthing person's] Ethnicity"] + chi_cat2_combos[, cat := gsub("Birthing person's eth", "[Birthing person's] Eth", cat)] + + cat2_invalid <- chi_cat2_combos[!ref_combos, on = list(cat, varname, group)] + + if(nrow(cat2_invalid) > 0) { + status <- 0 + if(verbose) { + warning("\U00026A0 \U0001F4E3 WARNING: Found non-standard cat2 combinations:\n", + paste0(" - cat2='", cat2_invalid$cat, "', cat2_varname='", + cat2_invalid$varname, "', cat2_group='", cat2_invalid$group, "'", + collapse = "\n"), + "\nThese combinations are not found in rads.data::misc_chi_byvars reference table.") + } + } + } + + ## Ensure there are no more than 10 years of trend data ---- + trend.years <- sort(unique(CHIestimates[tab == 'trends']$year)) + if( length(trend.years) > 10){ + warning('\U00026A0 There are more than 10 unique years of trend data:\n', + paste(trend.years, collapse = ', ')) + } + + ## Ensure there are no gaps in year data for any indicator and proper multi-year coverage ---- + report_message("Checking for gaps in year data and proper multi-year coverage") + + # Get all unique indicator keys + unique_keys <- unique(CHIestimates$indicator_key) + + # Get all years and split them into single years and multi-year periods + all_years <- unique(CHIestimates$year) + multi_years <- all_years[grepl("-", all_years)] + single_years <- all_years[!grepl("-", all_years)] + + # Check if there are multiple multi-year periods + if(length(multi_years) > 1) { + report_message(paste("\U00002139 Note: Multiple multi-year periods detected:", paste(multi_years, collapse=", "))) + } + + # Define excluded indicators (those that don't need to be in multi-year periods) + excluded_5y_indicators <- c("uninsured_all", "uninsured_adult", "uninsured_child", "uninsured_senior", "uninsured") + + # For multi-year periods, check that each indicator exists + if(length(multi_years) > 0) { + multi_year_data <- CHIestimates[year %in% multi_years] + indicators_in_multi <- unique(multi_year_data$indicator_key) + + # Check which indicators are missing from multi-year periods + indicators_to_check <- setdiff(unique_keys, excluded_5y_indicators) + missing_indicators <- setdiff(indicators_to_check, indicators_in_multi) + + if(length(missing_indicators) > 0) { status <- 0 if(verbose) { - warning("\U00026A0 \U0001F4E3 WARNING: Found non-standard cat2 combinations:\n", - paste0(" - cat2='", cat2_invalid$cat, "', cat2_varname='", - cat2_invalid$varname, "', cat2_group='", cat2_invalid$group, "'", - collapse = "\n"), - "\nThese combinations are not found in rads.data::misc_chi_byvars reference table.") + warning(paste("The following indicators are missing from multi-year periods:", + paste0(missing_indicators, collapse=", "))) } } } - ## Ensure there are no more than 10 years of trend data ---- - trend.years <- sort(unique(CHIestimates[tab == 'trends']$year)) - if( length(trend.years) > 10){ - warning('\U00026A0 There are more than 10 unique years of trend data:\n', - paste(trend.years, collapse = ', ')) - } + # Check for continuity in single-year data + if(length(single_years) > 1) { + # Count observations by year for each indicator key (only for single years) + year_count <- unique(CHIestimates[year %in% single_years, list(year, indicator_key)]) + setorder(year_count, indicator_key, year) + + # For each indicator, check if there are breaks in the year sequence + year_count[, year := as.integer(year)] + + # Process each indicator separately + unique_indicators <- unique(year_count$indicator_key) + + for(ind_key in unique_indicators) { + ind_data <- year_count[indicator_key == ind_key] + if(nrow(ind_data) > 1) { # Only check if there are at least 2 years + ind_data[, diff := c(NA, diff(year))] # get difference between subsequent years in the data. + + # Check for gaps, with special handling for 2020 when acs=TRUE + if(acs) { + # For ACS data, allow gap if it's 2020 + gaps <- ind_data[diff > 1 & (year != 2020 & year-1 != 2020), ] + } else { + # For non-ACS data, any gap is an issue + gaps <- ind_data[diff > 1, ] + } + + if(nrow(gaps) > 0) { + status <- 0 + if(verbose) { + gap_years <- paste(gaps$year - 1, gaps$year, sep="-") + warning(paste("Indicator", ind_key, "has gaps in years:", paste(gap_years, collapse=", "))) + } + } + } + } + } ## Print success statement!!!!!!!! #### if(verbose) { @@ -567,6 +684,5 @@ chi_qa_tro <- function(CHIestimates, ## Return ---- return(status) - }