diff --git a/NAMESPACE b/NAMESPACE index 862c77c..e0ff381 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -11,7 +11,7 @@ export(chi_generate_tro_shell) export(chi_get_cols) export(chi_get_yaml) export(chi_qa_tro) -export(chi_sql_update) +export(chi_update_sql) import(dtsurvey) import(future) import(future.apply) @@ -48,14 +48,13 @@ importFrom(glue,glue) importFrom(glue,glue_sql) importFrom(odbc,odbc) importFrom(rads,calc) -importFrom(rads,chi_cols) -importFrom(rads,compare_estimate) importFrom(rads,round2) importFrom(rads,string_clean) importFrom(rads,substrRight) importFrom(rads,suppress) importFrom(rads,tsql_validate_field_types) importFrom(stats,na.omit) +importFrom(stats,qnorm) importFrom(tidyr,crossing) importFrom(tools,toTitleCase) importFrom(utils,tail) diff --git a/R/chi_calc.R b/R/chi_calc.R index 338d343..052ad13 100644 --- a/R/chi_calc.R +++ b/R/chi_calc.R @@ -1,7 +1,7 @@ #' Calculate CHI Estimates #' #' @description -#' Generates CHI estimates from input data according to provided instructions. +#' Generates CHI estimates from input data according to provided instructions. #' Handles both proportions and rates, with options for suppression of small numbers. #' #' @param ph.data data.frame or data.table. Input data containing analytic read data. @@ -14,11 +14,13 @@ #' @param suppress_high numeric. Upper bound for suppression. Default: \code{9}. #' @param source_name character. Name of data source. Default: \code{NULL}. #' @param source_date Date. Date ph.data was created. Default: \code{NULL}. +#' @param non_chi_byvars character vector. Variable names to exclude from CHI byvar encoding validation. Default: \code{NULL}. #' #' @return A data.table containing CHI estimates with the following columns: #' \itemize{ #' \item{\code{data_source}} Data source (e.g., acs, brfss, etc.) #' \item{\code{indicator_key}} Unique indicator key +#' \item{\code{level}} factor level of the indicator_key (e.g., Breech vs Cephalic vs other for fetal_pres in birth data) #' \item{\code{tab}} Type of analysis (e.g., demgroups, _kingcounty, etc.) #' \item{\code{year}} Year(s) of data #' \item{\code{cat1}} Describes data field (e.g., Gender, Ethnicity, etc.) @@ -48,10 +50,10 @@ #' #' \code{\link{chi_generate_metadata}} for creating metadata from results #' -#' @importFrom data.table setDT copy setnames := setorder set .SD data.table -#' @importFrom rads calc compare_estimate suppress chi_cols round2 +#' @importFrom data.table setDT copy setnames := setorder set .SD data.table setcolorder +#' @importFrom rads calc suppress round2 #' @importFrom future.apply future_lapply -#' @importFrom stats na.omit +#' @importFrom stats na.omit qnorm #' @import progressr #' @export #' @@ -65,7 +67,8 @@ chi_calc <- function(ph.data = NULL, suppress_low = 0, suppress_high = 9, source_name = NULL, - source_date = NULL){ + source_date = NULL, + non_chi_byvars = NULL){ # Input validation ---- if (is.null(ph.data)) stop("\n\U1F6D1 ph.data must be provided") if (!is.data.frame(ph.data)) stop("\n\U1F6D1 ph.data must be a data.frame or data.table") @@ -102,6 +105,10 @@ chi_calc <- function(ph.data = NULL, if (is.null(source_date)) stop("\n\U1F6D1 source_date must be provided") if (!inherits(source_date, "Date")) stop("\n\U1F6D1 source_date must be a be a Date object") + # Validate non_chi_byvars + if(!is.null(non_chi_byvars)) { + if(!is.character(non_chi_byvars)) stop("\n\U1F6D1 non_chi_byvars must be a character vector") + } # Create 'Overall' if needed for crosstabs ---- if(!'overall' %in% names(ph.data)){ @@ -110,28 +117,61 @@ chi_calc <- function(ph.data = NULL, # Check to make sure all variables needed exist in the data ---- neededbyvars <- unique(na.omit(c(ph.instructions$cat1_varname, ph.instructions$cat2_varname))) - if("race3" %in% neededbyvars & !"race3_hispanic" %in% neededbyvars){neededbyvars <- c(neededbyvars, 'race3_hispanic')} # By definition, Hispanic cannot be contained within race3 + + # Handle the race3/race3_hispanic relationship + if("race3" %in% neededbyvars & !"race3_hispanic" %in% neededbyvars) { + neededbyvars <- c(neededbyvars, 'race3_hispanic') + message("\U00002139 Note: Adding 'race3_hispanic' as a required variable because 'race3' is present. By definition, race3 requires separate Hispanic ethnicity information.") + } + + if(!"race3" %in% neededbyvars & "race3_hispanic" %in% neededbyvars) { + neededbyvars <- c(neededbyvars, 'race3') + message("\U00002139 Note: Adding 'race3' as a required variable because 'race3_hispanic' is present. These two variables work together to represent race/ethnicity.") + } neededvars <- unique(na.omit(c(ph.instructions$indicator_key, neededbyvars))) missingvars <- setdiff(neededvars, names(ph.data)) if(length(missingvars) > 0 ){ - stop(paste0("\n\U2620 ph.data is missing the following columns that are specified in ph.instructions: ", paste0(missingvars, collapse = ', '), ". ", - "\nIf `race3_hispanic` is listed, that is because, by definition, `race3` cannot have a Hispanic ethnicity in the same variable. So, two ", - "\nvariables (`race3` & `race3_hispanic`) will be processed and in the output, it will be called `race3`")) + stop(paste0("\n\U2620 ph.data is missing the following columns that are required: ", paste0(missingvars, collapse = ', '))) } else{message("\U0001f642 All specified variables exist in ph.data")} # Check to make sure all byvariables have the CHI specified encoding ---- - stdbyvars <- rads.data::misc_chi_byvars[varname %in% unique(na.omit(c(ph.instructions$cat1_varname, ph.instructions$cat2_varname)))][, list(varname, group, keepme, reference = 1)] - stdbyvars[group %in% c("Hispanic", 'Non-Hispanic') & varname == 'race3', varname := 'race3_hispanic'] # necessary because race3 & Hispanic must be two distinct variables in raw data - phbyvars <- rbindlist(lapply( - X=as.list(neededbyvars), - FUN = function(X){data.table::data.table(varname = X, group = unique(na.omit(ph.data[[X]])), ph.data = 1)})) - compbyvars <- merge(stdbyvars, phbyvars, by = c('varname', 'group'), all = T) - if(nrow(compbyvars[is.na(reference)| is.na(ph.data)]) > 0){ - print(compbyvars[is.na(reference)| is.na(ph.data)]) - stop("\n\U2620 the table above shows the varname/group combinations that do not align between the reference table and your ph.data.") - } else {message("\U0001f642 All specified cat1_group and cat2_group values align with the reference standard.")} + # Filter out non-CHI variables if specified + chi_byvars <- neededbyvars + if(!is.null(non_chi_byvars)) { + chi_byvars <- setdiff(neededbyvars, non_chi_byvars) + if(length(setdiff(non_chi_byvars, neededbyvars)) > 0) { + message("\U00002139 Note: Some specified non_chi_byvars are not used in the analysis: ", + paste0(setdiff(non_chi_byvars, neededbyvars), collapse = ", ")) + } + if(length(setdiff(neededbyvars, chi_byvars)) > 0) { + message("\U00002139 Note: The following variables will be excluded from CHI encoding validation: ", + paste0(setdiff(neededbyvars, chi_byvars), collapse = ", ")) + } + } + + # Only validate CHI variables + stdbyvars <- rads.data::misc_chi_byvars[varname %in% unique(na.omit(c(ph.instructions$cat1_varname, ph.instructions$cat2_varname)))] + stdbyvars <- stdbyvars[!varname %in% non_chi_byvars][, list(varname, group, keepme, reference = 1)] + stdbyvars[group %in% c("Hispanic", 'Non-Hispanic') & varname == 'race3', varname := 'race3_hispanic'] # necessary because race3 & Hispanic must be two distinct variables in raw data + + phbyvars <- rbindlist(lapply( + X=as.list(chi_byvars), + FUN = function(X){data.table::data.table(varname = X, group = unique(na.omit(ph.data[[X]])), ph.data = 1)})) + + # Skip validation if there are no CHI variables to validate after excluding non_chi_byvars + if(nrow(phbyvars) > 0 && nrow(stdbyvars) > 0) { + compbyvars <- merge(stdbyvars, phbyvars, by = c('varname', 'group'), all = T) + if(nrow(compbyvars[is.na(reference) | is.na(ph.data)]) > 0){ + print(compbyvars[is.na(reference) | is.na(ph.data)]) + stop("\n\U2620 the table above shows the varname/group combinations that do not align between the reference table and your ph.data.") + } else {message("\U0001f642 All specified cat1_group and cat2_group values align with the reference standard.")} + } else if(length(chi_byvars) > 0) { + message("\U00002139 Note: No CHI variables to validate after excluding non_chi_byvars.") + } else { + message("\U00002139 Note: No variables to validate for CHI encoding.") + } # Use rads::calc to generate estimates for each row of ph.instructions ---- message("\U023F3 Be patient! The function is generating estimates for each row of ph.instructions.") @@ -211,6 +251,10 @@ chi_calc <- function(ph.data = NULL, c("mean", "mean_lower", "mean_upper", "mean_se"), c("result", "lower_bound", "upper_bound", "se")) + if(!"level" %in% names(tempest)) { + tempest[, level := NA_character_] + } + # set correct data types for TSQL database tempest[, denominator := as.numeric(denominator)] tempest[, numerator := as.numeric(numerator)] @@ -222,12 +266,39 @@ chi_calc <- function(ph.data = NULL, # Tidy results ---- + # When there are no observation in a row, rads::calc() gives NA for numerator, but want zero ---- + tempCHIest[is.na(numerator), `:=` ( + result = 0, + se = 0, + lower_bound = 0, + upper_bound = 0, + numerator = 0, + rse = NA # undefined because mean will be zero + )] + # drop when cat1_group is missing (e.g., cat1 == 'Regions' and region is NA) ---- tempCHIest <- tempCHIest[!is.na(cat1_group)] # drop when cat2_group is missing but cat2 is not missing ---- tempCHIest <- tempCHIest[!(is.na(cat2_group) & !is.na(cat2))] + # Apply Wilson Score method for confidence intervals when result is 0% or 100% ---- + # This handles cases where standard methods fail at the extremes by providing more appropriate + # bounds that don't exceed the logical limits while maintaining the specified confidence level + + # Calculate z-value based on the provided confidence interval + z_value <- qnorm(1-0.5*(1-ci)) + + # Lower bound using Wilson Score method + tempCHIest[result %in% c(0, 1) & denominator > 10, + lower_bound := (2 * numerator + z_value^2 - z_value * sqrt(z_value^2 + 4 * numerator * (1 - numerator/denominator))) / + (2 * (denominator + z_value^2))] + + # Upper bound using Wilson Score method + tempCHIest[result %in% c(0, 1) & denominator > 10, + upper_bound := (2 * numerator + z_value^2 + z_value * sqrt(z_value^2 + 4 * numerator * (1 - numerator/denominator))) / + (2 * (denominator + z_value^2))] + # drop if cat1_group | cat2_group had `keepme == "No"` in the reference table ---- dropme <- unique(stdbyvars[keepme == 'No'][, reference := NULL]) tempCHIest <- merge(tempCHIest, @@ -301,17 +372,20 @@ chi_calc <- function(ph.data = NULL, secondary_exclude = cat1_varname != 'race3') } else {tempCHIest[, suppression := NA_character_]} - tempCHIest[rse>=30, caution := "!"] + tempCHIest[rse>=30 | numerator == 0, caution := "!"] tempCHIest[, c('cat2', 'cat2_group', 'cat2_varname') := lapply(.SD, as.character), .SDcols = c('cat2', 'cat2_group', 'cat2_varname')] # Keep and order standard CHI columns ---- - tempCHIest <- tempCHIest[, chi_get_cols(), with = F] + all_cols <- c(chi_get_cols(), "level") + all_cols <- unique(all_cols) # In case level is already included + tempCHIest <- tempCHIest[, all_cols, with = F] tempCHIest <- tempCHIest[, cat1 := factor(cat1, levels = c("King County", sort(setdiff(unique(tempCHIest$cat1), "King County"))) )] tempCHIest <- tempCHIest[, tab := factor(tab, levels = c(c("_kingcounty","demgroups", "trends"), sort(setdiff(unique(tempCHIest$tab), c("_kingcounty","demgroups", "trends")))) )] setorder(tempCHIest, indicator_key, tab, -year, cat1, cat1_group, cat2, cat2_group) + setcolorder(tempCHIest, c('data_source', 'indicator_key', 'level')) # return the CHI table ---- return(tempCHIest) diff --git a/R/chi_compare_estimates.R b/R/chi_compare_estimates.R index 13cc186..d784505 100644 --- a/R/chi_compare_estimates.R +++ b/R/chi_compare_estimates.R @@ -35,7 +35,7 @@ #' @seealso #' \code{\link{chi_qa_tro}} for validating individual datasets #' -#' \code{\link{chi_sql_update}} for uploading validated results +#' \code{\link{chi_update_sql}} for uploading validated results #' #' @return data.table ordered by absolute difference, containing: #' - Difference metrics (absolute_diff, relative_diff) diff --git a/R/chi_generate_analysis_set.R b/R/chi_generate_analysis_set.R index e92825b..42a2be1 100644 --- a/R/chi_generate_analysis_set.R +++ b/R/chi_generate_analysis_set.R @@ -2,40 +2,45 @@ #' #' @description #' Creates sets of indicator keys that share common analysis patterns within CHI -#' (Community Health Indicators) data. This function reads the most recent production -#' version of CHI estimates for a specified data source and groups indicators that -#' use the same combinations of \code{cat1}, \code{cat1_varname}, and \code{trends} -#' columns. +#' (Community Health Indicators) data. This function can either: +#' +#' 1. Read the most recent production version of CHI estimates from the TSQL Production Server +#' for a specified data source, or +#' +#' 2. Process a provided data.table/data.frame containing CHI data +#' +#' It groups indicators that use the same combinations of \code{cat1}, \code{cat1_varname}, and +#' tab columns (\code{_kingcounty}, \code{_wastate}, \code{demgroups}, \code{crosstabs}, \code{trends}). #' #' @param data_source Character string (length 1) specifying the data source to analyze #' (e.g., \code{'birth'}, \code{'brfss'}, \code{'chars'}, \code{'death'}, \code{'hys'}). This corresponds to a table -#' name in the PHExtractStore database. +#' name in the PHExtractStore database on the TSQL Production Server. Required if \code{CHIestimates} is not provided. +#' @param CHIestimates A data.table or data.frame containing CHI data to analyze. If provided, \code{data_source} +#' is ignored. Must comply with the latest CHI standard, which can be assessed/tested with the \code{chi_qa_tro} function in this package. #' #' @return A data.table containing analysis sets with the following columns: #' \itemize{ #' \item \code{set}: Integer identifying groups of indicators with identical analysis patterns #' \item \code{cat1}: Category name as expected in CHI TRO #' \item \code{cat1_varname}: Category variable name as expected in CHI TRO -#' \item \code{_kingcounty}: \code{'x'} if analysis includes King County tab -#' \item \code{_wastate}: \code{'x'} if analysis includes Washington State tab -#' \item \code{demgroups}: \code{'x'} if analysis includes demographic groups -#' \item \code{crosstabs}: \code{'x'} if analysis includes crosstabulations -#' \item \code{trends}: \code{'x'} if analysis includes trends analysis +#' \item \code{_kingcounty}: \code{'x'} if analysis includes '_kingcounty' in the tab column +#' \item \code{_wastate}: \code{'x'} if analysis includes '_wastate' in the tab column +#' \item \code{demgroups}: \code{'x'} if analysis includes 'demgroups' in the tab column +#' \item \code{crosstabs}: \code{'x'} if analysis includes 'crosstabs' in the tab column +#' \item \code{trends}: \code{'x'} if analysis includes 'trends' in the tab column #' \item \code{set_indicator_keys}: Comma-separated list of indicator keys sharing the pattern #' } #' #' @details #' This function generates a table for \code{\link{chi_generate_tro_shell}}, providing the -#' structure required for generating analysis instructions. It connects to the -#' \code{[PHExtractStore]} database on \code{KCITSQLPRPHIP40} to retrieve the latest production data. +#' structure required for generating analysis instructions. +#' +#' When using \code{data_source}, the function connects to the +#' \code{[PHExtractStore]} database on the TSQL Production Server \code{KCITSQLPRPHIP40} to retrieve the latest production estimates. #' Users need appropriate database credentials - contact your manager if you need access. #' -#' Typically, the analysis will not change from year to year. However, you should -#' compare the output of this function with the 'CHI-Variables_ByDataSource' worksheet -#' in -#' \href{https://kc1.sharepoint.com/teams/DPH-CommunityHealthIndicators/CHIVizes/CHI-Standards-TableauReady\%20Output.xlsx}{ -#' SharePoint > Community Health Indicators > CHI_vizes > CHI-Standards-TableauReady Output.xlsx} -#' and update it as necessary. +#' When using \code{CHIestimates}, the provided estimates are processed directly, bypassing the database connection. +#' The data must comply with the latest CHI standard, which can be assessed/tested with the \code{chi_qa_tro} function. #' #' The output structure directly informs \code{\link{chi_generate_tro_shell}} about which #' indicators should be analyzed together based on their shared patterns of @@ -52,28 +57,46 @@ #' #' @examples #' \dontrun{ -#' # Generate analysis sets for birth data +#' # Generate analysis sets for birth data from the database #' birth_sets <- chi_generate_analysis_set("birth") #' -#' # Generate analysis sets for BRFSS data +#' # Generate analysis sets for BRFSS data from the database #' brfss_sets <- chi_generate_analysis_set("brfss") +#' +#' # Generate analysis sets from an existing data.table +#' my_data <- data.table::fread("my_chi_data.csv") +#' custom_sets <- chi_generate_analysis_set(CHIestimates = my_data) #' } #' -chi_generate_analysis_set <- function(data_source = NULL) { - # Input validation - if (is.null(data_source)) { - stop("\n\U1F6D1 data_source parameter must be provided") +chi_generate_analysis_set <- function(data_source = NULL, + CHIestimates = NULL) { + + # Input validation ---- + if (is.null(data_source) && is.null(CHIestimates)) { + stop("\n\U1F6D1 Either data_source parameter or CHIestimates parameter must be provided") + } + + if (!is.null(data_source) && !is.null(CHIestimates)) { + warning("\n\U26A0 Both data_source and CHIestimates provided. Using CHIestimates and ignoring data_source.") } - if (!is.character(data_source) || length(data_source) != 1) { + + if (!is.null(data_source) && !(is.character(data_source) && length(data_source) == 1)) { stop("\n\U1F6D1 data_source must be a single character string, e.g., 'birth'") } + if (!is.null(CHIestimates) && !inherits(CHIestimates, c("data.table", "data.frame"))) { + stop("\n\U1F6D1 CHIestimates must be a data.table or data.frame") + } + # Get data ---- - # Construct the full table name for error messages + if (!is.null(CHIestimates)) { + CHIestimates <- data.table::setDT(data.table::copy(CHIestimates)) + } else { + # Construct the full table name for error messages full_table_name <- paste0("[APDE].[", data_source, "_results]") server_info <- "KCITSQLPRPHIP40 > PHExtractStore" - # try to make a database connection + # try to make a database connection tryCatch({ # Establish connection cnxn <- odbc::dbConnect( @@ -92,55 +115,56 @@ chi_generate_analysis_set <- function(data_source = NULL) { } }) - - # Check if table exists before attempting query + # Check if table exists before attempting query table_exists <- DBI::dbExistsTable( conn = cnxn, name = paste0(data_source, "_results"), schema = "APDE" ) - # If table exists, load into memory + # If table exists, load into memory if (!table_exists) { stop(paste0("\U1F6D1 You specified data_source = '", data_source, "', which attempted to download ", full_table_name, " from ", server_info, ". This table does not exist.")) } else { - tempdt <- data.table::setDT(DBI::dbGetQuery( - conn = cnxn, - statement = paste0("SELECT * FROM ", full_table_name) - )) } + CHIestimates <- data.table::setDT(DBI::dbGetQuery( + conn = cnxn, + statement = paste0("SELECT * FROM ", full_table_name) + )) + } - # Close database connection + # Close database connection if (exists("cnxn") && !is.null(cnxn)) { DBI::dbDisconnect(cnxn) } + } # Recodes for race3 & race4 ---- # Necessary because they are wonky as heck due to how APDE decided to code/display them - race3_remix1 <- tempdt[(grepl('race/ethnicity$', cat1, ignore.case = T) & cat1_varname == 'race3')] - tempdt <- rbind( - fsetdiff(tempdt, race3_remix1), + race3_remix1 <- CHIestimates[(grepl('race/ethnicity$', cat1, ignore.case = T) & cat1_varname == 'race3')] + CHIestimates <- rbind( + fsetdiff(CHIestimates, race3_remix1), copy(race3_remix1)[, cat1 := gsub('race/ethnicity$', 'race', cat1)][, cat1 := gsub('Race/ethnicity$', 'Race', cat1)], copy(race3_remix1)[, cat1 := gsub('race/ethnicity$', 'ethnicity', cat1)][, cat1 := gsub('Race/ethnicity$', 'Ethnicity', cat1)] ) - race3_remix2 <- tempdt[(grepl('race/ethnicity$', cat2, ignore.case = T) & cat2_varname == 'race3')] - tempdt <- rbind( - fsetdiff(tempdt, race3_remix2), + race3_remix2 <- CHIestimates[(grepl('race/ethnicity$', cat2, ignore.case = T) & cat2_varname == 'race3')] + CHIestimates <- rbind( + fsetdiff(CHIestimates, race3_remix2), copy(race3_remix2)[, cat2 := gsub('race/ethnicity$', 'race', cat2)][, cat2 := gsub('Race/ethnicity$', 'Race', cat2)], copy(race3_remix2)[, cat2 := gsub('race/ethnicity$', 'ethnicity', cat2)][, cat2 := gsub('Race/ethnicity$', 'Ethnicity', cat2)] ) - tempdt[cat1_varname == 'race4', cat1 := gsub('race$', 'race/ethnicity', cat1)] - tempdt[cat1_varname == 'race4', cat1 := gsub('Race$', 'Race/ethnicity', cat1)] + CHIestimates[cat1_varname == 'race4', cat1 := gsub('race$', 'race/ethnicity', cat1)] + CHIestimates[cat1_varname == 'race4', cat1 := gsub('Race$', 'Race/ethnicity', cat1)] - tempdt[cat2_varname == 'race4', cat2 := gsub('race$', 'race/ethnicity', cat2)] - tempdt[cat2_varname == 'race4', cat2 := gsub('Race$', 'Race/ethnicity', cat2)] + CHIestimates[cat2_varname == 'race4', cat2 := gsub('race$', 'race/ethnicity', cat2)] + CHIestimates[cat2_varname == 'race4', cat2 := gsub('Race$', 'Race/ethnicity', cat2)] # Table of categories and tabs per indicator ---- # For cat1 combinations - tab_patterns <- tempdt[, list( + tab_patterns <- CHIestimates[, list( `_kingcounty` = fifelse(any(tab == "_kingcounty"), "x", ""), `_wastate` = fifelse(any(tab == "_wastate"), "x", ""), demgroups = fifelse(any(tab == "demgroups"), "x", ""), @@ -149,7 +173,7 @@ chi_generate_analysis_set <- function(data_source = NULL) { ), by = list(indicator_key, cat1, cat1_varname)] # For cat2 combinations - tab_patterns2 <- tempdt[!is.na(cat2), list( + tab_patterns2 <- CHIestimates[!is.na(cat2), list( `_kingcounty` = fifelse(any(tab == "_kingcounty"), "x", ""), `_wastate` = fifelse(any(tab == "_wastate"), "x", ""), demgroups = fifelse(any(tab == "demgroups"), "x", ""), diff --git a/R/chi_generate_metadata.R b/R/chi_generate_metadata.R index a5e6801..0d0aca6 100644 --- a/R/chi_generate_metadata.R +++ b/R/chi_generate_metadata.R @@ -1,10 +1,13 @@ #' CHI Generate Metadata #' #' @description -#' function to generate metadata table combining existing metadata and latest estimates. +#' Function to generate metadata table combining existing metadata and latest estimates. #' #' @param meta.old Name of a data.table with the prior year's metadata #' @param est.current current year's tableau ready output with completed estimates +#' @param allowed_missing A list of lists specifying allowed missing metadata values. Each inner list +#' should have \code{indicator_key} and \code{column} elements. +#' @param warn_only Logical, if TRUE, only warns about unexpected missing values instead of stopping #' #' @return table of metadata #' @@ -19,30 +22,30 @@ #' @export #' chi_generate_metadata <- function(meta.old = NULL, - est.current = NULL){ + est.current = NULL, + allowed_missing = list( + list(indicator_key = "medinc", column = "latest_year_count") + # Add more exceptions here as needed + ), + warn_only = FALSE){ # Input validation ---- if (is.null(meta.old)) stop("\n\U1F6D1 meta.old must be provided") if (!is.data.frame(meta.old)) stop("\n\U1F6D1 meta.old must be a data.frame or data.table") - if (is.null(est.current)) stop("\n\U1F6D1 est.current must be provided") if (!is.data.frame(est.current)) stop("\n\U1F6D1 est.current must be a data.frame or data.table") - # Convert to data.table if needed ---- if (!is.data.table(meta.old)) setDT(meta.old) if (!is.data.table(est.current)) setDT(est.current) - # get new metadata ---- meta.new <- unique(est.current[tab == "metadata", list(indicator_key, - latest_yearx = as.integer(year), - latest_year_resultx = result, - run_datex = run_date, - latest_year_countx = as.integer(numerator), - latest_year_kc_popx = as.integer(denominator))]) - + latest_yearx = as.integer(year), + latest_year_resultx = result, + run_datex = run_date, + latest_year_countx = as.integer(numerator), + latest_year_kc_popx = as.integer(denominator))]) # merge new metadata onto old metadata ---- meta.new <- merge(meta.old, meta.new, by = c("indicator_key"), all = T) - # update with newest data ---- # only replace old data when there is new data because may stop calculating indicators, in which case, would want to keep old data meta.new[!is.na(latest_yearx), latest_year := as.numeric(latest_yearx)] @@ -51,11 +54,9 @@ chi_generate_metadata <- function(meta.old = NULL, meta.new[!is.na(latest_year_countx), latest_year_count := latest_year_countx] meta.new[!is.na(latest_year_kc_popx), latest_year_kc_pop := latest_year_kc_popx] meta.new[, c("latest_yearx", "latest_year_resultx", "run_datex", "latest_year_countx", "latest_year_kc_popx") := NULL] - # update valid_years ---- meta.new[as.integer(latest_year) > suppressWarnings(as.integer(rads::substrRight(valid_years, 1, 4))), valid_years := suppressWarnings(paste(as.integer(substr(valid_years, 1, 4)):as.integer(latest_year), collapse = " "))] - # Since trends only have 10 years of data, valid_years should be limited to 10 years max ---- meta.new[, valid_years := { allyears <- sort(as.integer(strsplit(valid_years, " ")[[1]])) # convert valid_years to a vector of numbers @@ -66,15 +67,47 @@ chi_generate_metadata <- function(meta.old = NULL, } }, by = indicator_key] - # Ensure there are no missing important metadata cells ---- - missing.per.col <- sapply(meta.new, function(x) sum(is.na(x))) - if(sum(missing.per.col) > 0){ - stop("You are missing at least one critical meta.new[] value.") + # Ensure there are no missing important metadata cells (with exceptions) ---- + unexpected_missing <- data.frame() + + for(col in names(meta.new)) { + na_rows <- which(is.na(meta.new[[col]])) + + if(length(na_rows) > 0) { + for(row in na_rows) { + ind_key <- meta.new$indicator_key[row] + + # Check if this is an allowed exception + is_allowed <- any(sapply(allowed_missing, function(exc) { + exc$indicator_key == ind_key && exc$column == col + })) + + if(!is_allowed) { + unexpected_missing <- rbind(unexpected_missing, data.frame( + indicator_key = ind_key, + column = col, + stringsAsFactors = FALSE + )) + } + } + } + } + + if(nrow(unexpected_missing) > 0) { + msg <- paste0("Unexpected missing metadata values:\n", + paste(apply(unexpected_missing, 1, function(row) { + paste0("Indicator '", row[1], "' is missing value for '", row[2], "'") + }), collapse = "\n")) + + if(warn_only) { + warning(paste0('\U00026A0 ', msg)) + } else { + stop(paste0('\U1F6D1 ', msg)) + } } # order metadata table ---- setorder(meta.new, indicator_key) - # return table ---- return(meta.new) } diff --git a/R/chi_generate_tro_shell.R b/R/chi_generate_tro_shell.R index 6c55703..05bc574 100644 --- a/R/chi_generate_tro_shell.R +++ b/R/chi_generate_tro_shell.R @@ -94,6 +94,8 @@ chi_generate_tro_shell <- function(ph.analysis_set, #advisory messages if("x" %in% ph.analysis_set$trends) {message("Note: trends are applied backwards from end.year")} + # Race / ethnicity is a chronic headache with CHI. Need to remove rows for race4 & Ethnicity because should be Race/ethnicity + ph.analysis_set <- ph.analysis_set[!(cat1_varname == 'race4' & cat1 == 'Ethnicity')] # apply the template generating function template <- rbindlist( @@ -102,7 +104,9 @@ chi_generate_tro_shell <- function(ph.analysis_set, # split trends from other tabs because processed for multiple years template.trends <- template[tab=='trends'] - template <- template[tab != 'trends'] + if(nrow(template.trends) > 0){ + template <- template[tab != 'trends'] + } # add years to template (non-trend) template[, end := end.year] @@ -111,14 +115,16 @@ chi_generate_tro_shell <- function(ph.analysis_set, template[tab == '_kingcounty'][, tab := 'metadata'][, start := end.year]) # add years to template (trends) - trend.years <- chi_process_trends(indicator_key = unique(template$indicator_key), - trend.span = trend.span, - end.year = end.year, - trend.periods = trend.periods) - template.trends <- merge(template.trends, trend.years, by = 'indicator_key', all = T, allow.cartesian = T) - - # append trends template to main template - template <- rbind(template, template.trends) + if(nrow(template.trends) > 0){ + trend.years <- chi_process_trends(indicator_key = intersect(unique(template$indicator_key), unique(template.trends$indicator_key)), + trend.span = trend.span, + end.year = end.year, + trend.periods = trend.periods) + template.trends <- merge(template.trends, trend.years, by = 'indicator_key', all = T, allow.cartesian = T) + + # append trends template to main template + template <- rbind(template, template.trends) + } return(template) } diff --git a/R/chi_qa_tro.R b/R/chi_qa_tro.R index 0529102..17a1db1 100644 --- a/R/chi_qa_tro.R +++ b/R/chi_qa_tro.R @@ -56,68 +56,69 @@ chi_qa_tro <- function(CHIestimates, verbose = TRUE){ status <- 1 + ## Helper functions + report_issue <- function(message) { + status <<- 0 + if(verbose) { + warning(message) + } + return(invisible(NULL)) + } + + report_message <- function(message) { + if(verbose) { + message(message) + } + return(invisible(NULL)) + } + ## Check arguments ---- if(!is.logical(verbose)){ stop('verbose must a logical, i.e., TRUE or FALSE') } - - if(verbose){ - message("Checking that that `acs` is logical") - } + report_message("Checking that that `acs` is logical") if(!is.logical(acs)){ stop('acs must a logical, i.e., TRUE or FALSE') } - - if(verbose){ - message("Checking that both the results and the metadata were provided") + report_message("Checking that both the results and the metadata were provided") + if(is.null(CHIestimates)){ + status <- 0 + if(verbose) { + warning("You must provide the name of a data.frame or data.table that contains the CHI results.") + } } - if(is.null(CHIestimates)){ - status <- 0 - if(verbose) { - warning("You must provide the name of a data.frame or data.table that contains the CHI results.") - } + if(is.null(CHImetadata)){ + status <- 0 + if(verbose){ + warning("You must provide the name of a data.frame or data.table that contains the CHI metadata ") } - if(is.null(CHImetadata)){ - status <- 0 - if(verbose){ - warning("You must provide the name of a data.frame or data.table that contains the CHI metadata ") - } + } + #if both data sets are not provided, abort check + if(status == 0) { + if(verbose){ + stop("Check incomplete. Please correct errors to proceed") } - #if both data sets are not provided, abort check - if(status == 0) { - if(verbose){ - stop("Check incomplete. Please correct errors to proceed") - } - return(status) - } + return(status) + } - CHIestimates <- data.table::setDT(copy(CHIestimates)) - CHImetadata <- data.table::setDT(copy(CHImetadata)) + CHIestimates <- data.table::setDT(copy(CHIestimates)) + CHImetadata <- data.table::setDT(copy(CHImetadata)) ## Check columns ---- - if(verbose) { - message("Checking that all column names are unique") - } + report_message("Checking that all column names are unique") if(length(names(CHIestimates)) != length(unique(names(CHIestimates)))) { - status <- 0 - if(verbose){ - warning("You submitted a dataset where at least two columns have the same name. All names in CHIestimates must be unique.") - } + report_issue("You submitted a dataset where at least two columns have the same name. All names in CHIestimates must be unique.") } + if(length(names(CHImetadata)) != length(unique(names(CHImetadata)))) { - status <- 0 - if(verbose){ - warning("You submitted a metadata table where at least two columns have the same name. All names in CHImetadata must be unique.") - } + report_issue("You submitted a metadata table where at least two columns have the same name. All names in CHImetadata must be unique.") } - if(verbose){ - message("Checking that all necessary columns exist") - } + report_message("Checking that all necessary columns exist") missing.var <- setdiff(chi_get_cols(), names(CHIestimates)) if(length(missing.var) > 0){ status <- 0 @@ -136,9 +137,7 @@ chi_qa_tro <- function(CHIestimates, } } - if(verbose){ - message("Checking for unexpected columns") - } + report_message("Checking for unexpected columns") missing.var <- setdiff(names(CHIestimates), chi_get_cols()) if(length(missing.var) > 0){ status <- 0 @@ -160,25 +159,16 @@ chi_qa_tro <- function(CHIestimates, } ## Confirm variable class ---- - if(verbose){ - message("Checking that variables are of the proper class") - } - - if(verbose){ - message("Validating CHI estimates: ") - } + report_message("Checking that variables are of the proper class") + report_message("Validating CHI estimates: ") rads::tsql_validate_field_types(ph.data = CHIestimates, field_types = unlist(chi_get_yaml()$vars)) # check CHI estimate table - if(verbose){ - message(paste("", "Validating CHI metadata: ", sep = "\n")) - } + report_message(paste("", "Validating CHI metadata: ", sep = "\n")) rads::tsql_validate_field_types(ph.data = CHImetadata, field_types = unlist(chi_get_yaml()$metadata)) # check CHI metadata table if(verbose) message(paste("", "", sep = "\n")) ## Check for missingness ---- - if(verbose){ - message("Checking that critical columns are not missing any values") - } + report_message("Checking that critical columns are not missing any values") for(mycol in c("indicator_key", "year", "data_source", "tab", "cat1", "cat1_group", "run_date")){ if(nrow(CHIestimates[is.na(get(mycol))]) > 0){ @@ -187,16 +177,30 @@ chi_qa_tro <- function(CHIestimates, } } - for(mycol in c("result", "lower_bound", "upper_bound", "se", "rse", "numerator", "denominator", "chi", "source_date", "run_date")){ + if(isFALSE(acs)) + for(mycol in c("numerator", "denominator")){ + if(nrow(CHIestimates[is.na(get(mycol)) & is.na(suppression)]) > 0){ + status <- 0 + warning(paste0("\U00026A0 Warning: '", mycol, "' is missing in at least one row of the CHI data.")) + } + } + + for(mycol in c("result", "lower_bound", "upper_bound", "se", "chi", "source_date", "run_date")){ if(nrow(CHIestimates[is.na(get(mycol)) & is.na(suppression)]) > 0){ status <- 0 warning(paste0("\U00026A0 Warning: '", mycol, "' is missing in at least one row of the CHI data.")) } } + + if(nrow(CHIestimates[is.na(rse) & is.na(suppression) & numerator != 0]) > 0){ + status <- 0 + warning("\U00026A0 Warning: 'rse' is missing in at least one row of the CHI data where numerator is not 0.") + } + for(mycol in names(unlist(chi_get_yaml()$metadata))){ if(nrow(CHImetadata[is.na(get(mycol))]) > 0){ status <- 0 - warning(paste0("'", mycol, "' is missing in at least one row but is a critical identifier column in CHI metadata. \n", "Fix the error and run this QA script again.")) + warning(paste0("'", mycol, "' is missing in at least one row but is a critical identifier column in CHI metadata.")) } } @@ -213,23 +217,19 @@ chi_qa_tro <- function(CHIestimates, ## Basic logic checks for estimates ---- - if(verbose){ - message("Checking for infinite values, which cannot be pushed to SQL") - } + report_message("Checking for infinite values, which cannot be pushed to SQL") for(var in c("result", "lower_bound", "upper_bound", "se", "rse", "numerator", "denominator")){ if(nrow(CHIestimates[is.infinite(get(var))]) > 0 ){ status <- 0 if(verbose){ - warning(glue::glue("There is at least one row where is.infinite({var}) == T. + warning(glue::glue("There is at least one row where is.infinite({var}) == T. Please fix this problem before rerunning chi_qa_tro() (e.g., by setting it equal to NA) You can view the problematic data by typing something like: View(CHIestimates[is.infinite({var}), ])")) } } } - if(verbose){ - message("Checking that proportions are between zero and one") - } + report_message("Checking that proportions are between zero and one") CHIestimates <- merge(CHIestimates, CHImetadata[, list(indicator_key, result_type)], by = "indicator_key", all.x = TRUE, all.y = FALSE) # merge on result_type if(nrow(CHIestimates[result_type=="proportion" & !result %between% c(0, 1)]) > 0){ status <- 0 @@ -239,9 +239,7 @@ chi_qa_tro <- function(CHIestimates, } } - if(verbose){ - message("Checking if upper_bound is greater than lower_bound") - } + report_message("Checking if upper_bound is greater than lower_bound") if(nrow(CHIestimates[upper_bound < lower_bound, ])){ status <- 0 if(verbose){ @@ -251,9 +249,7 @@ chi_qa_tro <- function(CHIestimates, } } - if(verbose){ - message("Checking that result is less than or equal to the upper bound") - } + report_message("Checking that result is less than or equal to the upper bound") if(nrow(CHIestimates[!(result <= upper_bound)])){ status <- 0 if(verbose){ @@ -263,9 +259,7 @@ chi_qa_tro <- function(CHIestimates, } } - if(verbose){ - message("Checking that result is greater than or equal to the lower_bound") - } + report_message("Checking that result is greater than or equal to the lower_bound") if(nrow(CHIestimates[!(result >= lower_bound)])){ status <- 0 if(verbose){ @@ -274,34 +268,29 @@ chi_qa_tro <- function(CHIestimates, You can view the problematic data by typing something like: View(CHIestimates[!(result >= lower_bound)])") } } - if(verbose){ - message("Checking that lower_bound is not less than zero") - } + + report_message("Checking that lower_bound is not less than zero") if(nrow(CHIestimates[lower_bound < 0])){ status <- 0 if(verbose){ - warning("There is at least one row where the lower_bound is less than zero (i.e., it is negative). + warning("There is at least one row where the lower_bound is less than zero (i.e., it is negative). Please fix this error prior to rerunning the chi_qa_tro() function. You can view the problematic data by typing something like: View(CHIestimates[lower_bound < 0])") } } - if(verbose){ - message("Checking that RSE is between 0 and 100") - } - # confirmed with Abby 2/7/2020 that want RSE * 100 - if(nrow(CHIestimates[!rse %between% c(0, 100)]) > 0 ){ + report_message("Checking that RSE is between 0 and 100") + # Modified to only check non-NA RSE values + if(nrow(CHIestimates[!is.na(rse) & !rse %between% c(0, 100)]) > 0 ){ status <- 0 if(verbose){ warning(paste("There is at least one row where the RSE (relative standard error) is outside the range of (0, 100].", - "This is not necessarily an error, but you should examine the data to make sure it makes sense.", - "You can view the data in question by typing something like: View(CHIestimates[!rse %between% c(0, 100)])", sep = "\n")) + "This is not necessarily an error, but you should examine the data to make sure it makes sense.", + "You can view the data in question by typing something like: View(CHIestimates[!is.na(rse) & !rse %between% c(0, 100)])", sep = "\n")) } } - if(verbose){ - message("Checking that RSE is on scale of 0-100 (i.e., the proportion should have been multiplied by 100)") - } + report_message("Checking that RSE is on scale of 0-100 (i.e., the proportion should have been multiplied by 100)") if(nrow(CHIestimates[!is.na(rse)]) == nrow(CHIestimates[rse <=1])){ status <- 0 if(verbose){ @@ -311,71 +300,19 @@ chi_qa_tro <- function(CHIestimates, } } - if(verbose){ - message("Checking that caution flag exists if RSE >= 30%") - } - if(nrow(CHIestimates[rse>=30 & (caution != "!" | is.na(caution)) ]) > 0 ){ + report_message("Checking that caution flag exists if RSE >= 30%") + # Modified to only check non-NA RSE values + if(nrow(CHIestimates[(numerator == 0 | (!is.na(rse) & rse>=30)) & (caution != "!" | is.na(caution)) ]) > 0 ){ status <- 0 if(verbose){ - warning("There is at least one row where a caution flag ('!') is not used and rse >= 30% or is.na(rse) == T. + warning("There is at least one row where a caution flag ('!') is not used and rse >= 30%. Please fix this error prior to rerunning the chi_qa_tro() function. - You can view the problematic data by typing something like: View(CHIestimates[(rse>=30 | is.na(rse)) & (caution != '!' | is.na(caution))])") + You can view the problematic data by typing something like: View(CHIestimates[!is.na(rse) & rse>=30 & (caution != '!' | is.na(caution))])") } } - if(verbose){ - message("Checking for proper rounding") - } - if(verbose){ - message("Checking that result is rounded to three digits") - } - if(sum(CHIestimates$result != rads::round2(CHIestimates$result, 3), na.rm = T) != 0) { - status <- 0 - if(verbose){ - warning("The 'result' column does not appear to be rounded to 3 digits, as specified in the CHI standards") - } - } - if(verbose){ - message("Checking that lower_bound is rounded to three digits") - } - if(sum(CHIestimates$lower_bound != rads::round2(CHIestimates$lower_bound, 3), na.rm = T) != 0) { - status <- 0 - if(verbose){ - warning("The 'lower_bound' column does not appear to be rounded to 3 digits, as specified in the CHI standards") - } - } - if(verbose){ - message("checking that upper_bound is rounded to three digits") - } - if(sum(CHIestimates$upper_bound != rads::round2(CHIestimates$upper_bound, 3), na.rm = T) != 0) { - status <- 0 - if(verbose){ - warning("The 'upper_bound' column does not appear to be rounded to 3 digits, as specified in the CHI standards") - } - } - if(verbose){ - message("Checking that rse is rounded to three digits") - } - if(sum(CHIestimates$rse != rads::round2(CHIestimates$rse, 3), na.rm = T) != 0) { - status <- 0 - if(verbose){ - warning("The 'rse' column does not appear to be rounded to 3 digits, as specified in the CHI standards") - } - } - if(verbose){ - message("Checking that se is rounded to four digits") - } - if(sum(CHIestimates$se != rads::round2(CHIestimates$se, 4), na.rm = T) != 0) { - status <- 0 - if(verbose){ - warning("The 'se' column does not appear to be rounded to 3 digits, as specified in the CHI standards") - } - } - - if(verbose){ - message("Check that all observations have indicators, categories, tab, and year") - } + report_message("Check that all observations have indicators, categories, tab, and year") for(var in c("indicator_key", "tab", "year", "cat1", "cat1_group", "source_date", "run_date")){ if(nrow(CHIestimates[is.na(get(var))]) > 0 ){ status <- 0 @@ -385,7 +322,8 @@ chi_qa_tro <- function(CHIestimates, } } } - if(acs==F){ + + if(isFALSE(acs)){ if(nrow(CHIestimates[is.na(cat1_varname)]) > 0 ){ status <- 0 if(verbose){ @@ -394,6 +332,7 @@ chi_qa_tro <- function(CHIestimates, } } } + for(var in c("cat2", "cat2_group")){ if(nrow(CHIestimates[tab=="crosstabs" & is.na(get(var))]) > 0 ){ status <- 0 @@ -403,7 +342,8 @@ chi_qa_tro <- function(CHIestimates, } } } - if(acs==F){ + + if(isFALSE(acs)){ if(nrow(CHIestimates[tab=="crosstabs" & is.na(cat2_varname)]) > 0 ){ status <- 0 if(verbose){ @@ -413,10 +353,9 @@ chi_qa_tro <- function(CHIestimates, } } - if(verbose){ - message("Checking that results are present if row is not suppressed") - } - for(var in c("result", "lower_bound", "upper_bound", "se", "rse", "numerator", "denominator")){ + report_message("Checking that results are present if row is not suppressed") + # Modified to handle rse separately, allowing NA when numerator is 0 + for(var in c("result", "lower_bound", "upper_bound", "se", "numerator", "denominator")){ if(nrow(CHIestimates[suppression != "^" & is.na(get(var))]) > 0 ){ status <- 0 if(verbose){ @@ -426,112 +365,312 @@ chi_qa_tro <- function(CHIestimates, } } - ## Ensure cat1/cat2 values meet CHI standards ---- - if(verbose) { - message("Checking that category combinations align with CHI standards") + # Special check for rse, allowing NA when numerator is 0 + if(nrow(CHIestimates[suppression != "^" & is.na(rse) & numerator != 0]) > 0 ){ + status <- 0 + if(verbose){ + warning(glue::glue("There is at least one row that is not suppressed, where 'rse' is missing and numerator is not 0. + Please fill in the missing value before rerunning chi_qa_tro()")) } + } - # Get reference data - ref_combos <- rads.data::misc_chi_byvars[, list(cat, varname, group)] - - # For ACS data, we only check cat and group combinations - if(acs) { - # Check cat1 combinations - chi_cat1_combos <- unique(CHIestimates[!is.na(cat1), list( - cat = cat1, - group = cat1_group - )]) + ## Check rounding ---- + report_message("Checking for proper rounding based on result_type") - cat1_invalid <- chi_cat1_combos[!unique(ref_combos[, list(cat, group)]), on = list(cat, group)] + # Make sure we have result_type information for each observation + if(!"result_type" %in% names(CHIestimates)) { + # Merge only if not already merged above + CHIestimates <- merge(CHIestimates, CHImetadata[, list(indicator_key, result_type)], + by = "indicator_key", all.x = TRUE, all.y = FALSE) + } - if(nrow(cat1_invalid) > 0) { - status <- 0 - if(verbose) { - warning("\U00026A0 \U0001F4E3 WARNING: Found non-standard cat1 combinations:\n", - paste0(" - cat1='", cat1_invalid$cat, "', cat1_group='", - cat1_invalid$group, "'", collapse = "\n"), - "\nThese combinations are not found in rads.data::misc_chi_byvars reference table.") - } + # Check result rounding + report_message("Checking that result is rounded correctly (3 decimals for proportions, 1 decimal for rates)") + proportions_wrong_rounding <- CHIestimates[result_type == "proportion" & + !is.na(result) & + result != rads::round2(result, 3)] + + rates_wrong_rounding <- CHIestimates[result_type == "rate" & + !is.na(result) & + result != rads::round2(result, 1)] + + integers_wrong_rounding <- CHIestimates[result_type == "integer" & + !is.na(result) & + result != round(result)] + + if(nrow(proportions_wrong_rounding) > 0 || + nrow(rates_wrong_rounding) > 0 || + nrow(integers_wrong_rounding) > 0) { + status <- 0 + if(verbose){ + warning("The 'result' column has incorrect rounding. Proportions should be rounded to 3 decimal places, + rates to 1 decimal place, and integers to 0 decimal places.") } + } - # Check cat2 combinations - chi_cat2_combos <- unique(CHIestimates[!is.na(cat2), list( - cat = cat2, - group = cat2_group - )]) + # Check lower_bound rounding + report_message("Checking that lower_bound has same rounding as result") + proportions_wrong_rounding <- CHIestimates[result_type == "proportion" & + !is.na(lower_bound) & + lower_bound != rads::round2(lower_bound, 3)] + + rates_wrong_rounding <- CHIestimates[result_type == "rate" & + !is.na(lower_bound) & + lower_bound != rads::round2(lower_bound, 1)] + + integers_wrong_rounding <- CHIestimates[result_type == "integer" & + !is.na(lower_bound) & + lower_bound != round(lower_bound)] + + if(nrow(proportions_wrong_rounding) > 0 || + nrow(rates_wrong_rounding) > 0 || + nrow(integers_wrong_rounding) > 0) { + status <- 0 + if(verbose){ + warning("The 'lower_bound' column has incorrect rounding. Should match result rounding: proportions to 3 decimal places, + rates to 1 decimal place, and integers to 0 decimal places.") + } + } - cat2_invalid <- chi_cat2_combos[!unique(ref_combos[, list(cat, group)]), on = list(cat, group)] + # Check upper_bound rounding + report_message("Checking that upper_bound has same rounding as result") + proportions_wrong_rounding <- CHIestimates[result_type == "proportion" & + !is.na(upper_bound) & + upper_bound != rads::round2(upper_bound, 3)] + + rates_wrong_rounding <- CHIestimates[result_type == "rate" & + !is.na(upper_bound) & + upper_bound != rads::round2(upper_bound, 1)] + + integers_wrong_rounding <- CHIestimates[result_type == "integer" & + !is.na(upper_bound) & + upper_bound != round(upper_bound)] + + if(nrow(proportions_wrong_rounding) > 0 || + nrow(rates_wrong_rounding) > 0 || + nrow(integers_wrong_rounding) > 0) { + status <- 0 + if(verbose){ + warning("The 'upper_bound' column has incorrect rounding. Should match result rounding: proportions to 3 decimal places, + rates to 1 decimal place, and integers to 0 decimal places.") + } + } - if(nrow(cat2_invalid) > 0) { - status <- 0 - if(verbose) { - warning("\U00026A0 \U0001F4E3 WARNING: Found non-standard cat2 combinations:\n", - paste0(" - cat2='", cat2_invalid$cat, "', cat2_group='", - cat2_invalid$group, "'", collapse = "\n"), - "\nThese combinations are not found in rads.data::misc_chi_byvars reference table.") - } + # Check se rounding (one more decimal than result) + report_message("Checking that se is rounded to one more decimal than result (4 for proportions, 2 for rates)") + proportions_wrong_rounding <- CHIestimates[result_type == "proportion" & + !is.na(se) & + se != rads::round2(se, 4)] + + rates_wrong_rounding <- CHIestimates[result_type == "rate" & + !is.na(se) & + se != rads::round2(se, 2)] + + integers_wrong_rounding <- CHIestimates[result_type == "integer" & + !is.na(se) & + se != rads::round2(se, 1)] + + if(nrow(proportions_wrong_rounding) > 0 || + nrow(rates_wrong_rounding) > 0 || + nrow(integers_wrong_rounding) > 0) { + status <- 0 + if(verbose){ + warning("The 'se' column has incorrect rounding. Should be one more decimal than result: + 4 decimal places for proportions, 2 for rates, and 1 for integers.") } - } else { - # For non-ACS data, check complete combinations including varname - # Check cat1 combinations - chi_cat1_combos <- unique(CHIestimates[!is.na(cat1), list( - cat = cat1, - varname = cat1_varname, - group = cat1_group - )]) - - # wonky tweaks because of annoying structure for Birthing Person's race/ethnicity - chi_cat1_combos[cat %in% c("Birthing person's race/ethnicity", "Birthing person's race"), cat := "[Birthing person's] Race"] - chi_cat1_combos[cat == "Birthing person's ethnicity", cat := "[Birthing person's] Ethnicity"] - chi_cat1_combos[cat == "[Birthing person's] Race" & varname == 'race3' & group == 'Hispanic', cat := "[Birthing person's] Ethnicity"] - chi_cat1_combos[, cat := gsub("Birthing person's eth", "[Birthing person's] Eth", cat)] - - cat1_invalid <- chi_cat1_combos[!ref_combos, on = list(cat, varname, group)] - - if(nrow(cat1_invalid) > 0) { - status <- 0 - if(verbose) { - warning("\U00026A0 \U0001F4E3 WARNING: Found non-standard cat1 combinations:\n", - paste0(" - cat1='", cat1_invalid$cat, "', cat1_varname='", - cat1_invalid$varname, "', cat1_group='", cat1_invalid$group, "'", - collapse = "\n"), - "\nThese combinations are not found in rads.data::misc_chi_byvars reference table.") - } + } + + # Check rse rounding (always 3 decimals) + report_message("Checking that rse is rounded to three digits for all result types") + wrong_rounding <- CHIestimates[!is.na(rse) & rse != rads::round2(rse, 3)] + + if(nrow(wrong_rounding) > 0) { + status <- 0 + if(verbose){ + warning("The 'rse' column has incorrect rounding. Should be rounded to 3 decimal places for all result types.") } + } + + ## Ensure cat1/cat2 values meet CHI standards ---- + if(verbose) { + message("Checking that category combinations align with CHI standards") + } - # Check cat2 combinations - chi_cat2_combos <- unique(CHIestimates[!is.na(cat2), list( - cat = cat2, - varname = cat2_varname, - group = cat2_group - )]) + # Get reference data + ref_combos <- rads.data::misc_chi_byvars[, list(cat, varname, group)] - # wonky tweaks because of annoying structure for Birthing Person's race/ethnicity - chi_cat2_combos[cat %in% c("Birthing person's race/ethnicity", "Birthing person's race"), cat := "[Birthing person's] Race"] - chi_cat2_combos[cat == "Birthing person's ethnicity", cat := "[Birthing person's] Ethnicity"] - chi_cat2_combos[cat == "[Birthing person's] Race" & varname == 'race3' & group == 'Hispanic', cat := "[Birthing person's] Ethnicity"] - chi_cat2_combos[, cat := gsub("Birthing person's eth", "[Birthing person's] Eth", cat)] + # For ACS data, we only check cat and group combinations + if(acs) { + # Check cat1 combinations + chi_cat1_combos <- unique(CHIestimates[!is.na(cat1), list( + cat = cat1, + group = cat1_group + )]) - cat2_invalid <- chi_cat2_combos[!ref_combos, on = list(cat, varname, group)] + cat1_invalid <- chi_cat1_combos[!unique(ref_combos[, list(cat, group)]), on = list(cat, group)] - if(nrow(cat2_invalid) > 0) { + if(nrow(cat1_invalid) > 0) { + status <- 0 + if(verbose) { + warning("\U00026A0 \U0001F4E3 WARNING: Found non-standard cat1 combinations:\n", + paste0(" - cat1='", cat1_invalid$cat, "', cat1_group='", + cat1_invalid$group, "'", collapse = "\n"), + "\nThese combinations are not found in rads.data::misc_chi_byvars reference table.") + } + } + + # Check cat2 combinations + chi_cat2_combos <- unique(CHIestimates[!is.na(cat2), list( + cat = cat2, + group = cat2_group + )]) + + cat2_invalid <- chi_cat2_combos[!unique(ref_combos[, list(cat, group)]), on = list(cat, group)] + + if(nrow(cat2_invalid) > 0) { + status <- 0 + if(verbose) { + warning("\U00026A0 \U0001F4E3 WARNING: Found non-standard cat2 combinations:\n", + paste0(" - cat2='", cat2_invalid$cat, "', cat2_group='", + cat2_invalid$group, "'", collapse = "\n"), + "\nThese combinations are not found in rads.data::misc_chi_byvars reference table.") + } + } + } else { + # For non-ACS data, check complete combinations including varname + # Check cat1 combinations + chi_cat1_combos <- unique(CHIestimates[!is.na(cat1), list( + cat = cat1, + varname = cat1_varname, + group = cat1_group + )]) + + # wonky tweaks because of annoying structure for Birthing Person's race/ethnicity + chi_cat1_combos[cat %in% c("Birthing person's race/ethnicity", "Birthing person's race"), cat := "[Birthing person's] Race"] + chi_cat1_combos[cat == "Birthing person's ethnicity", cat := "[Birthing person's] Ethnicity"] + chi_cat1_combos[cat == "[Birthing person's] Race" & varname == 'race3' & group == 'Hispanic', cat := "[Birthing person's] Ethnicity"] + chi_cat1_combos[, cat := gsub("Birthing person's eth", "[Birthing person's] Eth", cat)] + + cat1_invalid <- chi_cat1_combos[!ref_combos, on = list(cat, varname, group)] + + if(nrow(cat1_invalid) > 0) { + status <- 0 + if(verbose) { + warning("\U00026A0 \U0001F4E3 WARNING: Found non-standard cat1 combinations:\n", + paste0(" - cat1='", cat1_invalid$cat, "', cat1_varname='", + cat1_invalid$varname, "', cat1_group='", cat1_invalid$group, "'", + collapse = "\n"), + "\nThese combinations are not found in rads.data::misc_chi_byvars reference table.") + } + } + + # Check cat2 combinations + chi_cat2_combos <- unique(CHIestimates[!is.na(cat2), list( + cat = cat2, + varname = cat2_varname, + group = cat2_group + )]) + + # wonky tweaks because of annoying structure for Birthing Person's race/ethnicity + chi_cat2_combos[cat %in% c("Birthing person's race/ethnicity", "Birthing person's race"), cat := "[Birthing person's] Race"] + chi_cat2_combos[cat == "Birthing person's ethnicity", cat := "[Birthing person's] Ethnicity"] + chi_cat2_combos[cat == "[Birthing person's] Race" & varname == 'race3' & group == 'Hispanic', cat := "[Birthing person's] Ethnicity"] + chi_cat2_combos[, cat := gsub("Birthing person's eth", "[Birthing person's] Eth", cat)] + + cat2_invalid <- chi_cat2_combos[!ref_combos, on = list(cat, varname, group)] + + if(nrow(cat2_invalid) > 0) { + status <- 0 + if(verbose) { + warning("\U00026A0 \U0001F4E3 WARNING: Found non-standard cat2 combinations:\n", + paste0(" - cat2='", cat2_invalid$cat, "', cat2_varname='", + cat2_invalid$varname, "', cat2_group='", cat2_invalid$group, "'", + collapse = "\n"), + "\nThese combinations are not found in rads.data::misc_chi_byvars reference table.") + } + } + } + + ## Ensure there are no more than 10 years of trend data ---- + trend.years <- sort(unique(CHIestimates[tab == 'trends']$year)) + if( length(trend.years) > 10){ + warning('\U00026A0 There are more than 10 unique years of trend data:\n', + paste(trend.years, collapse = ', ')) + } + + ## Ensure there are no gaps in year data for any indicator and proper multi-year coverage ---- + report_message("Checking for gaps in year data and proper multi-year coverage") + + # Get all unique indicator keys + unique_keys <- unique(CHIestimates$indicator_key) + + # Get all years and split them into single years and multi-year periods + all_years <- unique(CHIestimates$year) + multi_years <- all_years[grepl("-", all_years)] + single_years <- all_years[!grepl("-", all_years)] + + # Check if there are multiple multi-year periods + if(length(multi_years) > 1) { + report_message(paste("\U00002139 Note: Multiple multi-year periods detected:", paste(multi_years, collapse=", "))) + } + + # Define excluded indicators (those that don't need to be in multi-year periods) + excluded_5y_indicators <- c("uninsured_all", "uninsured_adult", "uninsured_child", "uninsured_senior", "uninsured") + + # For multi-year periods, check that each indicator exists + if(length(multi_years) > 0) { + multi_year_data <- CHIestimates[year %in% multi_years] + indicators_in_multi <- unique(multi_year_data$indicator_key) + + # Check which indicators are missing from multi-year periods + indicators_to_check <- setdiff(unique_keys, excluded_5y_indicators) + missing_indicators <- setdiff(indicators_to_check, indicators_in_multi) + + if(length(missing_indicators) > 0) { status <- 0 if(verbose) { - warning("\U00026A0 \U0001F4E3 WARNING: Found non-standard cat2 combinations:\n", - paste0(" - cat2='", cat2_invalid$cat, "', cat2_varname='", - cat2_invalid$varname, "', cat2_group='", cat2_invalid$group, "'", - collapse = "\n"), - "\nThese combinations are not found in rads.data::misc_chi_byvars reference table.") + warning(paste("The following indicators are missing from multi-year periods:", + paste0(missing_indicators, collapse=", "))) } } } - ## Ensure there are no more than 10 years of trend data ---- - trend.years <- sort(unique(CHIestimates[tab == 'trends']$year)) - if( length(trend.years) > 10){ - warning('\U00026A0 There are more than 10 unique years of trend data:\n', - paste(trend.years, collapse = ', ')) - } + # Check for continuity in single-year data + if(length(single_years) > 1) { + # Count observations by year for each indicator key (only for single years) + year_count <- unique(CHIestimates[year %in% single_years, list(year, indicator_key)]) + setorder(year_count, indicator_key, year) + + # For each indicator, check if there are breaks in the year sequence + year_count[, year := as.integer(year)] + + # Process each indicator separately + unique_indicators <- unique(year_count$indicator_key) + + for(ind_key in unique_indicators) { + ind_data <- year_count[indicator_key == ind_key] + if(nrow(ind_data) > 1) { # Only check if there are at least 2 years + ind_data[, diff := c(NA, diff(year))] # get difference between subsequent years in the data. + + # Check for gaps, with special handling for 2020 when acs=TRUE + if(acs) { + # For ACS data, allow gap if it's 2020 + gaps <- ind_data[diff > 1 & (year != 2020 & year-1 != 2020), ] + } else { + # For non-ACS data, any gap is an issue + gaps <- ind_data[diff > 1, ] + } + + if(nrow(gaps) > 0) { + status <- 0 + if(verbose) { + gap_years <- paste(gaps$year - 1, gaps$year, sep="-") + warning(paste("Indicator", ind_key, "has gaps in years:", paste(gap_years, collapse=", "))) + } + } + } + } + } ## Print success statement!!!!!!!! #### if(verbose) { @@ -545,6 +684,5 @@ chi_qa_tro <- function(CHIestimates, ## Return ---- return(status) - } diff --git a/R/chi_sql_update.R b/R/chi_update_sql.R similarity index 99% rename from R/chi_sql_update.R rename to R/chi_update_sql.R index 1d5bd4d..d34571c 100644 --- a/R/chi_sql_update.R +++ b/R/chi_update_sql.R @@ -25,7 +25,7 @@ #' @examples #' \dontrun{ #' # Update development database -#' chi_sql_update( +#' chi_update_sql( #' CHIestimates = final_estimates, #' CHImetadata = final_metadata, #' table_name = "birth", @@ -52,7 +52,7 @@ #' #' @export #' -chi_sql_update <- function(CHIestimates = NULL, +chi_update_sql <- function(CHIestimates = NULL, CHImetadata = NULL, table_name = NULL, server = 'development', # options include c('development', 'production') diff --git a/R/globals.R b/R/globals.R index e9614f6..d646335 100644 --- a/R/globals.R +++ b/R/globals.R @@ -46,6 +46,7 @@ utils::globalVariables(c( "latest_year_result", "latest_year_resultx", "latest_yearx", + "level", "lower_bound", "notable", "numerator", @@ -69,6 +70,7 @@ utils::globalVariables(c( "run_date", "run_datex", "s2t_fraction", + "se", "source_id", "span", "start", diff --git a/man/chi_calc.Rd b/man/chi_calc.Rd index 60c69e5..30b2783 100644 --- a/man/chi_calc.Rd +++ b/man/chi_calc.Rd @@ -14,7 +14,8 @@ chi_calc( suppress_low = 0, suppress_high = 9, source_name = NULL, - source_date = NULL + source_date = NULL, + non_chi_byvars = NULL ) } \arguments{ @@ -37,12 +38,15 @@ chi_calc( \item{source_name}{character. Name of data source. Default: \code{NULL}.} \item{source_date}{Date. Date ph.data was created. Default: \code{NULL}.} + +\item{non_chi_byvars}{character vector. Variable names to exclude from CHI byvar encoding validation. Default: \code{NULL}.} } \value{ A data.table containing CHI estimates with the following columns: \itemize{ \item{\code{data_source}} Data source (e.g., acs, brfss, etc.) \item{\code{indicator_key}} Unique indicator key + \item{\code{level}} factor level of the indicator_key (e.g., Breech vs Cephalic vs other for fetal_pres in birth data) \item{\code{tab}} Type of analysis (e.g., demgroups, _kingcounty, etc.) \item{\code{year}} Year(s) of data \item{\code{cat1}} Describes data field (e.g., Gender, Ethnicity, etc.) @@ -66,7 +70,7 @@ A data.table containing CHI estimates with the following columns: } } \description{ -Generates CHI estimates from input data according to provided instructions. +Generates CHI estimates from input data according to provided instructions. Handles both proportions and rates, with options for suppression of small numbers. } \seealso{ diff --git a/man/chi_compare_estimates.Rd b/man/chi_compare_estimates.Rd index 1eb4830..896f800 100644 --- a/man/chi_compare_estimates.Rd +++ b/man/chi_compare_estimates.Rd @@ -59,5 +59,5 @@ comparison <- chi_compare_estimates( \seealso{ \code{\link{chi_qa_tro}} for validating individual datasets -\code{\link{chi_sql_update}} for uploading validated results +\code{\link{chi_update_sql}} for uploading validated results } diff --git a/man/chi_generate_analysis_set.Rd b/man/chi_generate_analysis_set.Rd index e8e45fd..1de8132 100644 --- a/man/chi_generate_analysis_set.Rd +++ b/man/chi_generate_analysis_set.Rd @@ -4,12 +4,15 @@ \alias{chi_generate_analysis_set} \title{Generate Analysis Sets for CHI} \usage{ -chi_generate_analysis_set(data_source = NULL) +chi_generate_analysis_set(data_source = NULL, CHIestimates = NULL) } \arguments{ \item{data_source}{Character string (length 1) specifying the data source to analyze (e.g., \code{'birth'}, \code{'brfss'}, \code{'chars'}, \code{'death'}, \code{'hys'}). This corresponds to a table -name in the PHExtractStore database.} +name in the PHExtractStore database on the TSQL Production Server. Required if \code{CHIestimates} is not provided.} + +\item{CHIestimates}{A data.table or data.frame containing CHI data to analyze. If provided, \code{data_source} +is ignored. Must comply with the latest CHI standard, which can be assessed/tested with the \code{chi_qa_tro} function in this package.} } \value{ A data.table containing analysis sets with the following columns: @@ -17,33 +20,36 @@ A data.table containing analysis sets with the following columns: \item \code{set}: Integer identifying groups of indicators with identical analysis patterns \item \code{cat1}: Category name as expected in CHI TRO \item \code{cat1_varname}: Category variable name as expected in CHI TRO - \item \code{_kingcounty}: \code{'x'} if analysis includes King County tab - \item \code{_wastate}: \code{'x'} if analysis includes Washington State tab - \item \code{demgroups}: \code{'x'} if analysis includes demographic groups - \item \code{crosstabs}: \code{'x'} if analysis includes crosstabulations - \item \code{trends}: \code{'x'} if analysis includes trends analysis + \item \code{_kingcounty}: \code{'x'} if analysis includes '_kingcounty' in the tab column + \item \code{_wastate}: \code{'x'} if analysis includes '_wastate' in the tab column + \item \code{demgroups}: \code{'x'} if analysis includes 'demgroups' in the tab column + \item \code{crosstabs}: \code{'x'} if analysis includes 'crosstabs' in the tab column + \item \code{trends}: \code{'x'} if analysis includes 'trends' in the tab column \item \code{set_indicator_keys}: Comma-separated list of indicator keys sharing the pattern } } \description{ Creates sets of indicator keys that share common analysis patterns within CHI -(Community Health Indicators) data. This function reads the most recent production -version of CHI estimates for a specified data source and groups indicators that -use the same combinations of \code{cat1}, \code{cat1_varname}, and \code{trends} -columns. +(Community Health Indicators) data. This function can either: + +1. Read the most recent production version of CHI estimates from the TSQL Production Server + for a specified data source, or + +2. Process a provided data.table/data.frame containing CHI data + +It groups indicators that use the same combinations of \code{cat1}, \code{cat1_varname}, and +tab columns (\code{_kingcounty}, \code{_wastate}, \code{demgroups}, \code{crosstabs}, \code{trends}). } \details{ This function generates a table for \code{\link{chi_generate_tro_shell}}, providing the -structure required for generating analysis instructions. It connects to the -\code{[PHExtractStore]} database on \code{KCITSQLPRPHIP40} to retrieve the latest production data. +structure required for generating analysis instructions. + +When using \code{data_source}, the function connects to the +\code{[PHExtractStore]} database on the TSQL Production Server \code{KCITSQLPRPHIP40} to retrieve the latest production estimates. Users need appropriate database credentials - contact your manager if you need access. -Typically, the analysis will not change from year to year. However, you should -compare the output of this function with the 'CHI-Variables_ByDataSource' worksheet -in -\href{https://kc1.sharepoint.com/teams/DPH-CommunityHealthIndicators/CHIVizes/CHI-Standards-TableauReady\%20Output.xlsx}{ -SharePoint > Community Health Indicators > CHI_vizes > CHI-Standards-TableauReady Output.xlsx} -and update it as necessary. +When using \code{CHIestimates}, the provided estimates are processed directly, bypassing the database connection. +The data must comply with the latest CHI standard, which can be assessed/tested with the \code{chi_qa_tro} function. The output structure directly informs \code{\link{chi_generate_tro_shell}} about which indicators should be analyzed together based on their shared patterns of @@ -51,11 +57,15 @@ categories and analysis types. } \examples{ \dontrun{ -# Generate analysis sets for birth data +# Generate analysis sets for birth data from the database birth_sets <- chi_generate_analysis_set("birth") -# Generate analysis sets for BRFSS data +# Generate analysis sets for BRFSS data from the database brfss_sets <- chi_generate_analysis_set("brfss") + +# Generate analysis sets from an existing data.table +my_data <- data.table::fread("my_chi_data.csv") +custom_sets <- chi_generate_analysis_set(CHIestimates = my_data) } } diff --git a/man/chi_generate_metadata.Rd b/man/chi_generate_metadata.Rd index 65c8a50..e53a5bb 100644 --- a/man/chi_generate_metadata.Rd +++ b/man/chi_generate_metadata.Rd @@ -4,18 +4,28 @@ \alias{chi_generate_metadata} \title{CHI Generate Metadata} \usage{ -chi_generate_metadata(meta.old = NULL, est.current = NULL) +chi_generate_metadata( + meta.old = NULL, + est.current = NULL, + allowed_missing = list(list(indicator_key = "medinc", column = "latest_year_count")), + warn_only = FALSE +) } \arguments{ \item{meta.old}{Name of a data.table with the prior year's metadata} \item{est.current}{current year's tableau ready output with completed estimates} + +\item{allowed_missing}{A list of lists specifying allowed missing metadata values. Each inner list +should have \code{indicator_key} and \code{column} elements.} + +\item{warn_only}{Logical, if TRUE, only warns about unexpected missing values instead of stopping} } \value{ table of metadata } \description{ -function to generate metadata table combining existing metadata and latest estimates. +Function to generate metadata table combining existing metadata and latest estimates. } \seealso{ \code{\link{chi_calc}} for generating estimates diff --git a/man/chi_sql_update.Rd b/man/chi_update_sql.Rd similarity index 92% rename from man/chi_sql_update.Rd rename to man/chi_update_sql.Rd index 7ec1e5e..f134dc7 100644 --- a/man/chi_sql_update.Rd +++ b/man/chi_update_sql.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/chi_sql_update.R -\name{chi_sql_update} -\alias{chi_sql_update} +% Please edit documentation in R/chi_update_sql.R +\name{chi_update_sql} +\alias{chi_update_sql} \title{CHI SQL Update} \usage{ -chi_sql_update( +chi_update_sql( CHIestimates = NULL, CHImetadata = NULL, table_name = NULL, @@ -45,7 +45,7 @@ SharePoint > DPH-KCCross-SectorData > Documents > References > SQL > SQL Server \examples{ \dontrun{ # Update development database -chi_sql_update( +chi_update_sql( CHIestimates = final_estimates, CHImetadata = final_metadata, table_name = "birth", diff --git a/quarto_docs/Calculating_Prevalences.qmd b/quarto_docs/Calculating_Prevalences.qmd index 6cf6856..db37ad1 100644 --- a/quarto_docs/Calculating_Prevalences.qmd +++ b/quarto_docs/Calculating_Prevalences.qmd @@ -217,7 +217,27 @@ Here are two rows from `myestimates` that will show the structure and contents o pretty_kable(head(myestimates[sample(1:.N, 2)])) ``` -# Tidying Race/Ethnicity Categories +## Restoring Sequential Processing + +After completing the parallel processing job, it's important to restore the system returns to its normal state. + +```{r future-revert} +future::plan(future::sequential) +``` + +# Tidying `chi_calc()` output + +## Drop the `level` column + +The `chi_calc()` function returns a column called `level`, which contains the specific factor level for categorical variables. For example, if `indicator_key == 'fetal_pres'`, the function would return separate rows for each possible presentation: a row with `level == 'Breech'`, another with `level == 'Cephalic'`, and another with `level == 'Other'`. In these cases, we would use the `level` column to filter for our factor level of interest. + +In our current example, since `bw_norm` is not a categorical variable, the `level` column contains only `NA` values and can be safely dropped. + +```{r tidy-level} +myestimates[, level := NULL] +``` + +## Race/Ethnicity Due to the way that APDE decided to display race3 (Hispanic as ethnicity) and race4 (Hispanic as race), the calculation and preparation of these variables can be complex. We need to manipulate the results to align them with CHI standards. @@ -227,26 +247,27 @@ For future reference, all CHI standards can be found in `rads.data::misc_chi_byv #| warning: false #| message: false -# Update race4 categories -myestimates[tab %in% c('demgroups', 'crosstabs') & cat1_varname == 'race4', +# For race4 categories (both cat1 and cat2) +myestimates[tab %in% c('demgroups', 'crosstabs') & cat1_varname == 'race4', cat1 := "Birthing person's race"] -myestimates[tab %in% c('demgroups', 'crosstabs') & cat2_varname == 'race4', +myestimates[tab %in% c('demgroups', 'crosstabs') & cat2_varname == 'race4', cat2 := "Birthing person's race"] -# Update race3 categories - separate race and ethnicity -myestimates[tab %in% c('demgroups', 'crosstabs') & cat1_varname == 'race3' & cat1_group != 'Hispanic', +# For race3 categories - default to race, override for Hispanic ethnicity +myestimates[tab %in% c('demgroups', 'crosstabs') & cat1_varname == 'race3', cat1 := "Birthing person's race"] -myestimates[tab %in% c('demgroups', 'crosstabs') & cat1_varname == 'race3' & cat1_group == 'Hispanic', +myestimates[tab %in% c('demgroups', 'crosstabs') & cat1_varname == 'race3' & cat1_group == 'Hispanic', cat1 := "Birthing person's ethnicity"] -myestimates[tab %in% c('demgroups', 'crosstabs') & cat2_varname == 'race3' & cat2_group != 'Hispanic', + +myestimates[tab %in% c('demgroups', 'crosstabs') & cat2_varname == 'race3', cat2 := "Birthing person's race"] -myestimates[tab %in% c('demgroups', 'crosstabs') & cat2_varname == 'race3' & cat2_group == 'Hispanic', +myestimates[tab %in% c('demgroups', 'crosstabs') & cat2_varname == 'race3' & cat2_group == 'Hispanic', cat2 := "Birthing person's ethnicity"] -# Update trend data labels -myestimates[tab == 'trends' & cat1_varname %in% c('race3', 'race4'), +# Update trend data labels (both race3 and race4 at once) +myestimates[tab == 'trends' & cat1_varname %in% c('race3', 'race4'), cat1 := "Birthing person's race/ethnicity"] -myestimates[tab == 'trends' & cat2_varname %in% c('race3', 'race4'), +myestimates[tab == 'trends' & cat2_varname %in% c('race3', 'race4'), cat2 := "Birthing person's race/ethnicity"] ``` @@ -375,7 +396,7 @@ We need to save our results and metadata to the development SQL Server. Later, o ```{r save-sql} #| warning: false -chi_sql_update(CHIestimates = myestimates, +chi_update_sql(CHIestimates = myestimates, CHImetadata = mymetadata, table_name = 'junk', # replace with data source name, e.g., 'birth', 'brfss', etc. server = 'development', @@ -464,7 +485,7 @@ New functions you used: - `chi_generate_metadata()` to create an updated metadata table - `chi_qa_tro()` to perform quality assurance checks - `chi_compare_estimates()` to identify notable differences compared to previous estimates -- `chi_sql_update()` to save estimates and metadata to SQL Server +- `chi_update_sql()` to save estimates and metadata to SQL Server If you encounter issues or believe you've found a bug, please submit a GitHub issue at [Issues ยท PHSKC-APDE/apde.chi.tools](https://github.com/PHSKC-APDE/apde.chi.tools/issues). diff --git a/tests/testthat/test-chi_generate_analysis_set.R b/tests/testthat/test-chi_generate_analysis_set.R index 011233e..c5a0e95 100644 --- a/tests/testthat/test-chi_generate_analysis_set.R +++ b/tests/testthat/test-chi_generate_analysis_set.R @@ -1,5 +1,6 @@ # Tests for chi_generate_analysis_set test_that("chi_generate_analysis_set validates inputs", { - expect_error(chi_generate_analysis_set(), "data_source parameter must be provided") - expect_error(chi_generate_analysis_set(123), "data_source must be a single character string") + expect_error(chi_generate_analysis_set(), "Either data_source parameter or CHIestimates parameter must be provided") + expect_error(chi_generate_analysis_set(data_source = 123), "data_source must be a single character string") + expect_error(chi_generate_analysis_set(CHIestimates = 123), "CHIestimates must be a data.table or data.frame") }) diff --git a/tests/testthat/test-chi_sql_update.R b/tests/testthat/test-chi_sql_update.R index 0ce77ad..3962127 100644 --- a/tests/testthat/test-chi_sql_update.R +++ b/tests/testthat/test-chi_sql_update.R @@ -1,9 +1,9 @@ -# Tests for chi_sql_update -test_that("chi_sql_update validates inputs", { +# Tests for chi_update_sql +test_that("chi_update_sql validates inputs", { test_data <- setup_test_data() expect_warning( - chi_sql_update( + chi_update_sql( CHIestimates = test_data$my.estimate, CHImetadata = test_data$my.metadata, table_name = 'JustTesting', @@ -13,11 +13,11 @@ test_that("chi_sql_update validates inputs", { "Validation may be flawed for the following variables because they are 100% missing" ) - expect_error(chi_sql_update(), + expect_error(chi_update_sql(), "The results table to push to SQL \\(CHIestimates\\) is missing") - expect_error(suppressWarnings(chi_sql_update(CHIestimates = test_data$my.estimate)), + expect_error(suppressWarnings(chi_update_sql(CHIestimates = test_data$my.estimate)), "The metadata table to push to SQL \\(CHImetadata\\) is missing") - expect_error(suppressWarnings(chi_sql_update(CHIestimates = test_data$my.estimate, + expect_error(suppressWarnings(chi_update_sql(CHIestimates = test_data$my.estimate, CHImetadata = test_data$my.metadata)), "The table_name argument is missing") })