From 8332ec790f73aa351e6ea526efe2ad2ac9f43f2b Mon Sep 17 00:00:00 2001 From: Danny Colombara Date: Fri, 9 May 2025 16:15:23 -0700 Subject: [PATCH 1/2] chi_suppress_results created - based heavily on rads::suppress - main improvement is that it suppresses on denominators (not just numerators) - also add caution when numerator == 0 - more validation - more customizable arguments, but defaults are for CHI output - many more tests - passes all tests without errors or warnings in devtools::check() --- NAMESPACE | 2 + R/chi_suppress_results.R | 304 +++++++++++++++++++++ man/chi_suppress_results.Rd | 139 ++++++++++ tests/testthat/test-chi_suppress_results.R | 224 +++++++++++++++ 4 files changed, 669 insertions(+) create mode 100644 R/chi_suppress_results.R create mode 100644 man/chi_suppress_results.Rd create mode 100644 tests/testthat/test-chi_suppress_results.R diff --git a/NAMESPACE b/NAMESPACE index 6248b5c..b51d2f4 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -15,6 +15,7 @@ export(chi_get_proper_pop) export(chi_get_yaml) export(chi_keep_proper_ages) export(chi_qa_tro) +export(chi_suppress_results) export(chi_update_sql) import(data.table) import(dtsurvey) @@ -30,6 +31,7 @@ importFrom(DBI,dbGetQuery) importFrom(DBI,dbWriteTable) importFrom(data.table,"%between%") importFrom(data.table,":=") +importFrom(data.table,':=') importFrom(data.table,.GRP) importFrom(data.table,.SD) importFrom(data.table,CJ) diff --git a/R/chi_suppress_results.R b/R/chi_suppress_results.R new file mode 100644 index 0000000..e8815c0 --- /dev/null +++ b/R/chi_suppress_results.R @@ -0,0 +1,304 @@ +#' Suppress data due to small numbers and create a caution flag due to reliability +#' +#' @description +#' This function applies primary data suppression based on the numerator and +#' denominator and secondary suppression based on the numerator. It also flags +#' low reliability. Suppressed data are noted by \code{suppression = '^'} and +#' unreliable data are noted by \code{caution = '!'}. +#' +#' @details +#' Data source specific suppression ranges can be found in +#' \href{https://kc1.sharepoint.com/teams/DPH-APDEData/Shared\%20Documents/General/data_presentation_algorithm/APDE_SmallNumberUpdate.xlsx}{APDE_SmallNumberUpdate.xlsx} +#' +#' +#' Please review the +#' \href{https://kc1.sharepoint.com/teams/DPH-APDEData/Shared\%20Documents/General/data_presentation_algorithm/APDEDataPresentationAlgorithm_2020_Approved.pptx}{APDE} +#' and \href{https://www.doh.wa.gov/Portals/1/Documents/1500/SmallNumbers.pdf}{DOH} +#' suppression guidelines for details on the logic used in this function. +#' +#' This function expects data that has already been formatted for CHI. However, +#' the user can modify the default parameter settings for use with most tables of +#' summary data. +#' +#' @param ph.data A data.table or data.frame. Must contain the data to be suppressed +#' with standard metric names, +#' i.e., mean, median, sum, rate, lower, upper, se, rse, numerator, denominator, +#' proportion +#' @param suppress_range Integer vector of length 2. They specify the minimum and +#' maximum range for suppression. +#' @param secondary Logical (\code{T}, \code{TRUE}, \code{F}, or \code{FALSE}) +#' indicating whether secondary suppression should be run +#' @param secondary_ids Character vector of column names which are used to define +#' groups for secondary suppression. +#' Note, this should not include the most granular level. For example, if you wanted +#' secondary suppression for race/ethnicity within HRAs +#' where \code{cat1_varname == 'race4'} and \code{cat1_group == 'AIAN'}, +#' \code{cat1_group == 'Asian'}, \code{cat1_group == 'Black'}, etc., you should +#' have \code{secondary_ids = c('hra20_name', 'cat1_varname')} rather than +#' \code{secondary_ids = c('hra20_name', 'cat1_varname', 'cat1_group')} +#' @param secondary_exclude An unquoted expression that evaluates to a logical vector +#' indicating which rows to include in secondary suppression analysis. Use this parameter +#' to exclude categories that are not mutually exclusive (e.g., overlapping demographic +#' groups). The expression should use column names from the dataset and evaluate to TRUE +#' for rows to include. For example: `secondary_exclude = cat1_varname != "race3"` would +#' exclude all rows where cat1_varname equals "race3" from secondary suppression. +#' @param flag_only Logical (\code{T}, \code{TRUE}, \code{F}, or \code{FALSE}) +#' indicating whether data to be +#' suppressed should be flagged without setting estimates to NA +#' @param numerator_col Character string with the name of the numerator column. Default is "numerator". +#' @param denominator_col Character string with the name of the denominator column. Default is "denominator". +#' @param rse_col Character string with the name of the relative standard error column. Default is "rse". +#' @param columns_to_suppress Character vector of column names to be suppressed (set to NA) when +#' suppression is flagged. Default includes common result columns. +#' +#' @return A data.table with suppression applied to CHI standard columns. +#' +#' @export +#' +#' @keywords suppression +#' +#' @importFrom data.table ':=' data.table is.data.table setDT fsetdiff setorder setorderv copy +#' +#' @examples +#' \dontrun{ +#' set.seed(98104) +#' dt <- data.table::data.table( +#' chi_year = rep(2018, 100), +#' result = rnorm(100, .25, .05), +#' numerator = round(rnorm(100, 20, 9), 0), +#' denominator = round(rnorm(100, 100, 30), 0), +#' rse = sample(1:100, 100, replace = TRUE) +#' ) +#' +#' # Basic suppression with default parameters +#' newdt <- chi_suppress_results(ph.data = dt, +#' suppress_range = c(1, 9), +#' secondary = FALSE) +#' +#' nrow(dt[numerator %in% 1:9 | denominator %in% 1:9]) # rows needed suppression +#' nrow(newdt[suppression=='^']) # rows suppressed +#' nrow(newdt[rse >= 30 | numerator == 0]) # rows needing caution +#' nrow(newdt[caution=='!']) # rows with caution +#' +#' # With secondary suppression +#' dt$region <- sample(c("North", "South", "East", "Seattle"), 100, replace = TRUE) +#' dt$category <- sample(c("A", "B", "C"), 100, replace = TRUE) +#' +#' newdt2 <- chi_suppress_results(ph.data = dt, +#' suppress_range = c(1, 9), +#' secondary = TRUE, +#' secondary_ids = c("region", "category")) +#' +#' nrow(newdt[suppression=='^']) # only primary suppression +#' nrow(newdt2[suppression=='^']) # with secondary suppression +#' +#' # Using custom column names +#' dt2 <- data.table::data.table( +#' chi_year = rep(2018, 100), +#' mean = rnorm(100, .25, .05), +#' num = round(rnorm(100, 20, 9), 0), +#' denom = round(rnorm(100, 100, 30), 0), +#' rel_se = sample(1:100, 100, replace = TRUE) +#' ) +#' +#' newdt3 <- chi_suppress_results(ph.data = dt2, +#' suppress_range = c(1, 9), +#' numerator_col = "num", +#' denominator_col = "denom", +#' rse_col = "rel_se", +#' columns_to_suppress = c("mean", "num", "denom")) +#' +#' nrow(dt2[num %in% 1:9 | denom %in% 1:9]) # rows need suppression +#' nrow(newdt3[suppression == '^']) # rows suppressed +#' } +#' + +chi_suppress_results <- function(ph.data = NULL, + suppress_range = c(1, 9), + secondary = FALSE, + secondary_ids = c("tab", "indicator_key", "cat1", "cat2_group", "year"), + secondary_exclude, + flag_only = FALSE, + numerator_col = "numerator", + denominator_col = "denominator", + rse_col = "rse", + columns_to_suppress = c("result", "lower_bound", "upper_bound", "se", "rse", + "numerator", "denominator")){ + + ## Global variables used by data.table declared as NULL here to play nice with devtools::check() + numerator <- denominator <- suppression <- my.group <- my.order <- my.rowct <- + suppressed.group <- my.flag <- rse <- caution <- rows.unsuppressed <- + result <- lower_bound <- upper_bound <- se <- NULL + + # ---- Validate 'ph.data' ---- + if(is.null(ph.data)){ + stop("\n\U1F6D1 You must specify a dataset (i.e., 'ph.data' must be defined)") + } + + if(!data.table::is.data.table(ph.data)){ + if(is.data.frame(ph.data)){ + data.table::setDT(ph.data) + } else { + stop(paste0("\n\U1F6D1 <{ph.data}> must be the name of a data.frame or data.table.")) + } + } + + # ---- Validate 'suppress_range' ---- + if(is.null(suppress_range)){ + suppress_range <- c(0, 9) + } + + if(!is.null(suppress_range) & + (length(suppress_range) != 2 | suppress_range[1] %% 1 != 0 | suppress_range[2] %% 1 != 0 | + suppress_range[1] < 0 | suppress_range[2] < 0)){ + stop("\n\U1F6D1 must be comprised of two non-negative integers (i.e., 'c(0, 9)'") + } + + # ---- Validate 'secondary' ---- + if(!is.logical(secondary)){ + stop("\n\U1F6D1 'secondary' must be specified as a logical (i.e., TRUE, T, FALSE, or F)") + } + + # ---- Validate 'secondary_ids' ---- + if(secondary == TRUE & length(setdiff(secondary_ids, names(ph.data))) > 0){ + stop("\n\U1F6D1 At least one name in 'secondary_ids' is not found among the column names in 'ph.data'") + } + + # ---- Validate 'secondary_exclude' ---- + if(!missing(secondary_exclude)){ + call = match.call() + + if(is.character(call[['secondary_exclude']])){ + where = str2lang(call[['secondary_exclude']]) + warning('\u26A0\ufe0f `secondary_exclude` is a string. It was converted so that it would work, but in the future, this might turn into an error. + In the future, please pass unquoted commands that will resolve to a logical') + } else { + where = copy(call[['secondary_exclude']]) + } + + e <- substitute(expr = where) # get parse tree expression `where` + myfilter <- eval(expr = e, envir = ph.data, enclos = parent.frame()) # evaluate + stopifnot('`where` does not resolve to a logical' = is.logical(myfilter)) + if(nrow(ph.data[myfilter,]) < 1){ + stop(paste0("\n\U1F6D1 Your 'secondary_exclude' argument filters out all rows of data. Please revise and submit again")) + } + } + + # ---- Validate 'flag_only' ---- + if(!is.logical(flag_only)){ + stop("\n\U1F6D1 'flag_only' must be specified as a logical (i.e., TRUE, T, FALSE, or F)") + } + + # ---- Validate 'numerator_col' ---- + if(!numerator_col %in% names(ph.data)){ + stop(paste0("\n\U1F6D1 Required column '", numerator_col, "' is missing from the dataset")) + } + + # ---- Validate 'denominator_col' ---- + if(!denominator_col %in% names(ph.data)){ + warning(paste0("\u26A0\ufe0f Column '", denominator_col, "' is missing from the dataset. Only numerator-based suppression will be applied.")) + } + + # ---- Validate 'rse_col' ---- + if(!rse_col %in% names(ph.data)){ + warning(paste0("\u26A0\ufe0f Column '", rse_col, "', the value of `rse_col`, is missing from the dataset. `caution` flag for reliability will not be generated.")) + } + + # ---- Validate 'columns_to_suppress' ---- + missing_cols <- setdiff(columns_to_suppress, names(ph.data)) + if(length(missing_cols) > 0){ + warning(paste0("\u26A0\ufe0f The following columns specified in 'columns_to_suppress' are missing from the dataset and will be ignored: ", + paste(missing_cols, collapse = ", "))) + columns_to_suppress <- intersect(columns_to_suppress, names(ph.data)) + } + + + # ---- Copy ph.data to avoid changing the underlying data.table due to modification by references---- + temp.dt <- data.table::setDT(copy(ph.data)) + + # ---- Check for existing suppression and caution columns ---- + if("suppression" %in% names(temp.dt)){ + warning("\u26A0\ufe0f Existing 'suppression' column will be overwritten") + temp.dt[, suppression := NULL] # Remove existing column + } + + if("caution" %in% names(temp.dt)){ + warning("\u26A0\ufe0f Existing 'caution' column will be overwritten") + temp.dt[, caution := NULL] # Remove existing column + } + + # ---- Identify primary suppression ---- + # Check both numerator and denominator for suppression + temp.dt[, suppression := NA_character_] + + temp.dt[get(numerator_col) >= suppress_range[1] & get(numerator_col) <= suppress_range[2], suppression := "^"] + + if(denominator_col %in% names(temp.dt)){ # above used warning, not stop, if denominator_col didn't exist + temp.dt[get(denominator_col) >= suppress_range[1] & get(denominator_col) <= suppress_range[2], suppression := "^"] + } + + # ---- Identify secondary suppression (only based on numerator) ---- + if(isTRUE(secondary)){ + + # Apply secondary_exclude argument + if(!missing(secondary_exclude)){ + myfilter <- eval(expr = e, envir = temp.dt, enclos = parent.frame()) + temp.dt.aside <- data.table::fsetdiff(temp.dt, temp.dt[myfilter,]) + temp.dt <- temp.dt[myfilter,] + } + + # Create group id for each set of secondary_ids + temp.dt[, my.group := .GRP, by = secondary_ids] + data.table::setorder(temp.dt, my.group) + + # Identify max number of rows per group defined by secondary_ids + temp.dt[, my.rowct := .N, by = secondary_ids] + + # Identify groups that had initial suppression + temp.dt[, suppressed.group := FALSE] + temp.dt[my.group %in% unique(temp.dt[suppression == "^"]$my.group), suppressed.group := TRUE] + + # Within groups that had suppression, count the number of rows that were not suppressed + temp.dt[my.group %in% unique(temp.dt[suppressed.group == TRUE]$my.group) & is.na(suppression), + rows.unsuppressed := .N, by = secondary_ids] + suppressWarnings(temp.dt[, rows.unsuppressed := max(rows.unsuppressed, na.rm = TRUE), by = my.group]) + + # Identify when the number of un-suppressed rows (in groups that had suppression) is max rows minus 1 + # (these need secondary suppression) + temp.dt[is.na(suppression) & rows.unsuppressed == my.rowct - 1, + my.flag := "group needs secondary suppression"] + + # Sort table so the smallest numerator per group that needs secondary suppression is first + data.table::setorderv(temp.dt, c('my.group', numerator_col), na.last = TRUE) + + # Suppress row with smallest numerator among groups needing secondary suppression + if(nrow(temp.dt[my.flag == "group needs secondary suppression"]) > 0){ + temp.dt[my.flag == "group needs secondary suppression", my.order := 1:.N, by = my.group] + temp.dt[my.order == 1, suppression := "^"] + } + + # Drop all temporary variables + temp.dt[, (intersect(c("my.group", "suppressed.group", "my.rowct", "my.flag", "my.order", + "rows.unsuppressed"), names(temp.dt))) := NULL] + + # Combine back with data filtered out by secondary_exclude + if(exists("temp.dt.aside")){ + temp.dt <- rbind(temp.dt, temp.dt.aside) + rm(temp.dt.aside) + } + } + + # ---- Apply suppression to columns_to_suppress unless flag_only = FALSE ---- + # Use validated columns_to_suppress + if(isFALSE(flag_only)){ + temp.dt[suppression == "^", (columns_to_suppress) := NA] + } + + # ---- Apply caution flag if possible ---- + if(rse_col %in% names(temp.dt)){ + temp.dt[get(rse_col) >= 30 | get(numerator_col) == 0, caution := "!"] + } + + return(temp.dt) + +} diff --git a/man/chi_suppress_results.Rd b/man/chi_suppress_results.Rd new file mode 100644 index 0000000..a6c283e --- /dev/null +++ b/man/chi_suppress_results.Rd @@ -0,0 +1,139 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/chi_suppress_results.R +\name{chi_suppress_results} +\alias{chi_suppress_results} +\title{Suppress data due to small numbers and create a caution flag due to reliability} +\usage{ +chi_suppress_results( + ph.data = NULL, + suppress_range = c(1, 9), + secondary = FALSE, + secondary_ids = c("tab", "indicator_key", "cat1", "cat2_group", "year"), + secondary_exclude, + flag_only = FALSE, + numerator_col = "numerator", + denominator_col = "denominator", + rse_col = "rse", + columns_to_suppress = c("result", "lower_bound", "upper_bound", "se", "rse", + "numerator", "denominator") +) +} +\arguments{ +\item{ph.data}{A data.table or data.frame. Must contain the data to be suppressed +with standard metric names, +i.e., mean, median, sum, rate, lower, upper, se, rse, numerator, denominator, +proportion} + +\item{suppress_range}{Integer vector of length 2. They specify the minimum and +maximum range for suppression.} + +\item{secondary}{Logical (\code{T}, \code{TRUE}, \code{F}, or \code{FALSE}) +indicating whether secondary suppression should be run} + +\item{secondary_ids}{Character vector of column names which are used to define +groups for secondary suppression. +Note, this should not include the most granular level. For example, if you wanted +secondary suppression for race/ethnicity within HRAs +where \code{cat1_varname == 'race4'} and \code{cat1_group == 'AIAN'}, +\code{cat1_group == 'Asian'}, \code{cat1_group == 'Black'}, etc., you should +have \code{secondary_ids = c('hra20_name', 'cat1_varname')} rather than +\code{secondary_ids = c('hra20_name', 'cat1_varname', 'cat1_group')}} + +\item{secondary_exclude}{An unquoted expression that evaluates to a logical vector +indicating which rows to include in secondary suppression analysis. Use this parameter +to exclude categories that are not mutually exclusive (e.g., overlapping demographic +groups). The expression should use column names from the dataset and evaluate to TRUE +for rows to include. For example: `secondary_exclude = cat1_varname != "race3"` would +exclude all rows where cat1_varname equals "race3" from secondary suppression.} + +\item{flag_only}{Logical (\code{T}, \code{TRUE}, \code{F}, or \code{FALSE}) +indicating whether data to be +suppressed should be flagged without setting estimates to NA} + +\item{numerator_col}{Character string with the name of the numerator column. Default is "numerator".} + +\item{denominator_col}{Character string with the name of the denominator column. Default is "denominator".} + +\item{rse_col}{Character string with the name of the relative standard error column. Default is "rse".} + +\item{columns_to_suppress}{Character vector of column names to be suppressed (set to NA) when +suppression is flagged. Default includes common result columns.} +} +\value{ +A data.table with suppression applied to CHI standard columns. +} +\description{ +This function applies primary data suppression based on the numerator and +denominator and secondary suppression based on the numerator. It also flags +low reliability. Suppressed data are noted by \code{suppression = '^'} and +unreliable data are noted by \code{caution = '!'}. +} +\details{ +Data source specific suppression ranges can be found in +\href{https://kc1.sharepoint.com/teams/DPH-APDEData/Shared\%20Documents/General/data_presentation_algorithm/APDE_SmallNumberUpdate.xlsx}{APDE_SmallNumberUpdate.xlsx} + + +Please review the +\href{https://kc1.sharepoint.com/teams/DPH-APDEData/Shared\%20Documents/General/data_presentation_algorithm/APDEDataPresentationAlgorithm_2020_Approved.pptx}{APDE} +and \href{https://www.doh.wa.gov/Portals/1/Documents/1500/SmallNumbers.pdf}{DOH} +suppression guidelines for details on the logic used in this function. + +This function expects data that has already been formatted for CHI. However, +the user can modify the default parameter settings for use with most tables of +summary data. +} +\examples{ +\dontrun{ +set.seed(98104) +dt <- data.table::data.table( + chi_year = rep(2018, 100), + result = rnorm(100, .25, .05), + numerator = round(rnorm(100, 20, 9), 0), + denominator = round(rnorm(100, 100, 30), 0), + rse = sample(1:100, 100, replace = TRUE) +) + +# Basic suppression with default parameters +newdt <- chi_suppress_results(ph.data = dt, + suppress_range = c(1, 9), + secondary = FALSE) + +nrow(dt[numerator \%in\% 1:9 | denominator \%in\% 1:9]) # rows needed suppression +nrow(newdt[suppression=='^']) # rows suppressed +nrow(newdt[rse >= 30 | numerator == 0]) # rows needing caution +nrow(newdt[caution=='!']) # rows with caution + +# With secondary suppression +dt$region <- sample(c("North", "South", "East", "Seattle"), 100, replace = TRUE) +dt$category <- sample(c("A", "B", "C"), 100, replace = TRUE) + +newdt2 <- chi_suppress_results(ph.data = dt, + suppress_range = c(1, 9), + secondary = TRUE, + secondary_ids = c("region", "category")) + +nrow(newdt[suppression=='^']) # only primary suppression +nrow(newdt2[suppression=='^']) # with secondary suppression + +# Using custom column names +dt2 <- data.table::data.table( + chi_year = rep(2018, 100), + mean = rnorm(100, .25, .05), + num = round(rnorm(100, 20, 9), 0), + denom = round(rnorm(100, 100, 30), 0), + rel_se = sample(1:100, 100, replace = TRUE) +) + +newdt3 <- chi_suppress_results(ph.data = dt2, + suppress_range = c(1, 9), + numerator_col = "num", + denominator_col = "denom", + rse_col = "rel_se", + columns_to_suppress = c("mean", "num", "denom")) + +nrow(dt2[num \%in\% 1:9 | denom \%in\% 1:9]) # rows need suppression +nrow(newdt3[suppression == '^']) # rows suppressed +} + +} +\keyword{suppression} diff --git a/tests/testthat/test-chi_suppress_results.R b/tests/testthat/test-chi_suppress_results.R new file mode 100644 index 0000000..3d1e234 --- /dev/null +++ b/tests/testthat/test-chi_suppress_results.R @@ -0,0 +1,224 @@ +library('data.table') +library('testthat') + +# create test data ---- +set.seed(98104) +dt <- suppressWarnings(data.table::data.table(chi_year = 2022, + indicator = "home ownership", + team = sample(as.vector(outer(letters, 1:15, paste0)), rep = T), + color = c("red", "blue", "yellow", "green"), + numerator = sample(1:200, 1000, rep = TRUE))) +dt[, denominator := sample(500:1000, 1000, rep = TRUE)] +dt[, result := numerator / denominator] +dt[, se := sqrt(result/100)] # not a real formula! +dt[, lower_bound := result - (1.96 * se)] +dt[, upper_bound := result + (1.96 * se)] +dt[, rse := 100*se / result] +setorder(dt, indicator, team, color, numerator) +dt[, counter := 1:.N, c("indicator", "team", "color")] +dt <- dt[counter == 1][, counter := NULL] + +# test defaults ---- +dt1 <- chi_suppress_results(dt) +test_that('Check that defaults work as expected',{ + expect_equal(nrow(dt1[suppression=="^"]), nrow(dt[numerator %in% 1:9 | denominator %in% 1:9])) + expect_equal(nrow(dt1[suppression=="^"]), nrow(dt1[is.na(result)])) + expect_equal(nrow(dt1[suppression=="^"]), nrow(dt1[is.na(se)])) + expect_equal(nrow(dt1[suppression=="^"]), nrow(dt1[is.na(lower_bound)])) + expect_equal(nrow(dt1[suppression=="^"]), nrow(dt1[is.na(rse)])) + expect_equal(nrow(dt1[caution=="!"]), nrow(dt[!(numerator %in% 1:9 | denominator %in% 1:9) & (rse >=30 | numerator == 0)])) +}) + +# test suppression range ---- +dt2 <- chi_suppress_results(dt, suppress_range = c(0,10), secondary = FALSE) +test_that('Check that the suppression_range argument works',{ + expect_equal(nrow(dt2[suppression=="^"]), nrow(dt[numerator <= 10 | denominator <= 10])) + expect_equal(nrow(dt2[suppression=="^"]), nrow(dt2[is.na(result)])) + expect_equal(nrow(dt2[suppression=="^"]), nrow(dt2[is.na(se)])) + expect_equal(nrow(dt2[suppression=="^"]), nrow(dt2[is.na(lower_bound)])) + expect_equal(nrow(dt2[suppression=="^"]), nrow(dt2[is.na(rse)])) + expect_equal(nrow(dt2[caution=="!"]), nrow(dt[!(numerator %in% 0:10 | denominator %in% 0:10) & (rse >=30)])) +}) + +# test secondary suppression ---- +dt3 <- chi_suppress_results(dt, suppress_range = c(0,10), + secondary = TRUE, + secondary_ids = c("indicator", "team")) +#ugly manual method to apply secondary suppression for comparison +sec.suppress3 <- copy(dt2) # build off results from initial / primary suppression +sec.suppress3[, max.grp.rows := .N, .(indicator, team)] # num of rows per set of secondary_ids +sec.suppress3[, group := .GRP, by = .(indicator, team)] # create group id for each set of secondary_ids +supp.ids <- unique(sec.suppress3[suppression=="^"]$group) # get group ids where there was initial suppression +sec.suppress3[, suppressed.group := F] +sec.suppress3[group %in% supp.ids, suppressed.group := T] # identify groups with initial suppression in table +sec.suppress3[group %in% supp.ids & is.na(suppression), unsuppressed := .N, .(indicator, team)] # rows unsuppressed per group +suppressWarnings(sec.suppress3[, unsuppressed := max(unsuppressed, na.rm = T), .(indicator, team)]) # fill in NA for rows unsuppressed +sec.suppress3[is.na(suppression) & unsuppressed == max.grp.rows - 1, secondary.suppression := T] # identify groups that need secondary suppression (groups with exactly one suppressed row) +setorder(sec.suppress3, group, numerator, na.last = T) # sort from smallest to largest numerator by group +sec.suppress3[secondary.suppression == T, order := 1:.N, group] # identify 1st row (smallest numerator) of each group needing secondary suppression +sec.suppress3[order==1, suppression := "^"] # mark the specific rows to have secondary suppression +sec.suppress3[suppression == "^", c("numerator", "denominator", "result", "se", "lower_bound", "upper_bound", "rse", "caution") := NA] +sec.suppress3[, c("max.grp.rows", "group", "suppressed.group", "unsuppressed", "secondary.suppression", "order") := NULL] + +test_that('Check that secondary suppression works',{ + expect_equal(nrow(dt3[suppression=="^"]), nrow(sec.suppress3[suppression=="^"])) + expect_equal(nrow(dt3[suppression=="^"]), nrow(dt3[is.na(result)])) + expect_equal(nrow(dt3[suppression=="^"]), nrow(dt3[is.na(se)])) + expect_equal(nrow(dt3[suppression=="^"]), nrow(dt3[is.na(lower_bound)])) + expect_equal(nrow(dt3[suppression=="^"]), nrow(dt3[is.na(rse)])) +}) + +# test NA handling in numerator and demominator ---- +dt_na <- copy(dt) +dt_na[1:10, numerator := NA] +dt_na[11:20, denominator := NA] +dt_na_result <- chi_suppress_results(dt_na) +test_that('Check NA handling in numerator and denominator', { + # Check whether NAs are properly handled and not incorrectly flagged + expect_equal(sum(is.na(dt_na_result[1:20]$suppression)), 20) +}) + +# test missing numerator column ---- +dt_no_num <- copy(dt)[, numerator := NULL] +test_that('Check error when numerator column is missing', { + expect_error(chi_suppress_results(dt_no_num), "Required column 'numerator' is missing") +}) + +# test secondary suppression group ---- +dt_single <- data.table( + chi_year = 2022, + indicator = c("A", "A", "B"), + team = c("team1", "team2", "team1"), + numerator = c(5, 15, 20), + denominator = c(100, 100, 100) +) +dt_single[, result := numerator / denominator] +dt_single[, rse := 20] + +dt_single_result <- suppressWarnings(chi_suppress_results(dt_single, + secondary = TRUE, + secondary_ids = c("indicator"))) + +test_that('Check secondary suppression with single row group', { + # Group A should be entirely suppressed and + # Group B shoudl have one unsuppressed row + expect_equal(nrow(dt_single_result[indicator == "A" & suppression=='^']), 2) + expect_equal(nrow(dt_single_result[indicator == "B" & is.na(suppression)]), 1) +}) + +# test secondary suppression with secondary_exclude ---- +dt4 <- chi_suppress_results(dt, suppress_range = c(0,10), + secondary = TRUE, + secondary_ids = c("indicator", "team"), + secondary_exclude = !team %in% c('a10', 'a11')) + +#ugly manual method to apply secondary suppression for testing +exclusion4 <- copy(dt2)[team %in% c('a10', 'a11')] # partition off part excluded from secondary suppression +sec.suppress4 <- copy(dt2)[!team %in% c('a10', 'a11')] # build off results from initial / primary suppression +sec.suppress4[, max.grp.rows := .N, .(indicator, team)] # num of rows per set of secondary_ids +sec.suppress4[, group := .GRP, by = .(indicator, team)] # create group id for each set of secondary_ids +supp.ids <- unique(sec.suppress4[suppression=="^"]$group) # get group ids where there was initial suppression +sec.suppress4[, suppressed.group := F] +sec.suppress4[group %in% supp.ids, suppressed.group := T] # identify groups with initial suppression in table +sec.suppress4[group %in% supp.ids & is.na(suppression), unsuppressed := .N, .(indicator, team)] # rows unsuppressed per group +suppressWarnings(sec.suppress4[, unsuppressed := max(unsuppressed, na.rm = T), .(indicator, team)]) # fill in NA for rows unsuppressed +sec.suppress4[is.na(suppression) & unsuppressed == max.grp.rows - 1, secondary.suppression := T] # identify groups that need secondary suppression (groups with exactly one suppressed row) +setorder(sec.suppress4, group, numerator, na.last = T) # sort from smallest to largest numerator by group +sec.suppress4[secondary.suppression == T, order := 1:.N, group] # identify 1st row (smallest numerator) of each group needing secondary suppression +sec.suppress4[order==1, suppression := "^"] # mark the specific rows to have secondary suppression +sec.suppress4[suppression == "^", c("numerator", "denominator", "result", "se", "lower_bound", "upper_bound", "rse", "caution") := NA] +sec.suppress4[, c("max.grp.rows", "group", "suppressed.group", "unsuppressed", "secondary.suppression", "order") := NULL] +sec.suppress4 <- rbind(sec.suppress4, exclusion4) + +test_that('Check that secondary suppression with exclusion works',{ + expect_equal(nrow(dt4[suppression=="^"]), nrow(sec.suppress4[suppression=="^"])) + expect_equal(nrow(dt4[suppression=="^"]), nrow(dt4[is.na(result)])) + expect_equal(nrow(dt4[suppression=="^"]), nrow(dt4[is.na(se)])) + expect_equal(nrow(dt4[suppression=="^"]), nrow(dt4[is.na(lower_bound)])) + expect_equal(nrow(dt4[suppression=="^"]), nrow(dt4[is.na(rse)])) +}) + +# test flag_only ---- +dt5 <- chi_suppress_results(dt, flag_only = TRUE) +test_that('Check that flag_only works',{ + expect_equal(nrow(dt5[suppression=="^"]), nrow(dt[numerator <= 9 | denominator <= 9])) + expect_equal(0, nrow(dt5[is.na(result)])) + expect_equal(0, nrow(dt5[is.na(se)])) + expect_equal(0, nrow(dt5[is.na(lower_bound)])) + expect_equal(0, nrow(dt5[is.na(rse)])) + expect_equal(nrow(dt5[caution=="!"]), nrow(dt[rse >=30 | numerator == 0])) +}) + +# test secondary_exclude when character and unquoted expression ---- +test_that('Check that the same results are returned whether or not quoted',{ + expect_warning(dt6 <- chi_suppress_results(dt, secondary_exclude = "team %like% '^a|^b|^c|^d'")) + dt7 <- chi_suppress_results(dt, secondary_exclude = team %like% '^a|^b|^c|^d') + expect_identical(dt6, dt7) +}) + +# test custom column names ---- +dt_custom <- copy(dt) +setnames(dt_custom, + old = c("numerator", "denominator", "result", "rse"), + new = c("num", "denom", "value", "rel_error")) + +test_that('Check that custom column names work correctly', { + dt_custom_result <- chi_suppress_results( + dt_custom, + numerator_col = "num", + denominator_col = "denom", + rse_col = "rel_error", + columns_to_suppress = c("value", "se", "lower_bound", "upper_bound", "num", "denom") + ) + + expect_equal(nrow(dt_custom_result[suppression=="^"]), + nrow(dt_custom[num <= 9 | denom <= 9])) + expect_equal(nrow(dt_custom_result[caution=="!"]), + nrow(dt_custom[rel_error >= 30 | num == 0])) +}) + +# test missing columns handling ---- +dt_missing <- copy(dt)[, rse := NULL] + +test_that("Check that missing rse column is handled properly", { + warnings <- character() + dt_missing_result <- withCallingHandlers( # needed because will produce two warnings + chi_suppress_results(dt_missing), + warning = function(w) { + warnings <<- c(warnings, conditionMessage(w)) + invokeRestart("muffleWarning") # prevents printing the warning + } + ) + + expect_true(any(grepl("Column 'rse', the value of `rse_col`, is missing", warnings))) + expect_true(any(grepl("columns specified in 'columns_to_suppress' are missing", warnings))) + expect_false("caution" %in% names(dt_missing_result)) +}) + +# test non-existent columns in columns_to_suppress ---- +test_that('Check handling of non-existent columns in columns_to_suppress', { + expect_warning( + dt_cols <- chi_suppress_results( + dt, + columns_to_suppress = c("result", "non_existent_column", "another_missing") + ), + "The following columns specified in 'columns_to_suppress' are missing from the dataset" + ) + + # Should only suppress existing columns + expect_true(all(is.na(dt_cols[suppression=="^"]$result))) +}) + +# test existing suppression column ---- +dt_exist <- copy(dt) +dt_exist[, suppression := "old"] + +test_that('Check that existing suppression column is overwritten with warning', { + expect_warning( + dt_exist_result <- chi_suppress_results(dt_exist), + "Existing 'suppression' column will be overwritten" + ) + + # Should have overwritten the old suppression column + expect_false(nrow(dt_exist_result[suppression == "old"]) > 0) +}) From e086cb9b166a10d7683c6eb47e5872ae3b5e957a Mon Sep 17 00:00:00 2001 From: Danny Colombara Date: Tue, 13 May 2025 14:59:29 -0700 Subject: [PATCH 2/2] added suppression of comparison_with_kc col - added to the columns_to_suppress default - no other changes needed because validation of columns_to_suppress already notifies of missing columns and will remove from columns_to_suppress --- R/chi_suppress_results.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/chi_suppress_results.R b/R/chi_suppress_results.R index e8815c0..be072f2 100644 --- a/R/chi_suppress_results.R +++ b/R/chi_suppress_results.R @@ -123,7 +123,7 @@ chi_suppress_results <- function(ph.data = NULL, denominator_col = "denominator", rse_col = "rse", columns_to_suppress = c("result", "lower_bound", "upper_bound", "se", "rse", - "numerator", "denominator")){ + "numerator", "denominator", "comparison_with_kc")){ ## Global variables used by data.table declared as NULL here to play nice with devtools::check() numerator <- denominator <- suppression <- my.group <- my.order <- my.rowct <-