From 8332ec790f73aa351e6ea526efe2ad2ac9f43f2b Mon Sep 17 00:00:00 2001
From: Danny Colombara <dcolombara@kingcounty.gov>
Date: Fri, 9 May 2025 16:15:23 -0700
Subject: [PATCH 1/2] chi_suppress_results created

 - based heavily on rads::suppress
 - main improvement is that it suppresses on denominators (not just numerators)
 - also add caution when numerator == 0
 - more validation
 - more customizable arguments, but defaults are for CHI output
 - many more tests
 - passes all tests without errors or warnings in devtools::check()
---
 NAMESPACE                                  |   2 +
 R/chi_suppress_results.R                   | 304 +++++++++++++++++++++
 man/chi_suppress_results.Rd                | 139 ++++++++++
 tests/testthat/test-chi_suppress_results.R | 224 +++++++++++++++
 4 files changed, 669 insertions(+)
 create mode 100644 R/chi_suppress_results.R
 create mode 100644 man/chi_suppress_results.Rd
 create mode 100644 tests/testthat/test-chi_suppress_results.R

diff --git a/NAMESPACE b/NAMESPACE
index 6248b5c..b51d2f4 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -15,6 +15,7 @@ export(chi_get_proper_pop)
 export(chi_get_yaml)
 export(chi_keep_proper_ages)
 export(chi_qa_tro)
+export(chi_suppress_results)
 export(chi_update_sql)
 import(data.table)
 import(dtsurvey)
@@ -30,6 +31,7 @@ importFrom(DBI,dbGetQuery)
 importFrom(DBI,dbWriteTable)
 importFrom(data.table,"%between%")
 importFrom(data.table,":=")
+importFrom(data.table,':=')
 importFrom(data.table,.GRP)
 importFrom(data.table,.SD)
 importFrom(data.table,CJ)
diff --git a/R/chi_suppress_results.R b/R/chi_suppress_results.R
new file mode 100644
index 0000000..e8815c0
--- /dev/null
+++ b/R/chi_suppress_results.R
@@ -0,0 +1,304 @@
+#' Suppress data due to small numbers and create a caution flag due to reliability
+#'
+#' @description
+#' This function applies primary data suppression based on the numerator and
+#' denominator and secondary suppression based on the numerator. It also flags
+#' low reliability. Suppressed data are noted by \code{suppression = '^'} and
+#' unreliable data are noted by \code{caution = '!'}.
+#'
+#' @details
+#' Data source specific suppression ranges can be found in
+#' \href{https://kc1.sharepoint.com/teams/DPH-APDEData/Shared\%20Documents/General/data_presentation_algorithm/APDE_SmallNumberUpdate.xlsx}{APDE_SmallNumberUpdate.xlsx}
+#'
+#'
+#' Please review the
+#' \href{https://kc1.sharepoint.com/teams/DPH-APDEData/Shared\%20Documents/General/data_presentation_algorithm/APDEDataPresentationAlgorithm_2020_Approved.pptx}{APDE}
+#' and \href{https://www.doh.wa.gov/Portals/1/Documents/1500/SmallNumbers.pdf}{DOH}
+#' suppression guidelines for details on the logic used in this function.
+#'
+#' This function expects data that has already been formatted for CHI. However,
+#' the user can modify the default parameter settings for use with most tables of
+#' summary data.
+#'
+#' @param ph.data A data.table or data.frame. Must contain the data to be suppressed
+#' with standard metric names,
+#' i.e., mean, median, sum, rate, lower, upper, se, rse, numerator, denominator,
+#' proportion
+#' @param suppress_range Integer vector of length 2. They specify the minimum and
+#' maximum range for suppression.
+#' @param secondary Logical (\code{T}, \code{TRUE}, \code{F}, or \code{FALSE})
+#' indicating whether secondary suppression should be run
+#' @param secondary_ids Character vector of column names which are used to define
+#' groups for secondary suppression.
+#' Note, this should not include the most granular level. For example, if you wanted
+#' secondary suppression for race/ethnicity within HRAs
+#' where \code{cat1_varname == 'race4'} and \code{cat1_group == 'AIAN'},
+#' \code{cat1_group == 'Asian'}, \code{cat1_group == 'Black'}, etc., you should
+#' have \code{secondary_ids = c('hra20_name', 'cat1_varname')} rather than
+#' \code{secondary_ids = c('hra20_name', 'cat1_varname', 'cat1_group')}
+#' @param secondary_exclude An unquoted expression that evaluates to a logical vector
+#' indicating which rows to include in secondary suppression analysis. Use this parameter
+#' to exclude categories that are not mutually exclusive (e.g., overlapping demographic
+#' groups). The expression should use column names from the dataset and evaluate to TRUE
+#' for rows to include. For example: `secondary_exclude = cat1_varname != "race3"` would
+#' exclude all rows where cat1_varname equals "race3" from secondary suppression.
+#' @param flag_only Logical (\code{T}, \code{TRUE}, \code{F}, or \code{FALSE})
+#' indicating whether data to be
+#' suppressed should be flagged without setting estimates to NA
+#' @param numerator_col Character string with the name of the numerator column. Default is "numerator".
+#' @param denominator_col Character string with the name of the denominator column. Default is "denominator".
+#' @param rse_col Character string with the name of the relative standard error column. Default is "rse".
+#' @param columns_to_suppress Character vector of column names to be suppressed (set to NA) when
+#' suppression is flagged. Default includes common result columns.
+#'
+#' @return A data.table with suppression applied to CHI standard columns.
+#'
+#' @export
+#'
+#' @keywords suppression
+#'
+#' @importFrom data.table ':=' data.table is.data.table setDT fsetdiff setorder setorderv copy
+#'
+#' @examples
+#' \dontrun{
+#' set.seed(98104)
+#' dt <- data.table::data.table(
+#'   chi_year = rep(2018, 100),
+#'   result = rnorm(100, .25, .05),
+#'   numerator = round(rnorm(100, 20, 9), 0),
+#'   denominator = round(rnorm(100, 100, 30), 0),
+#'   rse = sample(1:100, 100, replace = TRUE)
+#' )
+#'
+#' # Basic suppression with default parameters
+#' newdt <- chi_suppress_results(ph.data = dt,
+#'                               suppress_range = c(1, 9),
+#'                               secondary = FALSE)
+#'
+#' nrow(dt[numerator %in% 1:9 | denominator %in% 1:9]) # rows needed suppression
+#' nrow(newdt[suppression=='^']) # rows suppressed
+#' nrow(newdt[rse >= 30 | numerator == 0]) # rows needing caution
+#' nrow(newdt[caution=='!']) # rows with caution
+#'
+#' # With secondary suppression
+#' dt$region <- sample(c("North", "South", "East", "Seattle"), 100, replace = TRUE)
+#' dt$category <- sample(c("A", "B", "C"), 100, replace = TRUE)
+#'
+#' newdt2 <- chi_suppress_results(ph.data = dt,
+#'                                suppress_range = c(1, 9),
+#'                                secondary = TRUE,
+#'                                secondary_ids = c("region", "category"))
+#'
+#' nrow(newdt[suppression=='^']) # only primary suppression
+#' nrow(newdt2[suppression=='^']) # with secondary suppression
+#'
+#' # Using custom column names
+#' dt2 <- data.table::data.table(
+#'   chi_year = rep(2018, 100),
+#'   mean = rnorm(100, .25, .05),
+#'   num = round(rnorm(100, 20, 9), 0),
+#'   denom = round(rnorm(100, 100, 30), 0),
+#'   rel_se = sample(1:100, 100, replace = TRUE)
+#' )
+#'
+#' newdt3 <- chi_suppress_results(ph.data = dt2,
+#'                                suppress_range = c(1, 9),
+#'                                numerator_col = "num",
+#'                                denominator_col = "denom",
+#'                                rse_col = "rel_se",
+#'                                columns_to_suppress = c("mean", "num", "denom"))
+#'
+#' nrow(dt2[num %in% 1:9 | denom %in% 1:9]) # rows need suppression
+#' nrow(newdt3[suppression == '^']) # rows suppressed
+#' }
+#'
+
+chi_suppress_results  <- function(ph.data = NULL,
+                                  suppress_range = c(1, 9),
+                                  secondary = FALSE,
+                                  secondary_ids = c("tab", "indicator_key", "cat1", "cat2_group", "year"),
+                                  secondary_exclude,
+                                  flag_only = FALSE,
+                                  numerator_col = "numerator",
+                                  denominator_col = "denominator",
+                                  rse_col = "rse",
+                                  columns_to_suppress = c("result", "lower_bound", "upper_bound", "se", "rse",
+                                                          "numerator", "denominator")){
+
+  ## Global variables used by data.table declared as NULL here to play nice with devtools::check()
+  numerator <- denominator <- suppression <- my.group <- my.order <- my.rowct <-
+    suppressed.group <- my.flag <- rse <- caution <- rows.unsuppressed <-
+    result <- lower_bound <- upper_bound <- se <- NULL
+
+  # ---- Validate 'ph.data' ----
+  if(is.null(ph.data)){
+    stop("\n\U1F6D1 You must specify a dataset (i.e., 'ph.data' must be defined)")
+  }
+
+  if(!data.table::is.data.table(ph.data)){
+    if(is.data.frame(ph.data)){
+      data.table::setDT(ph.data)
+    } else {
+      stop(paste0("\n\U1F6D1 <{ph.data}> must be the name of a data.frame or data.table."))
+    }
+  }
+
+  # ---- Validate 'suppress_range' ----
+  if(is.null(suppress_range)){
+    suppress_range <- c(0, 9)
+  }
+
+  if(!is.null(suppress_range) &
+     (length(suppress_range) != 2 | suppress_range[1] %% 1 != 0 | suppress_range[2] %% 1 != 0 |
+      suppress_range[1] < 0 | suppress_range[2] < 0)){
+    stop("\n\U1F6D1 <suppress_range> must be comprised of two non-negative integers (i.e., 'c(0, 9)'")
+  }
+
+  # ---- Validate 'secondary' ----
+  if(!is.logical(secondary)){
+    stop("\n\U1F6D1 'secondary' must be specified as a logical (i.e., TRUE, T, FALSE, or F)")
+  }
+
+  # ---- Validate 'secondary_ids' ----
+  if(secondary == TRUE & length(setdiff(secondary_ids, names(ph.data))) > 0){
+    stop("\n\U1F6D1 At least one name in 'secondary_ids' is not found among the column names in 'ph.data'")
+  }
+
+  # ---- Validate 'secondary_exclude' ----
+  if(!missing(secondary_exclude)){
+    call = match.call()
+
+    if(is.character(call[['secondary_exclude']])){
+      where = str2lang(call[['secondary_exclude']])
+      warning('\u26A0\ufe0f `secondary_exclude` is a string. It was converted so that it would work, but in the future, this might turn into an error.
+              In the future, please pass unquoted commands that will resolve to a logical')
+    } else {
+      where = copy(call[['secondary_exclude']])
+    }
+
+    e <- substitute(expr = where) # get parse tree expression `where`
+    myfilter <- eval(expr = e, envir = ph.data, enclos = parent.frame()) # evaluate
+    stopifnot('`where` does not resolve to a logical' = is.logical(myfilter))
+    if(nrow(ph.data[myfilter,]) < 1){
+      stop(paste0("\n\U1F6D1 Your 'secondary_exclude' argument filters out all rows of data. Please revise and submit again"))
+    }
+  }
+
+  # ---- Validate 'flag_only' ----
+  if(!is.logical(flag_only)){
+    stop("\n\U1F6D1 'flag_only' must be specified as a logical (i.e., TRUE, T, FALSE, or F)")
+  }
+
+  # ---- Validate 'numerator_col' ----
+  if(!numerator_col %in% names(ph.data)){
+    stop(paste0("\n\U1F6D1 Required column '", numerator_col, "' is missing from the dataset"))
+  }
+
+  # ---- Validate 'denominator_col' ----
+  if(!denominator_col %in% names(ph.data)){
+    warning(paste0("\u26A0\ufe0f Column '", denominator_col, "' is missing from the dataset. Only numerator-based suppression will be applied."))
+  }
+
+  # ---- Validate 'rse_col' ----
+  if(!rse_col %in% names(ph.data)){
+    warning(paste0("\u26A0\ufe0f Column '", rse_col, "', the value of `rse_col`, is missing from the dataset. `caution` flag for reliability will not be generated."))
+  }
+
+  # ---- Validate 'columns_to_suppress' ----
+  missing_cols <- setdiff(columns_to_suppress, names(ph.data))
+  if(length(missing_cols) > 0){
+    warning(paste0("\u26A0\ufe0f The following columns specified in 'columns_to_suppress' are missing from the dataset and will be ignored: ",
+                   paste(missing_cols, collapse = ", ")))
+    columns_to_suppress <- intersect(columns_to_suppress, names(ph.data))
+  }
+
+
+  # ---- Copy ph.data to avoid changing the underlying data.table due to modification by references----
+  temp.dt <- data.table::setDT(copy(ph.data))
+
+  # ---- Check for existing suppression and caution columns ----
+  if("suppression" %in% names(temp.dt)){
+    warning("\u26A0\ufe0f Existing 'suppression' column will be overwritten")
+    temp.dt[, suppression := NULL]  # Remove existing column
+  }
+
+  if("caution" %in% names(temp.dt)){
+    warning("\u26A0\ufe0f Existing 'caution' column will be overwritten")
+    temp.dt[, caution := NULL]  # Remove existing column
+  }
+
+  # ---- Identify primary suppression ----
+  # Check both numerator and denominator for suppression
+  temp.dt[, suppression := NA_character_]
+
+  temp.dt[get(numerator_col) >= suppress_range[1] & get(numerator_col) <= suppress_range[2], suppression := "^"]
+
+  if(denominator_col %in% names(temp.dt)){ # above used warning, not stop, if denominator_col didn't exist
+    temp.dt[get(denominator_col) >= suppress_range[1] & get(denominator_col) <= suppress_range[2], suppression := "^"]
+  }
+
+  # ---- Identify secondary suppression (only based on numerator) ----
+  if(isTRUE(secondary)){
+
+    # Apply secondary_exclude argument
+    if(!missing(secondary_exclude)){
+      myfilter <- eval(expr = e, envir = temp.dt, enclos = parent.frame())
+      temp.dt.aside <- data.table::fsetdiff(temp.dt, temp.dt[myfilter,])
+      temp.dt <- temp.dt[myfilter,]
+    }
+
+    # Create group id for each set of secondary_ids
+    temp.dt[, my.group := .GRP, by = secondary_ids]
+    data.table::setorder(temp.dt, my.group)
+
+    # Identify max number of rows per group defined by secondary_ids
+    temp.dt[, my.rowct := .N, by = secondary_ids]
+
+    # Identify groups that had initial suppression
+    temp.dt[, suppressed.group := FALSE]
+    temp.dt[my.group %in% unique(temp.dt[suppression == "^"]$my.group), suppressed.group := TRUE]
+
+    # Within groups that had suppression, count the number of rows that were not suppressed
+    temp.dt[my.group %in% unique(temp.dt[suppressed.group == TRUE]$my.group) & is.na(suppression),
+            rows.unsuppressed := .N, by = secondary_ids]
+    suppressWarnings(temp.dt[, rows.unsuppressed := max(rows.unsuppressed, na.rm = TRUE), by = my.group])
+
+    # Identify when the number of un-suppressed rows (in groups that had suppression) is max rows minus 1
+    # (these need secondary suppression)
+    temp.dt[is.na(suppression) & rows.unsuppressed == my.rowct - 1,
+            my.flag := "group needs secondary suppression"]
+
+    # Sort table so the smallest numerator per group that needs secondary suppression is first
+    data.table::setorderv(temp.dt, c('my.group', numerator_col), na.last = TRUE)
+
+    # Suppress row with smallest numerator among groups needing secondary suppression
+    if(nrow(temp.dt[my.flag == "group needs secondary suppression"]) > 0){
+      temp.dt[my.flag == "group needs secondary suppression", my.order := 1:.N, by = my.group]
+      temp.dt[my.order == 1, suppression := "^"]
+    }
+
+    # Drop all temporary variables
+    temp.dt[, (intersect(c("my.group", "suppressed.group", "my.rowct", "my.flag", "my.order",
+                           "rows.unsuppressed"), names(temp.dt))) := NULL]
+
+    # Combine back with data filtered out by secondary_exclude
+    if(exists("temp.dt.aside")){
+      temp.dt <- rbind(temp.dt, temp.dt.aside)
+      rm(temp.dt.aside)
+    }
+  }
+
+  # ---- Apply suppression to columns_to_suppress unless flag_only = FALSE ----
+  # Use validated columns_to_suppress
+  if(isFALSE(flag_only)){
+    temp.dt[suppression == "^", (columns_to_suppress) := NA]
+  }
+
+  # ---- Apply caution flag if possible ----
+  if(rse_col %in% names(temp.dt)){
+    temp.dt[get(rse_col) >= 30 | get(numerator_col) == 0, caution := "!"]
+  }
+
+  return(temp.dt)
+
+}
diff --git a/man/chi_suppress_results.Rd b/man/chi_suppress_results.Rd
new file mode 100644
index 0000000..a6c283e
--- /dev/null
+++ b/man/chi_suppress_results.Rd
@@ -0,0 +1,139 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/chi_suppress_results.R
+\name{chi_suppress_results}
+\alias{chi_suppress_results}
+\title{Suppress data due to small numbers and create a caution flag due to reliability}
+\usage{
+chi_suppress_results(
+  ph.data = NULL,
+  suppress_range = c(1, 9),
+  secondary = FALSE,
+  secondary_ids = c("tab", "indicator_key", "cat1", "cat2_group", "year"),
+  secondary_exclude,
+  flag_only = FALSE,
+  numerator_col = "numerator",
+  denominator_col = "denominator",
+  rse_col = "rse",
+  columns_to_suppress = c("result", "lower_bound", "upper_bound", "se", "rse",
+    "numerator", "denominator")
+)
+}
+\arguments{
+\item{ph.data}{A data.table or data.frame. Must contain the data to be suppressed
+with standard metric names,
+i.e., mean, median, sum, rate, lower, upper, se, rse, numerator, denominator,
+proportion}
+
+\item{suppress_range}{Integer vector of length 2. They specify the minimum and
+maximum range for suppression.}
+
+\item{secondary}{Logical (\code{T}, \code{TRUE}, \code{F}, or \code{FALSE})
+indicating whether secondary suppression should be run}
+
+\item{secondary_ids}{Character vector of column names which are used to define
+groups for secondary suppression.
+Note, this should not include the most granular level. For example, if you wanted
+secondary suppression for race/ethnicity within HRAs
+where \code{cat1_varname == 'race4'} and \code{cat1_group == 'AIAN'},
+\code{cat1_group == 'Asian'}, \code{cat1_group == 'Black'}, etc., you should
+have \code{secondary_ids = c('hra20_name', 'cat1_varname')} rather than
+\code{secondary_ids = c('hra20_name', 'cat1_varname', 'cat1_group')}}
+
+\item{secondary_exclude}{An unquoted expression that evaluates to a logical vector
+indicating which rows to include in secondary suppression analysis. Use this parameter
+to exclude categories that are not mutually exclusive (e.g., overlapping demographic
+groups). The expression should use column names from the dataset and evaluate to TRUE
+for rows to include. For example: `secondary_exclude = cat1_varname != "race3"` would
+exclude all rows where cat1_varname equals "race3" from secondary suppression.}
+
+\item{flag_only}{Logical (\code{T}, \code{TRUE}, \code{F}, or \code{FALSE})
+indicating whether data to be
+suppressed should be flagged without setting estimates to NA}
+
+\item{numerator_col}{Character string with the name of the numerator column. Default is "numerator".}
+
+\item{denominator_col}{Character string with the name of the denominator column. Default is "denominator".}
+
+\item{rse_col}{Character string with the name of the relative standard error column. Default is "rse".}
+
+\item{columns_to_suppress}{Character vector of column names to be suppressed (set to NA) when
+suppression is flagged. Default includes common result columns.}
+}
+\value{
+A data.table with suppression applied to CHI standard columns.
+}
+\description{
+This function applies primary data suppression based on the numerator and
+denominator and secondary suppression based on the numerator. It also flags
+low reliability. Suppressed data are noted by \code{suppression = '^'} and
+unreliable data are noted by \code{caution = '!'}.
+}
+\details{
+Data source specific suppression ranges can be found in
+\href{https://kc1.sharepoint.com/teams/DPH-APDEData/Shared\%20Documents/General/data_presentation_algorithm/APDE_SmallNumberUpdate.xlsx}{APDE_SmallNumberUpdate.xlsx}
+
+
+Please review the
+\href{https://kc1.sharepoint.com/teams/DPH-APDEData/Shared\%20Documents/General/data_presentation_algorithm/APDEDataPresentationAlgorithm_2020_Approved.pptx}{APDE}
+and \href{https://www.doh.wa.gov/Portals/1/Documents/1500/SmallNumbers.pdf}{DOH}
+suppression guidelines for details on the logic used in this function.
+
+This function expects data that has already been formatted for CHI. However,
+the user can modify the default parameter settings for use with most tables of
+summary data.
+}
+\examples{
+\dontrun{
+set.seed(98104)
+dt <- data.table::data.table(
+  chi_year = rep(2018, 100),
+  result = rnorm(100, .25, .05),
+  numerator = round(rnorm(100, 20, 9), 0),
+  denominator = round(rnorm(100, 100, 30), 0),
+  rse = sample(1:100, 100, replace = TRUE)
+)
+
+# Basic suppression with default parameters
+newdt <- chi_suppress_results(ph.data = dt,
+                              suppress_range = c(1, 9),
+                              secondary = FALSE)
+
+nrow(dt[numerator \%in\% 1:9 | denominator \%in\% 1:9]) # rows needed suppression
+nrow(newdt[suppression=='^']) # rows suppressed
+nrow(newdt[rse >= 30 | numerator == 0]) # rows needing caution
+nrow(newdt[caution=='!']) # rows with caution
+
+# With secondary suppression
+dt$region <- sample(c("North", "South", "East", "Seattle"), 100, replace = TRUE)
+dt$category <- sample(c("A", "B", "C"), 100, replace = TRUE)
+
+newdt2 <- chi_suppress_results(ph.data = dt,
+                               suppress_range = c(1, 9),
+                               secondary = TRUE,
+                               secondary_ids = c("region", "category"))
+
+nrow(newdt[suppression=='^']) # only primary suppression
+nrow(newdt2[suppression=='^']) # with secondary suppression
+
+# Using custom column names
+dt2 <- data.table::data.table(
+  chi_year = rep(2018, 100),
+  mean = rnorm(100, .25, .05),
+  num = round(rnorm(100, 20, 9), 0),
+  denom = round(rnorm(100, 100, 30), 0),
+  rel_se = sample(1:100, 100, replace = TRUE)
+)
+
+newdt3 <- chi_suppress_results(ph.data = dt2,
+                               suppress_range = c(1, 9),
+                               numerator_col = "num",
+                               denominator_col = "denom",
+                               rse_col = "rel_se",
+                               columns_to_suppress = c("mean", "num", "denom"))
+
+nrow(dt2[num \%in\% 1:9 | denom \%in\% 1:9]) # rows need suppression
+nrow(newdt3[suppression == '^']) # rows suppressed
+}
+
+}
+\keyword{suppression}
diff --git a/tests/testthat/test-chi_suppress_results.R b/tests/testthat/test-chi_suppress_results.R
new file mode 100644
index 0000000..3d1e234
--- /dev/null
+++ b/tests/testthat/test-chi_suppress_results.R
@@ -0,0 +1,224 @@
+library('data.table')
+library('testthat')
+
+# create test data ----
+set.seed(98104)
+dt <- suppressWarnings(data.table::data.table(chi_year = 2022,
+                                              indicator = "home ownership",
+                                              team = sample(as.vector(outer(letters, 1:15, paste0)), rep = T),
+                                              color = c("red", "blue", "yellow", "green"),
+                                              numerator = sample(1:200, 1000, rep = TRUE)))
+dt[, denominator := sample(500:1000, 1000, rep = TRUE)]
+dt[, result := numerator / denominator]
+dt[, se := sqrt(result/100)] # not a real formula!
+dt[, lower_bound := result - (1.96 * se)]
+dt[, upper_bound := result + (1.96 * se)]
+dt[, rse := 100*se / result]
+setorder(dt, indicator, team, color, numerator)
+dt[, counter := 1:.N, c("indicator", "team", "color")]
+dt <- dt[counter == 1][, counter := NULL]
+
+# test defaults ----
+dt1 <- chi_suppress_results(dt)
+test_that('Check that defaults work as expected',{
+  expect_equal(nrow(dt1[suppression=="^"]), nrow(dt[numerator %in% 1:9 | denominator %in% 1:9]))
+  expect_equal(nrow(dt1[suppression=="^"]), nrow(dt1[is.na(result)]))
+  expect_equal(nrow(dt1[suppression=="^"]), nrow(dt1[is.na(se)]))
+  expect_equal(nrow(dt1[suppression=="^"]), nrow(dt1[is.na(lower_bound)]))
+  expect_equal(nrow(dt1[suppression=="^"]), nrow(dt1[is.na(rse)]))
+  expect_equal(nrow(dt1[caution=="!"]), nrow(dt[!(numerator %in% 1:9 | denominator %in% 1:9) & (rse >=30 | numerator == 0)]))
+})
+
+# test suppression range ----
+dt2 <- chi_suppress_results(dt, suppress_range = c(0,10), secondary = FALSE)
+test_that('Check that the suppression_range argument works',{
+  expect_equal(nrow(dt2[suppression=="^"]), nrow(dt[numerator <= 10 | denominator <= 10]))
+  expect_equal(nrow(dt2[suppression=="^"]), nrow(dt2[is.na(result)]))
+  expect_equal(nrow(dt2[suppression=="^"]), nrow(dt2[is.na(se)]))
+  expect_equal(nrow(dt2[suppression=="^"]), nrow(dt2[is.na(lower_bound)]))
+  expect_equal(nrow(dt2[suppression=="^"]), nrow(dt2[is.na(rse)]))
+  expect_equal(nrow(dt2[caution=="!"]), nrow(dt[!(numerator %in% 0:10 | denominator %in% 0:10) & (rse >=30)]))
+})
+
+# test secondary suppression ----
+dt3 <- chi_suppress_results(dt, suppress_range = c(0,10),
+                            secondary = TRUE,
+                            secondary_ids = c("indicator", "team"))
+#ugly manual method to apply secondary suppression for comparison
+sec.suppress3 <- copy(dt2) # build off results from initial / primary suppression
+sec.suppress3[, max.grp.rows := .N, .(indicator, team)] # num of rows per set of secondary_ids
+sec.suppress3[, group := .GRP, by = .(indicator, team)] # create group id for each set of secondary_ids
+supp.ids <- unique(sec.suppress3[suppression=="^"]$group) # get group ids where there was initial suppression
+sec.suppress3[, suppressed.group := F]
+sec.suppress3[group %in% supp.ids, suppressed.group := T] # identify groups with initial suppression in table
+sec.suppress3[group %in% supp.ids & is.na(suppression), unsuppressed := .N, .(indicator, team)] # rows unsuppressed per group
+suppressWarnings(sec.suppress3[, unsuppressed := max(unsuppressed, na.rm = T), .(indicator, team)]) # fill in NA for rows unsuppressed
+sec.suppress3[is.na(suppression) & unsuppressed == max.grp.rows - 1, secondary.suppression := T] # identify groups that need secondary suppression (groups with exactly one suppressed row)
+setorder(sec.suppress3, group, numerator, na.last = T) # sort from smallest to largest numerator by group
+sec.suppress3[secondary.suppression == T, order := 1:.N, group] # identify 1st row (smallest numerator) of each group needing secondary suppression
+sec.suppress3[order==1, suppression := "^"] # mark the specific rows to have secondary suppression
+sec.suppress3[suppression == "^", c("numerator", "denominator", "result", "se", "lower_bound", "upper_bound", "rse", "caution") := NA]
+sec.suppress3[, c("max.grp.rows", "group", "suppressed.group", "unsuppressed", "secondary.suppression", "order") := NULL]
+
+test_that('Check that secondary suppression works',{
+  expect_equal(nrow(dt3[suppression=="^"]), nrow(sec.suppress3[suppression=="^"]))
+  expect_equal(nrow(dt3[suppression=="^"]), nrow(dt3[is.na(result)]))
+  expect_equal(nrow(dt3[suppression=="^"]), nrow(dt3[is.na(se)]))
+  expect_equal(nrow(dt3[suppression=="^"]), nrow(dt3[is.na(lower_bound)]))
+  expect_equal(nrow(dt3[suppression=="^"]), nrow(dt3[is.na(rse)]))
+})
+
+# test NA handling in numerator and demominator ----
+dt_na <- copy(dt)
+dt_na[1:10, numerator := NA]
+dt_na[11:20, denominator := NA]
+dt_na_result <- chi_suppress_results(dt_na)
+test_that('Check NA handling in numerator and denominator', {
+  # Check whether NAs are properly handled and not incorrectly flagged
+  expect_equal(sum(is.na(dt_na_result[1:20]$suppression)), 20)
+})
+
+# test missing numerator column ----
+dt_no_num <- copy(dt)[, numerator := NULL]
+test_that('Check error when numerator column is missing', {
+  expect_error(chi_suppress_results(dt_no_num), "Required column 'numerator' is missing")
+})
+
+# test secondary suppression group ----
+dt_single <- data.table(
+  chi_year = 2022,
+  indicator = c("A", "A", "B"),
+  team = c("team1", "team2", "team1"),
+  numerator = c(5, 15, 20),
+  denominator = c(100, 100, 100)
+)
+dt_single[, result := numerator / denominator]
+dt_single[, rse := 20]
+
+dt_single_result <- suppressWarnings(chi_suppress_results(dt_single,
+                                                          secondary = TRUE,
+                                                          secondary_ids = c("indicator")))
+
+test_that('Check secondary suppression with single row group', {
+  # Group A should be entirely suppressed and
+  # Group B shoudl have one unsuppressed row
+  expect_equal(nrow(dt_single_result[indicator == "A" & suppression=='^']), 2)
+  expect_equal(nrow(dt_single_result[indicator == "B" & is.na(suppression)]), 1)
+})
+
+# test secondary suppression with secondary_exclude ----
+dt4 <- chi_suppress_results(dt, suppress_range = c(0,10),
+                            secondary = TRUE,
+                            secondary_ids = c("indicator", "team"),
+                            secondary_exclude = !team %in% c('a10', 'a11'))
+
+#ugly manual method to apply secondary suppression for testing
+exclusion4 <- copy(dt2)[team %in% c('a10', 'a11')] # partition off part excluded from secondary suppression
+sec.suppress4 <- copy(dt2)[!team %in% c('a10', 'a11')] # build off results from initial / primary suppression
+sec.suppress4[, max.grp.rows := .N, .(indicator, team)] # num of rows per set of secondary_ids
+sec.suppress4[, group := .GRP, by = .(indicator, team)] # create group id for each set of secondary_ids
+supp.ids <- unique(sec.suppress4[suppression=="^"]$group) # get group ids where there was initial suppression
+sec.suppress4[, suppressed.group := F]
+sec.suppress4[group %in% supp.ids, suppressed.group := T] # identify groups with initial suppression in table
+sec.suppress4[group %in% supp.ids & is.na(suppression), unsuppressed := .N, .(indicator, team)] # rows unsuppressed per group
+suppressWarnings(sec.suppress4[, unsuppressed := max(unsuppressed, na.rm = T), .(indicator, team)]) # fill in NA for rows unsuppressed
+sec.suppress4[is.na(suppression) & unsuppressed == max.grp.rows - 1, secondary.suppression := T] # identify groups that need secondary suppression (groups with exactly one suppressed row)
+setorder(sec.suppress4, group, numerator, na.last = T) # sort from smallest to largest numerator by group
+sec.suppress4[secondary.suppression == T, order := 1:.N, group] # identify 1st row (smallest numerator) of each group needing secondary suppression
+sec.suppress4[order==1, suppression := "^"] # mark the specific rows to have secondary suppression
+sec.suppress4[suppression == "^", c("numerator", "denominator", "result", "se", "lower_bound", "upper_bound", "rse", "caution") := NA]
+sec.suppress4[, c("max.grp.rows", "group", "suppressed.group", "unsuppressed", "secondary.suppression", "order") := NULL]
+sec.suppress4 <- rbind(sec.suppress4, exclusion4)
+
+test_that('Check that secondary suppression with exclusion works',{
+  expect_equal(nrow(dt4[suppression=="^"]), nrow(sec.suppress4[suppression=="^"]))
+  expect_equal(nrow(dt4[suppression=="^"]), nrow(dt4[is.na(result)]))
+  expect_equal(nrow(dt4[suppression=="^"]), nrow(dt4[is.na(se)]))
+  expect_equal(nrow(dt4[suppression=="^"]), nrow(dt4[is.na(lower_bound)]))
+  expect_equal(nrow(dt4[suppression=="^"]), nrow(dt4[is.na(rse)]))
+})
+
+# test flag_only ----
+dt5 <- chi_suppress_results(dt, flag_only = TRUE)
+test_that('Check that flag_only works',{
+  expect_equal(nrow(dt5[suppression=="^"]), nrow(dt[numerator <= 9 | denominator <= 9]))
+  expect_equal(0, nrow(dt5[is.na(result)]))
+  expect_equal(0, nrow(dt5[is.na(se)]))
+  expect_equal(0, nrow(dt5[is.na(lower_bound)]))
+  expect_equal(0, nrow(dt5[is.na(rse)]))
+  expect_equal(nrow(dt5[caution=="!"]), nrow(dt[rse >=30 | numerator == 0]))
+})
+
+# test secondary_exclude when character and unquoted expression ----
+test_that('Check that the same results are returned whether or not quoted',{
+  expect_warning(dt6 <- chi_suppress_results(dt, secondary_exclude = "team %like% '^a|^b|^c|^d'"))
+  dt7 <- chi_suppress_results(dt, secondary_exclude = team %like% '^a|^b|^c|^d')
+  expect_identical(dt6, dt7)
+})
+
+# test custom column names ----
+dt_custom <- copy(dt)
+setnames(dt_custom,
+         old = c("numerator", "denominator", "result", "rse"),
+         new = c("num", "denom", "value", "rel_error"))
+
+test_that('Check that custom column names work correctly', {
+  dt_custom_result <- chi_suppress_results(
+    dt_custom,
+    numerator_col = "num",
+    denominator_col = "denom",
+    rse_col = "rel_error",
+    columns_to_suppress = c("value", "se", "lower_bound", "upper_bound", "num", "denom")
+  )
+
+  expect_equal(nrow(dt_custom_result[suppression=="^"]),
+               nrow(dt_custom[num <= 9 | denom <= 9]))
+  expect_equal(nrow(dt_custom_result[caution=="!"]),
+               nrow(dt_custom[rel_error >= 30 | num == 0]))
+})
+
+# test missing columns handling ----
+dt_missing <- copy(dt)[, rse := NULL]
+
+test_that("Check that missing rse column is handled properly", {
+  warnings <- character()
+  dt_missing_result <- withCallingHandlers( # needed because will produce two warnings
+    chi_suppress_results(dt_missing),
+    warning = function(w) {
+      warnings <<- c(warnings, conditionMessage(w))
+      invokeRestart("muffleWarning")  # prevents printing the warning
+    }
+  )
+
+  expect_true(any(grepl("Column 'rse', the value of `rse_col`, is missing", warnings)))
+  expect_true(any(grepl("columns specified in 'columns_to_suppress' are missing", warnings)))
+  expect_false("caution" %in% names(dt_missing_result))
+})
+
+# test non-existent columns in columns_to_suppress ----
+test_that('Check handling of non-existent columns in columns_to_suppress', {
+  expect_warning(
+    dt_cols <- chi_suppress_results(
+      dt,
+      columns_to_suppress = c("result", "non_existent_column", "another_missing")
+    ),
+    "The following columns specified in 'columns_to_suppress' are missing from the dataset"
+  )
+
+  # Should only suppress existing columns
+  expect_true(all(is.na(dt_cols[suppression=="^"]$result)))
+})
+
+# test existing suppression column ----
+dt_exist <- copy(dt)
+dt_exist[, suppression := "old"]
+
+test_that('Check that existing suppression column is overwritten with warning', {
+  expect_warning(
+    dt_exist_result <- chi_suppress_results(dt_exist),
+    "Existing 'suppression' column will be overwritten"
+  )
+
+  # Should have overwritten the old suppression column
+  expect_false(nrow(dt_exist_result[suppression == "old"]) > 0)
+})

From e086cb9b166a10d7683c6eb47e5872ae3b5e957a Mon Sep 17 00:00:00 2001
From: Danny Colombara <dcolombara@kingcounty.gov>
Date: Tue, 13 May 2025 14:59:29 -0700
Subject: [PATCH 2/2] added suppression of comparison_with_kc col

 - added to the columns_to_suppress default
 - no other changes needed because validation of columns_to_suppress
   already notifies of missing columns and will remove from columns_to_suppress
---
 R/chi_suppress_results.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/R/chi_suppress_results.R b/R/chi_suppress_results.R
index e8815c0..be072f2 100644
--- a/R/chi_suppress_results.R
+++ b/R/chi_suppress_results.R
@@ -123,7 +123,7 @@ chi_suppress_results  <- function(ph.data = NULL,
                                   denominator_col = "denominator",
                                   rse_col = "rse",
                                   columns_to_suppress = c("result", "lower_bound", "upper_bound", "se", "rse",
-                                                          "numerator", "denominator")){
+                                                          "numerator", "denominator", "comparison_with_kc")){
 
   ## Global variables used by data.table declared as NULL here to play nice with devtools::check()
   numerator <- denominator <- suppression <- my.group <- my.order <- my.rowct <-