Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ export(chi_generate_instructions_pop)
export(chi_generate_metadata)
export(chi_generate_tro_shell)
export(chi_get_cols)
export(chi_get_proper_pop)
export(chi_get_yaml)
export(chi_qa_tro)
export(chi_update_sql)
Expand All @@ -26,6 +27,7 @@ importFrom(data.table,"%between%")
importFrom(data.table,":=")
importFrom(data.table,.GRP)
importFrom(data.table,.SD)
importFrom(data.table,CJ)
importFrom(data.table,`:=`)
importFrom(data.table,between)
importFrom(data.table,copy)
Expand All @@ -47,6 +49,9 @@ importFrom(future.apply,future_lapply)
importFrom(glue,glue)
importFrom(glue,glue_sql)
importFrom(odbc,odbc)
importFrom(progressr,handlers)
importFrom(progressr,progressor)
importFrom(progressr,with_progress)
importFrom(rads,calc)
importFrom(rads,round2)
importFrom(rads,string_clean)
Expand Down
344 changes: 344 additions & 0 deletions R/chi_count_by_age.R

Large diffs are not rendered by default.

106 changes: 106 additions & 0 deletions R/chi_drop_illogical_ages.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
#' Filter Out Age Values That Don't Match Their Corresponding Age Group Categories
#'
#' @description
#' This function filters a data.table to remove rows with inconsistent age values.
#' It compares the single year age value (specified by \code{agevar}) against age
#' ranges defined in \code{cat1_group} and \code{cat2_group} columns, keeping only
#' those rows where:
#' \itemize{
#' \item The categorical variable is not age-related, OR
#' \item The age value falls within the range specified by the corresponding age group
#' }
#'
#' Age groups are expected to be in formats like "10-17", "<5", or "45+", which
#' the function automatically parses into numeric ranges.
#'
#' @param ph.data A data.table or data.frame containing category and age data to be filtered.
#' @param agevar Character string specifying the name of the age variable column.
#'
#' Default: \code{agevar = 'chi_age'}
#'
#' @return A filtered data.table with only logically consistent age values
#'
#' @details
#' The function interprets special formats in age group strings:
#' \itemize{
#' \item "<1" is treated as age "0-0" (age zero)
#' \item "<N" is converted to "0-N" (ages 0 through N)
#' \item "N+" is converted to "N-120" (ages N through 120)
#' }
#'
#' @seealso
#' \code{\link{chi_count_by_age}} and \code{\link{chi_get_proper_pop}} - This function
#' is typically applied to tables resulting from merging the output of these two functions.
#'
#' @examples
#' \dontrun{
#' # Assuming infmort_combo is a data.frame or data.table with age, cat1,
#' # cat1_group, cat2, cat2_group columns:
#'
#' filtered_data <- chi_drop_illogical_ages(ph.data = infmort_combo, agevar = 'age')
#' }
#'
#' @importFrom data.table copy := fcase between
#' @export
#'
chi_drop_illogical_ages <- function(ph.data, agevar = 'chi_age') {
# Validate ph.data
if (!is.data.frame(ph.data)) {
stop("'ph.data' must be a data.frame or data.table")
}

# Convert to data.table if needed
setDT(ph.data)

# Check if required columns exist
critical_cols <- c(agevar, "cat1", "cat1_group", "cat2", "cat2_group")
missing_cols <- setdiff(critical_cols, names(ph.data))

if (length(missing_cols) > 0) {
stop(paste("The following required columns are missing:",
paste(missing_cols, collapse = ", ")))
}

# Ensure agevar column contains numeric values
if (!is.numeric(ph.data[[agevar]])) {
warning(paste("'", agevar, "' column is not numeric. Attempting to convert...", sep = ""))
tryCatch({
ph.data[, (agevar) := as.numeric(get(agevar))]
}, error = function(e) {
stop(paste("'", agevar, "' could not be converted to numeric. Error: ", e$message, sep = ""))
})
}

# Loop for cat1 and cat2
for (catnum in c("cat1", "cat2")) {
# Get column names for this category
catgroup <- paste0(catnum, "_group")
temp_catgroup <- paste0(catgroup, "_temp")

# Create a standardized version of the age group
ph.data[, (temp_catgroup) := data.table::fcase(
get(catgroup) == '<1', '0-0',

grepl("<", get(catgroup)), gsub("<", "0-", get(catgroup)),

grepl("\\+", get(catgroup)), gsub("\\+", "-120", get(catgroup)),

grepl('-', get(catgroup)), get(catgroup)
)]

# Extract min and max age
ph.data[, "min_age" := as.numeric(gsub("-.*", "", get(temp_catgroup)))]
ph.data[, "max_age" := as.numeric(gsub(".*-", "", get(temp_catgroup)))]

# Keep rows where either:
# 1. The cat is not age-related, OR
# 2. The cat is age-related AND age value is within the min-max range
ph.data <- ph.data[!grepl(' age$|^age$', get(catnum), ignore.case = T) |
data.table::between(get(agevar), min_age, max_age)]

# Clean up temporary columns
ph.data[, c(temp_catgroup, "min_age", "max_age") := NULL]
}

return(ph.data)
}
145 changes: 145 additions & 0 deletions R/chi_generate_instructions_pop.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,145 @@
#' Generate Population Instructions for CHI Analysis
#'
#' @description
#' Creates instructions for \code{\link{chi_get_proper_pop}} based on a table of count
#' data. These instructions configure appropriate demographic groupings,
#' geographic types, and time periods for retrieving population denominators used
#' in CHI rate calculations.
#'
#' @param mycount.data Input data.table produced by \code{\link{chi_count_by_age}},
#' containing the following columns:
#' \itemize{
#' \item indicator_key: indicator_key used by CHI
#' \item year: Year range (e.g., "2019-2021" or single year)
#' \item tab: Visualization tab type
#' \item cat1, cat1_varname, cat1_group: Primary stratification variables
#' \item cat2, cat2_varname, cat2_group: Secondary stratification variables
#' \item chi_age: Single year age
#' \item count: Count of events (births, death, hospitalizations, etc. )
#' }
#' @param povgeo Geographic level for poverty analysis ('blk' or 'zip')
#'
#' @return A data.table containing population processing instructions with columns:
#' \itemize{
#' \item year: Original year range from input
#' \item cat1, cat1_varname: Primary stratification details
#' \item cat2, cat2_varname: Secondary stratification details
#' \item tab: Visualization tab type
#' \item start, stop: Start and end years parsed from the year range
#' \item race_type: Race categorization type ('race', 'race_eth', or 'race_aic')
#' \item geo_type: Geographic level for analysis ('kc', 'hra', 'region', 'blk', 'zip', 'wa')
#' \item group_by1, group_by2: Demographic grouping specifications
#' }
#'
#' @seealso
#' \code{\link{chi_count_by_age}} which generates the count data used as input
#' to this function
#'
#' \code{\link{chi_get_proper_pop}} which uses the output of this function
#'
#' @importFrom data.table copy `:=` setorder tstrsplit
#' @importFrom tools toTitleCase
#' @export
#'
chi_generate_instructions_pop <- function(mycount.data,
povgeo = c('blk', 'zip')) {

# Validation of inputs ----
if (is.null(mycount.data)) {
stop("\n\U1F6D1 mycount.data parameter is required")
}

if (!is.data.frame(mycount.data)) {
stop("\n\U1F6D1 mycount.data must be a data.frame or data.table")
} else {mycount.data <- setDT(copy(mycount.data))}

povgeo <- match.arg(povgeo)

# Initial data preparation ----
# Create a template with only the necessary columns to avoid duplicates
pop.template <- copy(mycount.data)
pop.template <- unique(copy(pop.template)[, list(year, cat1, cat1_varname, cat2, cat2_varname, tab)])

# Process year ranges ----
# Split year ranges (e.g., "2019-2021") into start and stop years
pop.template[, c("start", "stop") := tstrsplit(year, split = '-')]
# For single years, set the stop year equal to the start year
pop.template[is.na(stop), stop := start]

# Set default demographic settings ----
# Default race type includes ethnicity (Hispanic as race)
pop.template[, race_type := 'race_eth']

# Handle maternal data prefixes ----
# Remove "Birthing person's" prefix to standardize maternal data categories
pop.template[grepl("birthing person", cat1, ignore.case = TRUE),
cat1 := tools::toTitleCase(gsub("Birthing person's ", "", cat1))]
pop.template[grepl("birthing person", cat2, ignore.case = TRUE),
cat2 := tools::toTitleCase(gsub("Birthing person's ", "", cat2))]

# Process geographic types and demographic groupings ----
# Define OMB AIC (alone or in combination)
omb_aic <- c("chi_race_aic_aian", "chi_race_aic_asian", "chi_race_aic_black",
"chi_race_aic_his", "chi_race_aic_nhpi", "chi_race_aic_wht")

# Process both primary (cat1) and secondary (cat2) stratification variables
for(catnum in c("cat1", "cat2")) {
catvarname <- paste0(catnum, "_varname")
temp.groupby <- paste0("group_by", gsub('cat', '', catnum))

# Set geographic type based on category
pop.template[get(catnum) == "Cities/neighborhoods", geo_type := "hra"]

# Set race_type and group_by based on race/ethnicity variable
pop.template[get(catvarname) == "race3", c("race_type", temp.groupby) := 'race']
pop.template[get(catvarname) == "race4", c("race_type", temp.groupby) := 'race_eth']
pop.template[get(catvarname) %in% omb_aic, c("race_type", temp.groupby) := 'race_aic']

# Filter out non-standard AIC race/ethnicity categories that don't have population data
pop.template <- pop.template[!(grepl('_aic_', get(catvarname)) &
!get(catvarname) %in% omb_aic)]

# Set demographic grouping based on category label
pop.template[get(catnum) == "Ethnicity", c("race_type", temp.groupby) := 'race_eth']
pop.template[get(catnum) == "Gender", (temp.groupby) := 'genders']
pop.template[get(catnum) %in% c("Race", "Race/ethnicity") & get(catvarname) == 'race4',
(temp.groupby) := 'race_eth']
pop.template[(get(catnum) == "Race" & get(catvarname) == 'race3'),
(temp.groupby) := 'race']

# Set geographic type based on regions
pop.template[get(catnum) == "Regions" & (is.na(geo_type) | geo_type != 'hra'),
geo_type := "region"]
pop.template[get(catnum) == "Big cities", geo_type := "hra"]
pop.template[get(catnum) == "Washington State", geo_type := "wa"]
}

# Handle special geographic cases ----
# Set geographic type for poverty analysis to block group level by default
pop.template[grepl("poverty$", cat1, ignore.case = TRUE) |
grepl("poverty$", cat2, ignore.case = TRUE),
geo_type := "blk"]

# Override with zip code level for poverty analysis if specified in povgeo parameter
if(!is.na(povgeo) && povgeo == 'zip') {
pop.template[grepl("poverty$", cat1, ignore.case = TRUE) |
grepl("poverty$", cat2, ignore.case = TRUE),
geo_type := "zip"]
}

# Special case for combined regions and cities/neighborhoods analysis
pop.template[(cat1 == "Regions" & cat2 == "Cities/neighborhoods") |
(cat2 == "Regions" & cat1 == "Cities/neighborhoods"),
geo_type := "blk"]

# Set missing or default values ----
# Replace NA cat2 with "NA" string for consistent processing
pop.template[is.na(cat2), cat2 := "NA"]

# Set default geographic type as King County when not specified
pop.template[is.na(geo_type), geo_type := 'kc']

# Return final results ----
# Remove duplicate rows to minimize calls to get_population for efficiency
return(unique(pop.template))
}
Loading