From bb2ad7bcd8377265f3fcb424cae9c08ff193ed3f Mon Sep 17 00:00:00 2001 From: Danny Colombara Date: Mon, 10 Feb 2025 09:49:41 -0800 Subject: [PATCH 01/35] fixed DESCRIPTION issues from devtools::check() - drop VignetteBuilder: knitr b/c no vignettes to build - added future and future.apply to imports - replace remote install of rads.data with rads b/c rads is needed too and auto installs rads data - removed self reference to github::PHSKC-APDE/apde.chi.tools - changed role = 'auth' to role = 'aut' - tweaked the Description paragraph, which must be in complete sentences --- DESCRIPTION | 33 +++++++++++++++------------------ 1 file changed, 15 insertions(+), 18 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 54e5361..89c7a46 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -5,15 +5,14 @@ Version: 0.0.1 Authors@R: person( "Ronald","Buie", email = "rbuie@kingcounty.gov", - role = c("auth", "cre")) -Description: This package is for use by epidemiologists within Public Health Seattle - and King County, and aligns to analytics guidelines and practices for Community + role = c("aut", "cre")) +Description: This package provides tools for epidemiologists within Public Health Seattle + and King County. It aligns with analytics guidelines and practices for Community Health Indicators (CHI) as reported by the Assessment, Policy Development, and - Evaluation Unit. The major activities assisted by this packages are - Simplifying the execution of identical CHI analyses across multiple variables - Creating CHI Tableau ready output of the analysis - Performing quality assurance regarding data structure and formatting - Pushing and pulling data to APDE servers + Evaluation Unit. The package simplifies the execution of identical CHI analyses + across multiple variables, creates CHI Tableau-ready output of the analysis, + performs quality assurance regarding data structure and formatting, and + facilitates pushing and pulling data to APDE servers. URL: https://github.com/PHSKC-APDE/apde.chi.tools BugReports: https://github.com/PHSKC-APDE/apde.chi.tools/issues License: MIT + file LICENSE @@ -23,18 +22,17 @@ Imports: data.table (>= 1.14.2), DBI (>= 1.1.0), dtsurvey (>= 0.0.2.0), + future, + future.apply, glue (>= 1.6.1), - keyring (>= 1.2.0), - lubridate (>= 1.6.0), - methods, odbc (>= 1.2.2), - rads.data (>= 1.0.7.5), - stats, - utf8 (>= 1.1.4), + rads, + rads.data, + tidyr, yaml (>= 2.2.1), utils Remotes: - github::PHSKC-APDE/rads.data, github::PHSKC-APDE/dtsurvey, github::PHSKC-APDE/apde.chi.tools + github::PHSKC-APDE/rads, github::PHSKC-APDE/dtsurvey Suggests: httr, knitr, @@ -43,8 +41,7 @@ Suggests: testthat (>= 3.0.0), srvyr, survey (>= 4.0) -VignetteBuilder: knitr Depends: - R (>= 3.1) + R (>= 4.1) Config/testthat/edition: 3 -RoxygenNote: 7.3.1 +RoxygenNote: 7.3.2 From c5a6606930e5483271e29be559a7346659614ff3 Mon Sep 17 00:00:00 2001 From: Danny Colombara Date: Mon, 10 Feb 2025 09:49:53 -0800 Subject: [PATCH 02/35] Fix typo in README --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index d6bb7c0..8dfc0a1 100644 --- a/README.md +++ b/README.md @@ -4,7 +4,7 @@ apde.chi.tools is an R library that provides easy access and centralized versioning of convenience tools for the CHI/CHNA update process. -The are ment to support our epidemiologist in various steps along the pipeline, including: +The are meant to support our epidemiologist in various steps along the pipeline, including: + planning + batch processing From 3987d663857422075ab498389f1fb6116fb6d1a5 Mon Sep 17 00:00:00 2001 From: Danny Colombara Date: Mon, 10 Feb 2025 10:08:07 -0800 Subject: [PATCH 03/35] namespace qualification, importFrom & remove .() - specify namespace for substrRight, calc,chi_cols, compare_estimate, get_population, - added @importFrom for crossing & substrRight - replace .() with list() - added trend.span parameter that was missing - name space updated by devtools::document() due to importFrom --- NAMESPACE | 2 ++ R/chi_generate_metadata.R | 5 ++-- R/chi_process_nontrends.R | 7 ++--- R/chi_process_trends.R | 3 ++- R/proto_chi_calc.R | 15 +++++------ R/proto_chi_count_by_age.R | 6 ++--- R/proto_chi_generate_instructions_pop.R | 2 +- R/proto_chi_get_proper_pop.R | 36 ++++++++++++------------- man/chi_process_trends.Rd | 4 +-- 9 files changed, 41 insertions(+), 39 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index 0a9f307..8f1f3e5 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -23,5 +23,7 @@ importFrom(data.table,setorder) importFrom(data.table,shift) importFrom(glue,glue) importFrom(rads,chi_qa) +importFrom(rads,substrRight) +importFrom(tidyr,crossing) importFrom(utils,write.table) importFrom(yaml,yaml.load) diff --git a/R/chi_generate_metadata.R b/R/chi_generate_metadata.R index 9a0eb2a..2271e61 100644 --- a/R/chi_generate_metadata.R +++ b/R/chi_generate_metadata.R @@ -7,13 +7,14 @@ #' @param est.current current year's tableau ready output with completed estimates #' #' @return table of metadata +#' @importFrom rads substrRight #' @export #' chi_generate_metadata <- function(meta.old = NULL, est.current = NULL){ # get new metadata ---- meta.new <- unique(est.current[tab == "metadata", - .(indicator_key, + list(indicator_key, latest_yearx = as.integer(year), latest_year_resultx = result, run_datex = run_date, @@ -33,7 +34,7 @@ chi_generate_metadata <- function(meta.old = NULL, meta.new[, c("latest_yearx", "latest_year_resultx", "run_datex", "latest_year_countx", "latest_year_kc_popx") := NULL] # update valid_years ---- - meta.new[as.integer(latest_year) > suppressWarnings(as.integer(substrRight(valid_years, 1, 4))), + meta.new[as.integer(latest_year) > suppressWarnings(as.integer(rads::substrRight(valid_years, 1, 4))), valid_years := suppressWarnings(paste(as.integer(substr(valid_years, 1, 4)):as.integer(latest_year), collapse = " "))] # Ensure there are no missing important metadata cells ---- diff --git a/R/chi_process_nontrends.R b/R/chi_process_nontrends.R index f2d099e..38e5e9a 100644 --- a/R/chi_process_nontrends.R +++ b/R/chi_process_nontrends.R @@ -24,6 +24,7 @@ #' @import dtsurvey #' @import future #' @import future.apply +#' @importFrom tidyr crossing chi_process_nontrends <- function(ph.analysis_set = NULL, myset = NULL){ @@ -35,15 +36,15 @@ chi_process_nontrends <- function(ph.analysis_set = NULL, tempy <- rbindlist(lapply(as.list(seq(1, length(subtabs))), FUN = function(subtab){ tempx <- subsets[get(subtabs[subtab]) == 'x', - .(tab = subtabs[subtab], cat1, cat1_varname, cat2 = NA_character_, cat2_varname = NA_character_)] + list(tab = subtabs[subtab], cat1, cat1_varname, cat2 = NA_character_, cat2_varname = NA_character_)] tempx <- setDT(tidyr::crossing(tempx, data.table(indicator_key = sub_indicators))) })) # crosstabs are a bit more complicated sub_crosstabs = setDT(tidyr::crossing( - unique(subsets[crosstabs == 'x', .(cat1, cat1_varname)]), - unique(subsets[crosstabs == 'x', .(cat2 = cat1, cat2_varname = cat1_varname)]) )) + unique(subsets[crosstabs == 'x', list(cat1, cat1_varname)]), + unique(subsets[crosstabs == 'x', list(cat2 = cat1, cat2_varname = cat1_varname)]) )) sub_crosstabs <- sub_crosstabs[cat1 == 'King County' | cat1_varname != cat2_varname] sub_crosstabs <- sub_crosstabs[!(cat1_varname == 'race3' & cat2_varname %in% c('race3', 'race4'))] # do not want race x race sub_crosstabs <- sub_crosstabs[!(cat2_varname == 'race3' & cat1_varname %in% c('race3', 'race4'))] # do not want race x race diff --git a/R/chi_process_trends.R b/R/chi_process_trends.R index 4c40b25..3c92dca 100644 --- a/R/chi_process_trends.R +++ b/R/chi_process_trends.R @@ -1,7 +1,7 @@ #' CHI Generate Trend Years #' #' @param indicator_key chi indicator key variable -#' @param span number of years to include in a trend year estimate +#' @param trend.span the number of years to be included in a single trend period #' @param end.year last year of a trend year time series #' @param trend.periods number of periods to calculate #' @@ -15,6 +15,7 @@ #' @returns TRO with rows for each indicator key and span of years within the provided time frame #' @keywords CHI, Tableau, Production #' @import dtsurvey +#' @importFrom tidyr crossing chi_process_trends <- function(indicator_key = NULL, trend.span = NULL, diff --git a/R/proto_chi_calc.R b/R/proto_chi_calc.R index 7a0ff15..19dc5ff 100644 --- a/R/proto_chi_calc.R +++ b/R/proto_chi_calc.R @@ -7,9 +7,6 @@ chi_calc <- function(ph.data = NULL, suppress_high = 9, source_name = 'blahblah', source_date = NULL){ - - - # Error if ph.instructions has no data ---- if(nrow(ph.instructions) == 0){ stop("\n\U0001f47f the table ph.instructions does not have any rows.") @@ -67,13 +64,13 @@ chi_calc <- function(ph.data = NULL, # use calc()---- if(rate == FALSE){ # standard proportion analysis if(any(grepl('wastate', tempbv))){ - tempest <- calc(ph.data = ph.data, + tempest <- rads::calc(ph.data = ph.data, what = ph.instructions[X][['indicator_key']], where = chi_year >= tempstart & chi_year <= tempend, by = tempbv, metrics = c('mean', 'numerator', 'denominator', 'rse')) } else { - tempest <- calc(ph.data = ph.data, + tempest <- rads::calc(ph.data = ph.data, what = ph.instructions[X][['indicator_key']], where = chi_year >= tempstart & chi_year <= tempend & chi_geo_kc == 'King County', by = tempbv, @@ -82,14 +79,14 @@ chi_calc <- function(ph.data = NULL, } if(rate == TRUE){ if(any(grepl('wastate', tempbv))){ - tempest <- calc(ph.data = ph.data, + tempest <- rads::calc(ph.data = ph.data, what = ph.instructions[X][['indicator_key']], where = chi_year >= tempstart & chi_year <= tempend, by = tempbv, metrics = c('rate', 'numerator', 'denominator', 'rse'), per = rate_per) } else { - tempest <- calc(ph.data = ph.data, + tempest <- rads::calc(ph.data = ph.data, what = ph.instructions[X][['indicator_key']], where = chi_year >= tempstart & chi_year <= tempend & chi_geo_kc == 'King County', by = tempbv, @@ -197,7 +194,7 @@ chi_calc <- function(ph.data = NULL, tempCHIest[, data_source := source_name] tempCHIest[, time_trends := NA] # NA because no longer calculated - tempCHIest <- compare_estimate(mydt = tempCHIest, + tempCHIest <- rads::compare_estimate(mydt = tempCHIest, id_vars = c("indicator_key", "year"), key_where = cat1_group == "King County" & tab != "crosstabs", new_col = "comparison_with_kc", @@ -215,7 +212,7 @@ chi_calc <- function(ph.data = NULL, # Keep and order standard CHI columns ---- - tempCHIest <- tempCHIest[, c(chi_cols()[]), with = F] + tempCHIest <- tempCHIest[, c(rads::chi_cols()[]), with = F] tempCHIest <- tempCHIest[, cat1 := factor(cat1, levels = c("King County", sort(setdiff(unique(tempCHIest$cat1), "King County"))) )] tempCHIest <- tempCHIest[, tab := factor(tab, levels = c(c("_kingcounty","demgroups", "trends"), sort(setdiff(unique(tempCHIest$tab), c("_kingcounty","demgroups", "trends")))) )] diff --git a/R/proto_chi_count_by_age.R b/R/proto_chi_count_by_age.R index b5a6788..10a8e5f 100644 --- a/R/proto_chi_count_by_age.R +++ b/R/proto_chi_count_by_age.R @@ -52,13 +52,13 @@ chi_count_by_age <- function(ph.data = NULL, # use calc---- if(any(grepl('wastate', tempbv))){ - tempcount <- calc(ph.data = ph.data, + tempcount <- rads::calc(ph.data = ph.data, what = ph.instructions[X][['indicator_key']], where = chi_year >= tempstart & chi_year <= tempend, by = tempbv, metrics = c('numerator')) } else { - tempcount <- calc(ph.data = ph.data, + tempcount <- rads::calc(ph.data = ph.data, what = ph.instructions[X][['indicator_key']], where = chi_year >= tempstart & chi_year <= tempend & chi_geo_kc == 'King County', by = tempbv, @@ -103,7 +103,7 @@ chi_count_by_age <- function(ph.data = NULL, year := ph.instructions[X][['end']]] # order output---- - tempcount <- tempcount[, .(indicator_key, year, tab, cat1, cat1_varname, cat1_group, cat2, cat2_varname, cat2_group, chi_age, count)] + tempcount <- tempcount[, list(indicator_key, year, tab, cat1, cat1_varname, cat1_group, cat2, cat2_varname, cat2_group, chi_age, count)] setorder(tempcount, cat1_group, cat2_group, chi_age) } diff --git a/R/proto_chi_generate_instructions_pop.R b/R/proto_chi_generate_instructions_pop.R index 424d893..71d79bb 100644 --- a/R/proto_chi_generate_instructions_pop.R +++ b/R/proto_chi_generate_instructions_pop.R @@ -1,6 +1,6 @@ chi_generate_instructions_pop <- function(mycount.data, povgeo = NA){ pop.template <- copy(mycount.data) - pop.template <- unique(copy(pop.template)[, .(year, cat1, cat1_varname, cat2, cat2_varname, tab)]) + pop.template <- unique(copy(pop.template)[, list(year, cat1, cat1_varname, cat2, cat2_varname, tab)]) pop.template[, c("start", "stop") := tstrsplit(year, split = '-') ] pop.template[is.na(stop), stop := start] # need to have an end date even if it is just one year diff --git a/R/proto_chi_get_proper_pop.R b/R/proto_chi_get_proper_pop.R index 28cc82e..dff8d76 100644 --- a/R/proto_chi_get_proper_pop.R +++ b/R/proto_chi_get_proper_pop.R @@ -26,7 +26,7 @@ chi_get_proper_pop <- function(pop.template = NULL, pop.genders = NULL, pop.ages # use rads::get_population ---- if(is.na(pop.template[X, geo_type])){ - tempy <- get_population(group_by = groupy, + tempy <- rads::get_population(group_by = groupy, race_type = pop.template[X, race_type], years = pop.template[X, start]:pop.template[X, stop], genders = gendery, @@ -34,7 +34,7 @@ chi_get_proper_pop <- function(pop.template = NULL, pop.genders = NULL, pop.ages round = F) } if(!is.na(pop.template[X, geo_type])){ - tempy <- get_population(group_by = groupy, + tempy <- rads::get_population(group_by = groupy, geo_type = pop.template[X, geo_type], race_type = pop.template[X, race_type], years = pop.template[X, start]:pop.template[X, stop], @@ -85,21 +85,21 @@ chi_get_proper_pop <- function(pop.template = NULL, pop.genders = NULL, pop.ages # HRAS ---- if(tempy[1, geo_type] == 'blk' & tempy[1, get(catnum)] == 'Cities/neighborhoods'){ - temp.xwalk <- rads.data::spatial_block20_to_hra20_to_region20[, .(geo_id = GEOID20, hra20_name)] + temp.xwalk <- rads.data::spatial_block20_to_hra20_to_region20[, list(geo_id = GEOID20, hra20_name)] tempy <- merge(tempy, temp.xwalk, by = "geo_id", all.x = T, all.y = F) tempy[, paste0(catnum, "_group") := hra20_name] } # Regions ---- if(tempy[1, geo_type] == 'blk' & tempy[1, get(catnum)] == 'Regions'){ - temp.xwalk <- rads.data::spatial_block20_to_hra20_to_region20[, .(geo_id = GEOID20, region_name)] + temp.xwalk <- rads.data::spatial_block20_to_hra20_to_region20[, list(geo_id = GEOID20, region_name)] tempy <- merge(tempy, temp.xwalk, by = 'geo_id', all.x = T, all.y = F) tempy[, paste0(catnum, "_group") := region_name] } if(tempy[1, geo_type] == 'hra' & tempy[1, get(catnum)] == 'Regions'){ - temp.xwalk <- rads.data::spatial_hra20_to_region20[, .(geo_id = hra20_name, region_name)] + temp.xwalk <- rads.data::spatial_hra20_to_region20[, list(geo_id = hra20_name, region_name)] tempy <- merge(tempy, temp.xwalk, by = 'geo_id', all.x = T, all.y = F) tempy[, paste0(catnum, "_group") := region_name] @@ -108,11 +108,11 @@ chi_get_proper_pop <- function(pop.template = NULL, pop.genders = NULL, pop.ages if(tempy[1, geo_type] == 'zip' & tempy[1, get(catnum)] == 'Regions'){ zip_2_region <- rads.data::spatial_zip_to_hra20_pop zip_2_region <- merge(zip_2_region, - rads.data::spatial_hra20_to_region20[, .(hra20_name, region = region_name)], + rads.data::spatial_hra20_to_region20[, list(hra20_name, region = region_name)], by = 'hra20_name', all = T) - zip_2_region <- zip_2_region[, .(s2t_fraction = sum(s2t_fraction)), # collapse fractions down to region level - .(geo_id = as.character(source_id), region)] + zip_2_region <- zip_2_region[, list(s2t_fraction = sum(s2t_fraction)), # collapse fractions down to region level + list(geo_id = as.character(source_id), region)] tempy <- merge(tempy, zip_2_region, by = "geo_id", all.x = T, all.y = F, allow.cartesian = T) tempy[, pop := pop * s2t_fraction] # calculate weighted pop @@ -122,13 +122,13 @@ chi_get_proper_pop <- function(pop.template = NULL, pop.genders = NULL, pop.ages # Big Cities ---- if(tempy[1, get(catnum)] == 'Big cities'){ if(tempy[1, geo_type] == 'blk'){ - blk20_hra20 <- rads.data::spatial_block20_to_hra20_to_region20[, .(geo_id = GEOID20, hra20_name)] + blk20_hra20 <- rads.data::spatial_block20_to_hra20_to_region20[, list(geo_id = GEOID20, hra20_name)] tempy <- merge(tempy, blk20_hra20, by = "geo_id", all.x = T, all.y = F) - hra20_bigcity <- rads.data::spatial_hra20_to_bigcities[, .(hra20_name, bigcity)] + hra20_bigcity <- rads.data::spatial_hra20_to_bigcities[, list(hra20_name, bigcity)] tempy <- merge(tempy, hra20_bigcity, by = 'hra20_name', all.x = T, all.y = F) } if(tempy[1, geo_type] == 'hra'){ - hra20_bigcity <- rads.data::spatial_hra20_to_bigcities[, .(hra20_name, bigcity)] + hra20_bigcity <- rads.data::spatial_hra20_to_bigcities[, list(hra20_name, bigcity)] tempy <- merge(tempy, hra20_bigcity, by.x = 'geo_id', by.y = 'hra20_name', all.x = T, all.y = F) } tempy[, paste0(catnum, "_group") := bigcity] @@ -159,7 +159,7 @@ chi_get_proper_pop <- function(pop.template = NULL, pop.genders = NULL, pop.ages if(tempy[1, geo_type] == 'blk' & grepl("poverty$", tempy[1, get(catnum)], ignore.case = T)){ tempy[, geo_tract2020 := substr(geo_id, 1, 11)] # have blocks (15 char), so keep first 11 for tracts tempy <- merge(tempy, - rads.data::misc_poverty_groups[geo_type=='Tract'][, .(geo_tract2020 = geo_id, pov200grp)], + rads.data::misc_poverty_groups[geo_type=='Tract'][, list(geo_tract2020 = geo_id, pov200grp)], by = "geo_tract2020", all.x = T, all.y = F) @@ -167,7 +167,7 @@ chi_get_proper_pop <- function(pop.template = NULL, pop.genders = NULL, pop.ages } if( tempy[1, geo_type] == 'zip' & grepl("poverty$", tempy[1, get(catnum)], ignore.case = T)){ tempy <- merge(tempy, - rads.data::misc_poverty_groups[geo_type=='ZCTA'][, .(geo_id, pov200grp)], + rads.data::misc_poverty_groups[geo_type=='ZCTA'][, list(geo_id, pov200grp)], by = 'geo_id', all.x = T, all.y = F) @@ -183,7 +183,7 @@ chi_get_proper_pop <- function(pop.template = NULL, pop.genders = NULL, pop.ages tempy <- tempy[!(cat2 != 'NA' & (cat2_group == 'NA') | is.na(cat2_group))] # did not yet switch back to true NA at this point # collapse to one row per demographic combination and keep minimum needed columns ---- - tempy <- tempy[, .(pop = sum(pop)), .(chi_age = age, year, cat1, cat1_varname, cat1_group, cat2, cat2_varname, cat2_group)] + tempy <- tempy[, list(pop = sum(pop)), list(chi_age = age, year, cat1, cat1_varname, cat1_group, cat2, cat2_varname, cat2_group)] # ensure each demographic has rows for all relevant ages & only relevant ages ---- if(tempy[1]$cat1 == "Age"){ @@ -205,7 +205,7 @@ chi_get_proper_pop <- function(pop.template = NULL, pop.genders = NULL, pop.ages breaks = c(-1, 4, 9, 14, 17), labels = c('0-4', '5-9', '10-14', '15-17'))]} - temp.demog <- setDT(tidyr::crossing(unique(tempy[, .(year = as.character(year), cat2, cat2_varname, cat2_group)]), + temp.demog <- setDT(tidyr::crossing(unique(tempy[, list(year = as.character(year), cat2, cat2_varname, cat2_group)]), tempage)) } @@ -228,15 +228,15 @@ chi_get_proper_pop <- function(pop.template = NULL, pop.genders = NULL, pop.ages breaks = c(-1, 4, 9, 14, 17), labels = c('0-4', '5-9', '10-14', '15-17'))]} - temp.demog <- setDT(tidyr::crossing(unique(tempy[, .(year = as.character(year), cat1, cat1_varname, cat1_group)]), + temp.demog <- setDT(tidyr::crossing(unique(tempy[, list(year = as.character(year), cat1, cat1_varname, cat1_group)]), tempage)) } if(!"Age" %in% unique(c(tempy$cat1, tempy$cat2))){ # all combinations of cat1 x cat2 temp.demog <- setDT(tidyr::crossing( - unique(tempy[, .(cat1, cat1_varname, cat1_group)]), - unique(tempy[, .(cat2, cat2_varname, cat2_group)]) + unique(tempy[, list(cat1, cat1_varname, cat1_group)]), + unique(tempy[, list(cat2, cat2_varname, cat2_group)]) )) # all combination of cat table with year and age temp.demog <- setDT(tidyr::crossing( diff --git a/man/chi_process_trends.Rd b/man/chi_process_trends.Rd index 6f88d86..e98bf65 100644 --- a/man/chi_process_trends.Rd +++ b/man/chi_process_trends.Rd @@ -14,11 +14,11 @@ chi_process_trends( \arguments{ \item{indicator_key}{chi indicator key variable} +\item{trend.span}{the number of years to be included in a single trend period} + \item{end.year}{last year of a trend year time series} \item{trend.periods}{number of periods to calculate} - -\item{span}{number of years to include in a trend year estimate} } \value{ TRO with rows for each indicator key and span of years within the provided time frame From 5bee92ad177512457f473c40efc1cd0eb73224e5 Mon Sep 17 00:00:00 2001 From: Danny Colombara Date: Mon, 10 Feb 2025 10:09:28 -0800 Subject: [PATCH 04/35] Declare global variable for devtools::check() - to avoid warnings about missing visible bindings for global variables --- R/globals.R | 15 +++++++++++++++ 1 file changed, 15 insertions(+) create mode 100644 R/globals.R diff --git a/R/globals.R b/R/globals.R new file mode 100644 index 0000000..4b08dde --- /dev/null +++ b/R/globals.R @@ -0,0 +1,15 @@ +# Declare global variables for devtools::check() +utils::globalVariables(c( + "varname", "group", "keepme", "reference", "chi_year", "tempstart", + "tempend", "chi_geo_kc", "cat1", "cat1_varname", "cat2", "cat2_varname", + "chi_age", "numerator", "count", "cat2_group", "indicator_key", "tab", + "round2", "lower_bound", "cat1_group", "cat2", "tempbv", "run_date", + "chi", "data_source", "time_trends", "suppression", "rse", "caution", + "comparison_with_kc_sig", "AgeMin", "AgeMax", "start", "race_type", "geo_type", + "denominator", "latest_yearx", "latest_year", "latest_year_resultx", "latest_year_result", + "run_datex", "latest_year_countx", "latest_year_count", "latest_year_kc_popx", + "latest_year_kc_pop", "valid_years", "end", "group_by1", "group_by2", "geo_id", + "gender", "race_eth", "race", "race_aic", "GEOID20", "hra20_name", "region_name", + "s2t_fraction", "source_id", "region", "pop", "bigcity", "geo_tract2020", "pov200grp", + "crosstabs", "vebrose", "span", "age", 'year.span' +)) From a45ac758e5077d300b4cbc1e1cc31caf2a5821cf Mon Sep 17 00:00:00 2001 From: Danny Colombara Date: Mon, 10 Feb 2025 11:14:37 -0800 Subject: [PATCH 05/35] Commented out test for chi_generate_trend_years - chi_generate_trend_years does not exist for testing --- tests/testthat/test-chi_process_trends.R | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/testthat/test-chi_process_trends.R b/tests/testthat/test-chi_process_trends.R index 02e177e..f36c36a 100644 --- a/tests/testthat/test-chi_process_trends.R +++ b/tests/testthat/test-chi_process_trends.R @@ -1,3 +1,4 @@ test_that("calculates trends", { - DT <- chi_generate_trend_years(indicator_key = c("test1", "test2"),span = 3,begin.year = 2009,final.year = 2023) + # chi_generate_trend_years does not exist + # DT <- chi_generate_trend_years(indicator_key = c("test1", "test2"),span = 3,begin.year = 2009,final.year = 2023) }) From a76d60aba64851ea6bddc0348e17f065022baf66 Mon Sep 17 00:00:00 2001 From: Danny Colombara Date: Mon, 10 Feb 2025 11:18:45 -0800 Subject: [PATCH 06/35] Fixes for chi_generate_tro_shell test failure - added year.span as explicit argument to chi_generate_tro_shell() and test - replaced `span` (undefined var) with `trend.span) in chi_process_trends --- R/chi_generate_tro_shell.R | 2 ++ R/chi_process_trends.R | 2 +- man/chi_generate_tro_shell.Rd | 3 +++ tests/testthat/test-chi_generate_tro_shell.R | 9 +++++++-- 4 files changed, 13 insertions(+), 3 deletions(-) diff --git a/R/chi_generate_tro_shell.R b/R/chi_generate_tro_shell.R index 53d8969..9333930 100644 --- a/R/chi_generate_tro_shell.R +++ b/R/chi_generate_tro_shell.R @@ -20,6 +20,7 @@ #' @param ph.analysis_set name of data.table to parse #' @param start.year the earliest year to be used for estimates #' @param end.year the latest year to be used for aggregate estimates (note, the earliest year for trends estimates is calculated from from the span and number of periods) +#' @param year.span the number of years to be included in a single non-trend period #' @param trend.span the number of years to be included in a single trend period #' @param trend.periods the number of periods to be included in a trend #' @returns data table with a single row for each calculation to be performed in generating Tableau Ready Output for CHI reporting @@ -36,6 +37,7 @@ chi_generate_tro_shell <- function(ph.analysis_set, start.year, end.year, + year.span = NULL, trend.span = NULL, trend.periods = NULL){ #parameterization checks diff --git a/R/chi_process_trends.R b/R/chi_process_trends.R index 3c92dca..ea8edef 100644 --- a/R/chi_process_trends.R +++ b/R/chi_process_trends.R @@ -21,7 +21,7 @@ chi_process_trends <- function(indicator_key = NULL, trend.span = NULL, end.year = NULL, trend.periods = NULL){ - last.start <- end.year-(span-1) + last.start <- end.year-(trend.span-1) all.start.years <- last.start:(last.start-(trend.periods-1)) all.end.years <- end.year:(end.year-(trend.periods-1)) spandt <- data.table(end = all.end.years, start = all.start.years) diff --git a/man/chi_generate_tro_shell.Rd b/man/chi_generate_tro_shell.Rd index af3b94e..954091e 100644 --- a/man/chi_generate_tro_shell.Rd +++ b/man/chi_generate_tro_shell.Rd @@ -8,6 +8,7 @@ chi_generate_tro_shell( ph.analysis_set, start.year, end.year, + year.span = NULL, trend.span = NULL, trend.periods = NULL ) @@ -19,6 +20,8 @@ chi_generate_tro_shell( \item{end.year}{the latest year to be used for aggregate estimates (note, the earliest year for trends estimates is calculated from from the span and number of periods)} +\item{year.span}{the number of years to be included in a single non-trend period} + \item{trend.span}{the number of years to be included in a single trend period} \item{trend.periods}{the number of periods to be included in a trend} diff --git a/tests/testthat/test-chi_generate_tro_shell.R b/tests/testthat/test-chi_generate_tro_shell.R index b6402a0..28dfb97 100644 --- a/tests/testthat/test-chi_generate_tro_shell.R +++ b/tests/testthat/test-chi_generate_tro_shell.R @@ -11,7 +11,7 @@ test_that("injest template format", { trends <- c("x",NA,NA, "x","x", "x", "x", "x",NA,"x", "x") set_indicator_keys <- c(rep("key1, key2, key3",7),rep("key4, key5",4)) - template <- data.table(set, + template <- data.table::data.table(set, cat1, cat1_varname, kingCounty, @@ -20,7 +20,12 @@ test_that("injest template format", { crosstabs, trends, set_indicator_keys) - DT <- chi_generate_tro_shell(ph.analysis_set, 2021, 2022, 3, 5) + DT <- chi_generate_tro_shell(ph.analysis_set = template, + start.year = 2021, + end.year = 2022, + year.span = 5, + trend.span = 3, + trend.periods = 5) expect_equal(nrow(DT), 264) expect_equal(length(unique(DT$indicator_key)),5) expect_equal(DT[tab == "trends",][1]$end - DT[tab == "trends",][1]$start,2) From c4895032f3f734a62589952b8895757692e34536 Mon Sep 17 00:00:00 2001 From: Danny Colombara Date: Mon, 10 Feb 2025 11:22:02 -0800 Subject: [PATCH 07/35] Refactor funct. rm global environment assignments - Replaced `assign()` calls with local variable assignments within `chi_calc` function to improve encapsulation and avoid unintended side effects in the global environment - Simplified code for better readability and maintainability - Removed unnecessary cleanup of global variables (`rm()`) - Eliminating globals within package functions is important for ensuring predictability, avoiding conflicts, and maintaining clean, self-contained functions --- R/proto_chi_calc.R | 11 ++++------- R/proto_chi_count_by_age.R | 8 ++++---- 2 files changed, 8 insertions(+), 11 deletions(-) diff --git a/R/proto_chi_calc.R b/R/proto_chi_calc.R index 19dc5ff..135e288 100644 --- a/R/proto_chi_calc.R +++ b/R/proto_chi_calc.R @@ -56,10 +56,10 @@ chi_calc <- function(ph.data = NULL, if(length(tempbv2) == 0){tempbv2 = NA} tempbv <- setdiff(c(tempbv1, tempbv2), c(NA)) - # send constants to global environment so it can be used by the calc() function below - assign("tempbv", tempbv, envir = .GlobalEnv) - assign("tempend", ph.instructions[X][['end']], envir = .GlobalEnv) - assign("tempstart", ph.instructions[X][['start']], envir = .GlobalEnv) + # create variables of interest used in calc function below + tempbv <- tempbv + tempend <- ph.instructions[X][['end']] + tempstart <- ph.instructions[X][['start']] # use calc()---- if(rate == FALSE){ # standard proportion analysis @@ -183,9 +183,6 @@ chi_calc <- function(ph.data = NULL, cat2 := 'Ethnicity'] } - # drop temporary vars that were sent to global environment for calc() ---- - rm(tempbv, tempend, tempstart, envir = .GlobalEnv) - # Create additional necessary CHI columns ---- tempCHIest[, source_date := as.Date(source_date)] tempCHIest[, run_date := as.Date(Sys.Date(), "%Y%m%d")] diff --git a/R/proto_chi_count_by_age.R b/R/proto_chi_count_by_age.R index 10a8e5f..330432e 100644 --- a/R/proto_chi_count_by_age.R +++ b/R/proto_chi_count_by_age.R @@ -45,10 +45,10 @@ chi_count_by_age <- function(ph.data = NULL, tempbv <- setdiff(c(tempbv1, tempbv2), c(NA)) tempbv <- c(tempbv, "chi_age") - # send constants to global environment so it can be used by the calc() function below - assign("tempbv", tempbv, envir = .GlobalEnv) - assign("tempend", ph.instructions[X][['end']], envir = .GlobalEnv) - assign("tempstart", ph.instructions[X][['start']], envir = .GlobalEnv) + # create variables of interest used in calc function below + tempbv <- tempbv + tempend <- ph.instructions[X][['end']] + tempstart <- ph.instructions[X][['start']] # use calc---- if(any(grepl('wastate', tempbv))){ From 4f54e6cb7ea6d927436c140a3006820b57c76765 Mon Sep 17 00:00:00 2001 From: Danny Colombara Date: Mon, 10 Feb 2025 16:10:14 -0800 Subject: [PATCH 08/35] FIX #6 Plus other improvements in chi_qa_tro - validate_yaml_data >> tsql_validate_field_types - updated filepath to CHI standards - update roxygen2 header - added chi_get_yaml & chi_get_cols to streamline code - updated references to chi_qa >> chi_qa_tro --- R/chi_get_yaml.R | 39 ++++++++++++++++++++++++++++ R/chi_qa_tro.R | 58 +++++++++++++++++++---------------------- inst/ref/chi_qa.yaml | 62 ++++++++++++++++++++++++++++++++++++++++++++ man/chi_get_yaml.Rd | 19 ++++++++++++++ man/chi_qa_tro.Rd | 2 +- 5 files changed, 148 insertions(+), 32 deletions(-) create mode 100644 R/chi_get_yaml.R create mode 100644 inst/ref/chi_qa.yaml create mode 100644 man/chi_get_yaml.Rd diff --git a/R/chi_get_yaml.R b/R/chi_get_yaml.R new file mode 100644 index 0000000..7d7c813 --- /dev/null +++ b/R/chi_get_yaml.R @@ -0,0 +1,39 @@ +#' Get CHI variable column names +#' +#' Returns a character vector of column names defined in the CHI YAML reference file. +#' This helper function provides easy access to the standardized CHI variable names. +#' +#' @return A character vector of column names +#' @importFrom yaml read_yaml +#' @export +#' +#' @examples +#' cols <- chi_get_cols() +chi_get_cols <- function() { + chi.yaml.filepath <- system.file("ref", "chi_qa.yaml", package = "apde.chi.tools") + if (chi.yaml.filepath == "") { + stop("Could not find reference file chi_qa.yaml") + } + chi.yaml <- yaml::read_yaml(chi.yaml.filepath) + return(names(chi.yaml$vars)) +} + +#' Get CHI YAML configuration +#' +#' Returns the complete CHI YAML configuration as a list. +#' This helper function provides access to the full YAML configuration +#' which contains variable definitions and other CHI-related settings. +#' +#' @return A list containing the parsed YAML configuration +#' @importFrom yaml read_yaml +#' @export +#' +#' @examples +#' config <- chi_get_yaml() +chi_get_yaml <- function() { + chi.yaml.filepath <- system.file("ref", "chi_qa.yaml", package = "apde.chi.tools") + if (chi.yaml.filepath == "") { + stop("Could not find reference file chi_qa.yaml") + } + return(yaml::read_yaml(chi.yaml.filepath)) +} diff --git a/R/chi_qa_tro.R b/R/chi_qa_tro.R index 2c6a6b0..de39f60 100644 --- a/R/chi_qa_tro.R +++ b/R/chi_qa_tro.R @@ -9,7 +9,7 @@ #' specifications. If set to verbose mode (default) will report diagnostic information in #' The user interface as warnings (if failed) and messages (for progress and pass). In any #' Case will return 1 or 0 for pass or fail respectively. -#' The CHI Tableau Ready Output (TRO) standards can be reviewed [here]("//phshare01/epe_share/WORK/CHI Visualizations/Tableau Ready Output Format_v2.xlsx") +#' The CHI Tableau Ready Output (TRO) standards can be reviewed [here](https://kc1.sharepoint.com/:x:/r/teams/DPH-CommunityHealthIndicators/_layouts/15/Doc.aspx?sourcedoc=%7BBED2F507-B834-4D28-8658-C5724F64C001%7D&file=CHI-Standards-TableauReady%20Output.xlsx&action=default&mobileredirect=true&CID=975A2706-130E-46AB-AEB2-DF919FDEC185&wdLOR=cDF77DB7A-3774-430F-92C7-EFAB34CEB35F) #' #' @param chi_est Name of a data.table or data.frame containing the prepared data to be pushed to SQL #' @param chi_meta Name of a data.table or data.frame containing the metadata to be pushed to SQL @@ -21,11 +21,10 @@ #' #' @keywords CHI, Tableau, Production #' -#' @importFrom data.table is.data.table ':=' setDT setDF data.table setorder copy setnames setorder dcast setcolorder fread shift "%between%" +#' @importFrom data.table setDT copy setcolorder is.data.table %between% #' @importFrom glue glue -#' @importFrom utils write.table -#' @importFrom yaml yaml.load -#' @importFrom rads chi_qa +#' @importFrom yaml read_yaml +#' @importFrom rads tsql_validate_field_types #' #' @examples #' @@ -71,9 +70,6 @@ chi_qa_tro <- function(chi_est, chi_meta, acs = F, ignore_trends = T, verbose = chi_est <- data.table::setDT(copy(chi_est)) chi_meta <- data.table::setDT(copy(chi_meta)) - ## Load reference YAML ---- - chi.yaml <- yaml::yaml.load(httr::GET(url = "https://raw.githubusercontent.com/PHSKC-APDE/rads/main/ref/chi_qa.yaml", httr::authenticate(Sys.getenv("GITHUB_TOKEN"), ""))) - ## Check columns ---- if(verbose) { message("Checking that all column names are unique") @@ -94,7 +90,7 @@ chi_qa_tro <- function(chi_est, chi_meta, acs = F, ignore_trends = T, verbose = if(verbose){ message("Checking that all necessary columns exist") } - missing.var <- setdiff(names(chi.yaml$vars), names(chi_est)) + missing.var <- setdiff(chi_get_cols(), names(chi_est)) if(length(missing.var) > 0){ status <- 0 if(verbose) { @@ -103,7 +99,7 @@ chi_qa_tro <- function(chi_est, chi_meta, acs = F, ignore_trends = T, verbose = } } - missing.var <- setdiff(names(chi.yaml$metadata), names(chi_meta)) + missing.var <- setdiff(names(unlist(chi_get_yaml()$metadata)), names(chi_meta)) if(length(missing.var) > 0){ status <- 0 if(verbose){ @@ -113,7 +109,7 @@ chi_qa_tro <- function(chi_est, chi_meta, acs = F, ignore_trends = T, verbose = } # Confirm that there are no additional variables ---- - extra.var <- setdiff(names(chi_est), names(chi.yaml$vars)) + extra.var <- setdiff(names(chi_est), chi_get_cols()) if(length(extra.var) > 0){ status <- 0 if(verbose){ @@ -123,7 +119,7 @@ chi_qa_tro <- function(chi_est, chi_meta, acs = F, ignore_trends = T, verbose = } } - extra.var <- setdiff(names(chi_meta), names(chi.yaml$meta)) + extra.var <- setdiff(names(chi_meta), names(unlist(chi_get_yaml()$metadata))) if(length(extra.var) > 0){ status <- 0 if(verbose){ @@ -139,11 +135,11 @@ chi_qa_tro <- function(chi_est, chi_meta, acs = F, ignore_trends = T, verbose = if(verbose){ message("Validating CHI estimates: ") } - rads::validate_yaml_data(DF = chi_est, YML = chi.yaml, VARS = "vars") # check CHI estimate table + rads::tsql_validate_field_types(ph.data = chi_est, field_types = unlist(chi_get_yaml()$vars)) # check CHI estimate table if(verbose){ message(paste("", "Validating CHI metadata: ", sep = "\n")) } - rads::validate_yaml_data(DF = chi_meta, YML = chi.yaml, VARS = "metadata") # check CHI metadata table + rads::tsql_validate_field_types(ph.data = chi_meta, field_types = unlist(chi_get_yaml()$metadata)) # check CHI metadata table if(verbose) message(paste("", "", sep = "\n")) if(verbose){ @@ -161,7 +157,7 @@ chi_qa_tro <- function(chi_est, chi_meta, acs = F, ignore_trends = T, verbose = warning(paste0("\U00026A0 Warning: '", mycol, "' is missing in at least one row of the CHI data.")) } } - for(mycol in setdiff(names(chi.yaml$metadata), c("latest_year_kc_pop", "latest_year_count"))){ + for(mycol in setdiff(names(unlist(chi_get_yaml()$metadata)), c("latest_year_kc_pop", "latest_year_count"))){ if(nrow(chi_meta[is.na(get(mycol))]) > 0){ status <- 0 warning(paste0("'", mycol, "' is missing in at least one row but is a critical identifier column in CHI metadata. \n", "Fix the error and run this QA script again.")) @@ -182,8 +178,8 @@ chi_qa_tro <- function(chi_est, chi_meta, acs = F, ignore_trends = T, verbose = } ## Set the columns in standard order ---- - setcolorder(chi_est, names(chi.yaml$vars)) - setcolorder(chi_meta, names(chi.yaml$meta)) + setcolorder(chi_est, chi_get_cols()) + setcolorder(chi_meta, names(unlist(chi_get_yaml()$metadata))) #Basic logic checks for estimates @@ -195,7 +191,7 @@ chi_qa_tro <- function(chi_est, chi_meta, acs = F, ignore_trends = T, verbose = status <- 0 if(verbose){ warning(glue::glue("There is at least one row where is.infinite({var}) == T. - Please fix this problem before rerunning chi_qa() (e.g., by setting it equal to NA) + Please fix this problem before rerunning chi_qa_tro() (e.g., by setting it equal to NA) You can view the problematic data by typing something like: View(chi_est[is.infinite({var}), ])")) } } @@ -220,7 +216,7 @@ chi_qa_tro <- function(chi_est, chi_meta, acs = F, ignore_trends = T, verbose = status <- 0 if(verbose){ warning("There is at least one row where the upper_bound is less than the lower_bound. - Please fix this error prior to re-running the chi_qa() function. + Please fix this error prior to re-running the chi_qa_tro() function. You can view the problematic data by typing something like: View(chi_est[upper_bound < lower_bound, ])") } } @@ -232,7 +228,7 @@ chi_qa_tro <- function(chi_est, chi_meta, acs = F, ignore_trends = T, verbose = status <- 0 if(verbose){ warning("There is at least one row where the result is not less than or equal to the upper_bound. - Please fix this error prior to rerunning the chi_qa() function. + Please fix this error prior to rerunning the chi_qa_tro() function. You can view the problematic data by typing something like: View(chi_est[!(result <= upper_bound)])") } } @@ -244,7 +240,7 @@ chi_qa_tro <- function(chi_est, chi_meta, acs = F, ignore_trends = T, verbose = status <- 0 if(verbose){ warning("There is at least one row where the result is not greater than or equal to the lower_bound. - Please fix this error prior to rerunning the chi_qa() function. + Please fix this error prior to rerunning the chi_qa_tro() function. You can view the problematic data by typing something like: View(chi_est[!(result >= lower_bound)])") } } @@ -255,7 +251,7 @@ chi_qa_tro <- function(chi_est, chi_meta, acs = F, ignore_trends = T, verbose = status <- 0 if(verbose){ warning("There is at least one row where the lower_bound is less than zero (i.e., it is negative). - Please fix this error prior to rerunning the chi_qa() function. + Please fix this error prior to rerunning the chi_qa_tro() function. You can view the problematic data by typing something like: View(chi_est[lower_bound < 0])") } } @@ -281,7 +277,7 @@ chi_qa_tro <- function(chi_est, chi_meta, acs = F, ignore_trends = T, verbose = if(verbose){ warning("All RSEs are within the range (0, 1]. CHI Tableau Ready standards necessitate that these proportions be mutliplied by 100. I.e., .12345 >> 12.345 - Please fix this error prior to rerunning the chi_qa() function.") + Please fix this error prior to rerunning the chi_qa_tro() function.") } } @@ -292,7 +288,7 @@ chi_qa_tro <- function(chi_est, chi_meta, acs = F, ignore_trends = T, verbose = status <- 0 if(verbose){ warning("There is at least one row where a caution flag ('!') is not used and rse >= 30% or is.na(rse) == T. - Please fix this error prior to rerunning the chi_qa() function. + Please fix this error prior to rerunning the chi_qa_tro() function. You can view the problematic data by typing something like: View(chi_est[(rse>=30 | is.na(rse)) & (caution != '!' | is.na(caution))])") } } @@ -337,7 +333,7 @@ chi_qa_tro <- function(chi_est, chi_meta, acs = F, ignore_trends = T, verbose = } } - if(vebrose){ + if(verbose){ message("Checking that se is rounded to four digits") } if(sum(chi_est$se != round2(chi_est$se, 4), na.rm = T) != 0) { @@ -355,7 +351,7 @@ chi_qa_tro <- function(chi_est, chi_meta, acs = F, ignore_trends = T, verbose = status <- 0 if(verbose){ warning(glue::glue("There is at least one row where '{var}' is missing. - Please fill in the missing value before rerunning chi_qa()")) + Please fill in the missing value before rerunning chi_qa_tro()")) } } } @@ -364,7 +360,7 @@ chi_qa_tro <- function(chi_est, chi_meta, acs = F, ignore_trends = T, verbose = status <- 0 if(verbose){ warning(glue::glue("There is at least one row where 'cat1_varname' is missing. - Please fill in the missing value before rerunning chi_qa()")) + Please fill in the missing value before rerunning chi_qa_tro()")) } } } @@ -373,7 +369,7 @@ chi_qa_tro <- function(chi_est, chi_meta, acs = F, ignore_trends = T, verbose = status <- 0 if(verbose){ warning(glue::glue("There is at least one row where tab=='crosstabs' & where '{var}' is missing. - Please fill in the missing value before rerunning chi_qa()")) + Please fill in the missing value before rerunning chi_qa_tro()")) } } } @@ -382,7 +378,7 @@ chi_qa_tro <- function(chi_est, chi_meta, acs = F, ignore_trends = T, verbose = status <- 0 if(verbose){ warning(glue::glue("There is at least one row where 'cat2_varname' is missing. - Please fill in the missing value before rerunning chi_qa()")) + Please fill in the missing value before rerunning chi_qa_tro()")) } } } @@ -395,7 +391,7 @@ chi_qa_tro <- function(chi_est, chi_meta, acs = F, ignore_trends = T, verbose = status <- 0 if(verbose){ warning(glue::glue("There is at least one row that is not suppressed & where '{var}' is missing. - Please fill in the missing value before rerunning chi_qa()")) + Please fill in the missing value before rerunning chi_qa_tro()")) } } } @@ -408,7 +404,7 @@ chi_qa_tro <- function(chi_est, chi_meta, acs = F, ignore_trends = T, verbose = status <- 0 if(verbose) { warning(glue::glue("There is at least one row where tab=='trends' & where 'time_trends' is missing. - Please fill in the missing value before rerunning chi_qa()")) + Please fill in the missing value before rerunning chi_qa_tro()")) } } } diff --git a/inst/ref/chi_qa.yaml b/inst/ref/chi_qa.yaml new file mode 100644 index 0000000..cb0f5a7 --- /dev/null +++ b/inst/ref/chi_qa.yaml @@ -0,0 +1,62 @@ +schema: APDE_WIP +table: +years: + +vars: + data_source: varchar(255) + indicator_key: varchar(255) + tab: varchar(255) + year: varchar(255) + cat1: varchar(255) + cat1_group: nvarchar(2000) + cat1_varname: varchar(255) + cat2: varchar(255) + cat2_group: nvarchar(2000) + cat2_varname: varchar(255) + result: float + lower_bound: float + upper_bound: float + se: float + rse: float + comparison_with_kc: varchar(255) + time_trends: varchar(255) + significance: varchar(255) + caution: varchar(255) + suppression: varchar(255) + numerator: float + denominator: float + chi: tinyint + source_date: date + run_date: date + +metadata: + data_source: varchar(255) + indicator_key: varchar(255) + result_type: varchar(255) + valence: varchar(255) + latest_year: int + latest_year_result: float + latest_year_kc_pop: int + latest_year_count: int + map_type: varchar(255) + unit: varchar(255) + valid_years: varchar(255) + chi: tinyint + run_date: date + +toc: + data_source: varchar(255) + indicator_key: varchar(255) + topic_chi: varchar(255) + topic_bsk: varchar(255) + topic_bskhs: varchar(255) + title_toc: varchar(255) + description: varchar(1000) + url_backup: varchar(500) + latest_year_bk: varchar(255) + latest_rate_bk: varchar(255) + toc_bsk: float + toc_bskhs: float + toc_chi: float + toc_cc: float + diff --git a/man/chi_get_yaml.Rd b/man/chi_get_yaml.Rd new file mode 100644 index 0000000..9da9f3a --- /dev/null +++ b/man/chi_get_yaml.Rd @@ -0,0 +1,19 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/chi_get_yaml.R +\name{chi_get_yaml} +\alias{chi_get_yaml} +\title{Get CHI YAML configuration} +\usage{ +chi_get_yaml() +} +\value{ +A list containing the parsed YAML configuration +} +\description{ +Returns the complete CHI YAML configuration as a list. +This helper function provides access to the full YAML configuration +which contains variable definitions and other CHI-related settings. +} +\examples{ +config <- chi_get_yaml() +} diff --git a/man/chi_qa_tro.Rd b/man/chi_qa_tro.Rd index b4618bb..517c0a0 100644 --- a/man/chi_qa_tro.Rd +++ b/man/chi_qa_tro.Rd @@ -28,7 +28,7 @@ This function tests if the structure of the data matches CHI Tableau Ready Outpu specifications. If set to verbose mode (default) will report diagnostic information in The user interface as warnings (if failed) and messages (for progress and pass). In any Case will return 1 or 0 for pass or fail respectively. -The CHI Tableau Ready Output (TRO) standards can be reviewed [here]("//phshare01/epe_share/WORK/CHI Visualizations/Tableau Ready Output Format_v2.xlsx") +The CHI Tableau Ready Output (TRO) standards can be reviewed [here](https://kc1.sharepoint.com/:x:/r/teams/DPH-CommunityHealthIndicators/_layouts/15/Doc.aspx?sourcedoc=%7BBED2F507-B834-4D28-8658-C5724F64C001%7D&file=CHI-Standards-TableauReady%20Output.xlsx&action=default&mobileredirect=true&CID=975A2706-130E-46AB-AEB2-DF919FDEC185&wdLOR=cDF77DB7A-3774-430F-92C7-EFAB34CEB35F) } \examples{ From a9ab1091cf152d1b2a08b1fc31b9b69123988599 Mon Sep 17 00:00:00 2001 From: Danny Colombara Date: Mon, 10 Feb 2025 16:11:29 -0800 Subject: [PATCH 09/35] roxygen2 headers to functions without them - added to four functions --- R/proto_chi_calc.R | 34 +++++++++++++++---- R/proto_chi_count_by_age.R | 18 ++++++++++ R/proto_chi_drop_illogical_ages.R | 31 ++++++++++++----- R/proto_chi_generate_instructions_pop.R | 15 +++++++++ man/chi_calc.Rd | 44 +++++++++++++++++++++++++ man/chi_count_by_age.Rd | 23 +++++++++++++ man/chi_drop_illogical_ages.Rd | 21 ++++++++++++ man/chi_generate_instructions_pop.Rd | 20 +++++++++++ 8 files changed, 192 insertions(+), 14 deletions(-) create mode 100644 man/chi_calc.Rd create mode 100644 man/chi_count_by_age.Rd create mode 100644 man/chi_drop_illogical_ages.Rd create mode 100644 man/chi_generate_instructions_pop.Rd diff --git a/R/proto_chi_calc.R b/R/proto_chi_calc.R index 135e288..915d7bb 100644 --- a/R/proto_chi_calc.R +++ b/R/proto_chi_calc.R @@ -1,3 +1,25 @@ +#' Calculate CHI Estimates +#' +#' @description +#' Generates CHI estimates from input data according to provided instructions. +#' Handles both proportions and rates, with options for suppression of small numbers. +#' +#' @param ph.data Input data.frame or data.table containing CHI data +#' @param ph.instructions data.frame or data.table containing calculation instructions +#' @param rate Logical; if TRUE calculates rates, if FALSE calculates proportions +#' @param rate_per Rate multiplier when rate=TRUE (e.g., 100000 for per 100,000) +#' @param small_num_suppress Logical; if TRUE suppresses small numbers +#' @param suppress_low Lower bound for suppression +#' @param suppress_high Upper bound for suppression +#' @param source_name Name of data source +#' @param source_date Date of data source +#' +#' @return A data.table containing CHI estimates with standard columns +#' @importFrom data.table setDT copy setnames := setorder set .SD data.table +#' @importFrom rads calc compare_estimate suppress chi_cols +#' @export +#' +#' chi_calc <- function(ph.data = NULL, ph.instructions = NULL, rate = F, @@ -35,7 +57,7 @@ chi_calc <- function(ph.data = NULL, stdbyvars[group %in% c("Hispanic", 'Non-Hispanic') & varname == 'race3', varname := 'race3_hispanic'] # necessary because race3 & Hispanic must be two distinct variables in raw data phbyvars <- rbindlist(lapply( X=as.list(neededbyvars), - FUN = function(X){data.table(varname = X, group = setdiff(unique(ph.data[[X]]), NA), ph.data = 1)})) + FUN = function(X){data.table::data.table(varname = X, group = setdiff(unique(ph.data[[X]]), NA), ph.data = 1)})) compbyvars <- merge(stdbyvars, phbyvars, by = c('varname', 'group'), all = T) if(nrow(compbyvars[is.na(reference)| is.na(ph.data)]) > 0){ print(compbyvars[is.na(reference)| is.na(ph.data)]) @@ -93,7 +115,7 @@ chi_calc <- function(ph.data = NULL, metrics = c('rate', 'numerator', 'denominator', 'rse'), per = rate_per) } - setnames(tempest, gsub("^rate", "mean", names(tempest))) + data.table::setnames(tempest, gsub("^rate", "mean", names(tempest))) } # add on CHI standard columns that are from ph.instructions (in order of standard results output)---- @@ -104,14 +126,14 @@ chi_calc <- function(ph.data = NULL, tempest[ph.instructions[X][['end']] == ph.instructions[X][['start']], year := ph.instructions[X][['end']]] tempest[, cat1 := ph.instructions[X][['cat1']]] - setnames(tempest, ph.instructions[X][['cat1_varname']], 'cat1_group') + data.table::setnames(tempest, ph.instructions[X][['cat1_varname']], 'cat1_group') tempest[, cat1_varname := ph.instructions[X][['cat1_varname']]] tempest[, cat2 := ph.instructions[X][['cat2']]] if(!is.na(tempbv2) & tempbv1 != tempbv2){ - setnames(tempest, ph.instructions[X][['cat2_varname']], 'cat2_group')} else{ + data.table::setnames(tempest, ph.instructions[X][['cat2_varname']], 'cat2_group')} else{ tempest[, cat2_group := NA] } tempest[, cat2_varname := ph.instructions[X][['cat2_varname']]] - setnames(tempest, + data.table::setnames(tempest, c("mean", "mean_lower", "mean_upper", "mean_se"), c("result", "lower_bound", "upper_bound", "se")) } @@ -196,7 +218,7 @@ chi_calc <- function(ph.data = NULL, key_where = cat1_group == "King County" & tab != "crosstabs", new_col = "comparison_with_kc", tidy = TRUE) - setnames(tempCHIest, "comparison_with_kc_sig", "significance") + data.table::setnames(tempCHIest, "comparison_with_kc_sig", "significance") if(small_num_suppress == TRUE){ tempCHIest <- rads::suppress(sup_data = tempCHIest, diff --git a/R/proto_chi_count_by_age.R b/R/proto_chi_count_by_age.R index 330432e..67c1bcb 100644 --- a/R/proto_chi_count_by_age.R +++ b/R/proto_chi_count_by_age.R @@ -1,3 +1,21 @@ +#' Generate Age-Specific Counts for CHI Data +#' +#' @description +#' Creates a detailed breakdown of counts by age for CHI data analysis, most +#' often for age standardization.Processes data according to provided +#' instructions and handles demographic groupings. +#' +#' @param ph.data Input data frame or data table containing CHI data +#' @param ph.instructions Data frame or data table containing calculation instructions +#' @param source_date Date of data source +#' +#' @return A data.table containing age-specific counts with standard CHI groupings +#' @importFrom data.table setDT rbindlist setnames := setorder data.table +#' @importFrom rads calc +#' @importFrom future.apply future_lapply +#' @importFrom tidyr crossing +#' @export +#' chi_count_by_age <- function(ph.data = NULL, ph.instructions = NULL, source_date = NULL){ diff --git a/R/proto_chi_drop_illogical_ages.R b/R/proto_chi_drop_illogical_ages.R index f54430a..84a10cf 100644 --- a/R/proto_chi_drop_illogical_ages.R +++ b/R/proto_chi_drop_illogical_ages.R @@ -1,12 +1,27 @@ -chi_drop_illogical_ages <- function(DTx, agevar = 'chi_age'){ - DTx = copy(DTx) +#' Drop Illogical Age Combinations from CHI Data +#' +#' @description +#' Removes age combinations that don't make logical sense based on category +#' groupings. For example, removes cases where the age falls outside the range +#' specified by age group categories. +#' +#' @param ph.data Input data.table to process +#' @param agevar Name of the age variable (defaults to 'chi_age') +#' +#' @return A data.table with illogical age combinations removed +#' @importFrom data.table copy := fcase between +#' @export +#' + +chi_drop_illogical_ages <- function(ph.data, agevar = 'chi_age'){ + ph.data = copy(ph.data) for(CatNum in c("cat1", "cat2")){ - DTx[, paste0(CatNum, '_group_temp') := fcase(get(paste0(CatNum, "_group")) == '<1', '0-0', # <1 is special! + ph.data[, paste0(CatNum, '_group_temp') := fcase(get(paste0(CatNum, "_group")) == '<1', '0-0', # <1 is special! get(CatNum) %in% c("Age", "Birthing person's age"), gsub("<", "0-", gsub("\\+", "-120", get(paste0(CatNum, '_group')))))] - DTx[, AgeMin := gsub("-.*", "", get(paste0(CatNum, '_group_temp')))] - DTx[, AgeMax := gsub(".*-", "", get(paste0(CatNum, '_group_temp')))] - DTx <- DTx[!get(CatNum) %in% c("Age", "Birthing person's age") | between(get(agevar), AgeMin, AgeMax)] - DTx[, c("AgeMin", paste0(CatNum, '_group_temp'), "AgeMax") := NULL] + ph.data[, AgeMin := gsub("-.*", "", get(paste0(CatNum, '_group_temp')))] + ph.data[, AgeMax := gsub(".*-", "", get(paste0(CatNum, '_group_temp')))] + ph.data <- ph.data[!get(CatNum) %in% c("Age", "Birthing person's age") | between(get(agevar), AgeMin, AgeMax)] + ph.data[, c("AgeMin", paste0(CatNum, '_group_temp'), "AgeMax") := NULL] } - return(DTx) + return(ph.data) } diff --git a/R/proto_chi_generate_instructions_pop.R b/R/proto_chi_generate_instructions_pop.R index 71d79bb..e679bc7 100644 --- a/R/proto_chi_generate_instructions_pop.R +++ b/R/proto_chi_generate_instructions_pop.R @@ -1,3 +1,18 @@ +#' Generate Population Instructions for CHI Analysis +#' +#' @description +#' Creates a instructions for rads::get_population() based on count data +#' specifications. Handles various geographic types and demographic groupings. +#' +#' @param mycount.data Input data.table containing count data specifications +#' @param povgeo Geographic level for poverty analysis (NA or 'zip') +#' +#' @return A data.table containing population processing instructions +#' @importFrom data.table copy `:=` setorder tstrsplit +#' @importFrom tools toTitleCase +#' @export +#' + chi_generate_instructions_pop <- function(mycount.data, povgeo = NA){ pop.template <- copy(mycount.data) pop.template <- unique(copy(pop.template)[, list(year, cat1, cat1_varname, cat2, cat2_varname, tab)]) diff --git a/man/chi_calc.Rd b/man/chi_calc.Rd new file mode 100644 index 0000000..4a65b7e --- /dev/null +++ b/man/chi_calc.Rd @@ -0,0 +1,44 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/proto_chi_calc.R +\name{chi_calc} +\alias{chi_calc} +\title{Calculate CHI Estimates} +\usage{ +chi_calc( + ph.data = NULL, + ph.instructions = NULL, + rate = F, + rate_per = NULL, + small_num_suppress = T, + suppress_low = 0, + suppress_high = 9, + source_name = "blahblah", + source_date = NULL +) +} +\arguments{ +\item{ph.data}{Input data.frame or data.table containing CHI data} + +\item{ph.instructions}{data.frame or data.table containing calculation instructions} + +\item{rate}{Logical; if TRUE calculates rates, if FALSE calculates proportions} + +\item{rate_per}{Rate multiplier when rate=TRUE (e.g., 100000 for per 100,000)} + +\item{small_num_suppress}{Logical; if TRUE suppresses small numbers} + +\item{suppress_low}{Lower bound for suppression} + +\item{suppress_high}{Upper bound for suppression} + +\item{source_name}{Name of data source} + +\item{source_date}{Date of data source} +} +\value{ +A data.table containing CHI estimates with standard columns +} +\description{ +Generates CHI estimates from input data according to provided instructions. +Handles both proportions and rates, with options for suppression of small numbers. +} diff --git a/man/chi_count_by_age.Rd b/man/chi_count_by_age.Rd new file mode 100644 index 0000000..d5af4ac --- /dev/null +++ b/man/chi_count_by_age.Rd @@ -0,0 +1,23 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/proto_chi_count_by_age.R +\name{chi_count_by_age} +\alias{chi_count_by_age} +\title{Generate Age-Specific Counts for CHI Data} +\usage{ +chi_count_by_age(ph.data = NULL, ph.instructions = NULL, source_date = NULL) +} +\arguments{ +\item{ph.data}{Input data frame or data table containing CHI data} + +\item{ph.instructions}{Data frame or data table containing calculation instructions} + +\item{source_date}{Date of data source} +} +\value{ +A data.table containing age-specific counts with standard CHI groupings +} +\description{ +Creates a detailed breakdown of counts by age for CHI data analysis, most +often for age standardization.Processes data according to provided +instructions and handles demographic groupings. +} diff --git a/man/chi_drop_illogical_ages.Rd b/man/chi_drop_illogical_ages.Rd new file mode 100644 index 0000000..4d83d44 --- /dev/null +++ b/man/chi_drop_illogical_ages.Rd @@ -0,0 +1,21 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/proto_chi_drop_illogical_ages.R +\name{chi_drop_illogical_ages} +\alias{chi_drop_illogical_ages} +\title{Drop Illogical Age Combinations from CHI Data} +\usage{ +chi_drop_illogical_ages(ph.data, agevar = "chi_age") +} +\arguments{ +\item{ph.data}{Input data.table to process} + +\item{agevar}{Name of the age variable (defaults to 'chi_age')} +} +\value{ +A data.table with illogical age combinations removed +} +\description{ +Removes age combinations that don't make logical sense based on category +groupings. For example, removes cases where the age falls outside the range +specified by age group categories. +} diff --git a/man/chi_generate_instructions_pop.Rd b/man/chi_generate_instructions_pop.Rd new file mode 100644 index 0000000..90c6afa --- /dev/null +++ b/man/chi_generate_instructions_pop.Rd @@ -0,0 +1,20 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/proto_chi_generate_instructions_pop.R +\name{chi_generate_instructions_pop} +\alias{chi_generate_instructions_pop} +\title{Generate Population Instructions for CHI Analysis} +\usage{ +chi_generate_instructions_pop(mycount.data, povgeo = NA) +} +\arguments{ +\item{mycount.data}{Input data.table containing count data specifications} + +\item{povgeo}{Geographic level for poverty analysis (NA or 'zip')} +} +\value{ +A data.table containing population processing instructions +} +\description{ +Creates a instructions for rads::get_population() based on count data +specifications. Handles various geographic types and demographic groupings. +} From a0e7c3815e7e8bb505e3cce8805e770cdbf6a047 Mon Sep 17 00:00:00 2001 From: Danny Colombara Date: Mon, 10 Feb 2025 16:14:09 -0800 Subject: [PATCH 10/35] replace sql_clean with string_clean sql_clean deprecated in rads --- R/chi_process_nontrends.R | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/R/chi_process_nontrends.R b/R/chi_process_nontrends.R index 38e5e9a..947016f 100644 --- a/R/chi_process_nontrends.R +++ b/R/chi_process_nontrends.R @@ -19,12 +19,18 @@ #' #' @param ph.analysis_set name of data.table to parse #' @param myset chosen set number from table +#' #' @returns data table with a single row for each calculation to be performed in generating Tableau Ready Output for CHI reporting +#' #' @keywords CHI, Tableau, Production +#' +#' @importFrom data.table setDT rbindlist setcolorder `:=` #' @import dtsurvey #' @import future #' @import future.apply #' @importFrom tidyr crossing +#' @importFrom rads string_clean +#' @export chi_process_nontrends <- function(ph.analysis_set = NULL, myset = NULL){ @@ -63,7 +69,7 @@ chi_process_nontrends <- function(ph.analysis_set = NULL, tempy[cat1 %in% c('Ethnicity', "Birthing person's ethnicity") & cat1_varname == 'race3', cat1_varname := 'race3_hispanic'] tempy[cat2 %in% c('Ethnicity', "Birthing person's ethnicity") & cat2_varname == 'race3', cat2_varname := 'race3_hispanic'] setcolorder(tempy, 'indicator_key') - rads::sql_clean(tempy) + rads::string_clean(tempy) tempy <- tempy[!(tab == 'crosstabs' & cat1 == 'King County' & cat2 != 'King County')] # only legit xtab for KC is KC by itself tempy[tab == 'crosstabs' & cat2 == 'King County', `:=` (cat2 = 'Overall', cat2_varname = 'overall')] From 9efdf1109d594d1990976f9fb50a7ca2098b8d2b Mon Sep 17 00:00:00 2001 From: Danny Colombara Date: Mon, 10 Feb 2025 16:16:33 -0800 Subject: [PATCH 11/35] update / improve roxygen2 headers --- R/chi_generate_metadata.R | 1 + R/chi_generate_tro_shell.R | 3 +-- R/chi_process_trends.R | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/R/chi_generate_metadata.R b/R/chi_generate_metadata.R index 2271e61..5096c0b 100644 --- a/R/chi_generate_metadata.R +++ b/R/chi_generate_metadata.R @@ -7,6 +7,7 @@ #' @param est.current current year's tableau ready output with completed estimates #' #' @return table of metadata +#' @importFrom data.table setDT copy := #' @importFrom rads substrRight #' @export #' diff --git a/R/chi_generate_tro_shell.R b/R/chi_generate_tro_shell.R index 9333930..7814c0e 100644 --- a/R/chi_generate_tro_shell.R +++ b/R/chi_generate_tro_shell.R @@ -27,8 +27,7 @@ #' #' @keywords CHI, Tableau, Production #' -#' @import data.table -#' @import dtsurvey +#' @importFrom data.table rbindlist #' @import future #' @import future.apply #' diff --git a/R/chi_process_trends.R b/R/chi_process_trends.R index ea8edef..b2d35e8 100644 --- a/R/chi_process_trends.R +++ b/R/chi_process_trends.R @@ -14,7 +14,7 @@ #' #' @returns TRO with rows for each indicator key and span of years within the provided time frame #' @keywords CHI, Tableau, Production -#' @import dtsurvey +#' @importFrom data.table setDT setorder #' @importFrom tidyr crossing chi_process_trends <- function(indicator_key = NULL, From ec18afcdf046c5e3570a3604294d07457a66c44e Mon Sep 17 00:00:00 2001 From: Danny Colombara Date: Mon, 10 Feb 2025 16:59:00 -0800 Subject: [PATCH 12/35] Fix #6 - replace validate_yaml_data with tsql_validate_field_types - replace import and refernce to imported YAML with get_chi_yaml and get_chi_cols - updated / improved header --- R/chi_sql_update.R | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/R/chi_sql_update.R b/R/chi_sql_update.R index 1264fb0..9611868 100644 --- a/R/chi_sql_update.R +++ b/R/chi_sql_update.R @@ -12,12 +12,16 @@ #' #' @return status message indicating success and location, or failure, of upload #' -#' -#' #' @keywords CHI, Tableau, Production #' +#' @importFrom data.table setDT copy +#' @importFrom DBI dbConnect dbWriteTable dbGetQuery dbExistsTable Id +#' @importFrom odbc odbc +#' @importFrom glue glue_sql +#' @importFrom yaml yaml.load +#' @importFrom rads tsql_validate_field_types +#' #' @export - #' chi_sql_update <- function(CHIestimates = NULL, CHImetadata = NULL, @@ -26,19 +30,17 @@ chi_sql_update <- function(CHIestimates = NULL, replace_table = F # default is to update select rows rather than replace the entire table ){ # load CHI yaml config file ---- - chi_config <- yaml::yaml.load(httr::GET(url = "https://raw.githubusercontent.com/PHSKC-APDE/rads/main/ref/chi_qa.yaml", httr::authenticate(Sys.getenv("GITHUB_TOKEN"), ""))) - # check CHIestimates argument---- if(!exists('CHIestimates')){stop("\n\U0001f47f The results table to push to SQL (CHIestimates) is missing ")} if( inherits(CHIestimates, "data.frame") == FALSE){stop("\n\U0001f47f CHIestimates must be a data.frame or a data.table.")} if( inherits(CHIestimates, "data.table") == FALSE){setDT(CHIestimates)} - rads::validate_yaml_data(DF = CHIestimates, YML = chi_config, VARS = "vars") + rads::tsql_validate_field_types(ph.data = CHIestimates, field_types = unlist(chi_get_yaml()$vars)) # check CHImetadata argument---- if(!exists('CHImetadata')){stop("\n\U0001f47f The metadata table to push to SQL (CHImetadata) is missing ")} if( inherits(CHImetadata, "data.frame") == FALSE){stop("\n\U0001f47f CHImetadata must be a data.frame or a data.table.")} if( inherits(CHImetadata, "data.table") == FALSE){setDT(CHImetadata)} - rads::validate_yaml_data(DF = CHImetadata, YML = chi_config, VARS = "metadata") + rads::tsql_validate_field_types(ph.data = CHImetadata, field_types = unlist(chi_get_yaml()$metadata)) # ensure indicator_key is consistent across estimates and metadata if(!identical(sort(as.character(unique(CHIestimates$indicator_key))), sort(as.character(CHImetadata$indicator_key)))){ @@ -145,7 +147,7 @@ chi_sql_update <- function(CHIestimates = NULL, value = as.data.frame(copy(CHIestimates)), overwrite = T, append = F, - field.types = unlist(chi_config$vars)) + field.types = unlist(chi_get_yaml()$vars)) # metadata DBI::dbWriteTable(conn = CHI_db_cxn, @@ -153,7 +155,7 @@ chi_sql_update <- function(CHIestimates = NULL, value = as.data.frame(copy(CHImetadata)), overwrite = T, append = F, - field.types = unlist(chi_config$metadata)) + field.types = unlist(chi_get_yaml()$metadata)) } if(isFALSE(replace_table)){ From 6260baf3a75b4ac7409d8b9d268a638c29e53f47 Mon Sep 17 00:00:00 2001 From: Danny Colombara Date: Mon, 10 Feb 2025 16:59:32 -0800 Subject: [PATCH 13/35] sorted global variables and added 'year' --- R/globals.R | 83 +++++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 71 insertions(+), 12 deletions(-) diff --git a/R/globals.R b/R/globals.R index 4b08dde..0ab6023 100644 --- a/R/globals.R +++ b/R/globals.R @@ -1,15 +1,74 @@ # Declare global variables for devtools::check() utils::globalVariables(c( - "varname", "group", "keepme", "reference", "chi_year", "tempstart", - "tempend", "chi_geo_kc", "cat1", "cat1_varname", "cat2", "cat2_varname", - "chi_age", "numerator", "count", "cat2_group", "indicator_key", "tab", - "round2", "lower_bound", "cat1_group", "cat2", "tempbv", "run_date", - "chi", "data_source", "time_trends", "suppression", "rse", "caution", - "comparison_with_kc_sig", "AgeMin", "AgeMax", "start", "race_type", "geo_type", - "denominator", "latest_yearx", "latest_year", "latest_year_resultx", "latest_year_result", - "run_datex", "latest_year_countx", "latest_year_count", "latest_year_kc_popx", - "latest_year_kc_pop", "valid_years", "end", "group_by1", "group_by2", "geo_id", - "gender", "race_eth", "race", "race_aic", "GEOID20", "hra20_name", "region_name", - "s2t_fraction", "source_id", "region", "pop", "bigcity", "geo_tract2020", "pov200grp", - "crosstabs", "vebrose", "span", "age", 'year.span' + "age", + "AgeMax", + "AgeMin", + "bigcity", + "cat1", + "cat1_group", + "cat1_varname", + "cat2", + "cat2", + "cat2_group", + "cat2_varname", + "caution", + "chi", + "chi_age", + "chi_geo_kc", + "chi_year", + "comparison_with_kc_sig", + "count", + "crosstabs", + "data_source", + "denominator", + "end", + "gender", + "geo_id", + "geo_tract2020", + "geo_type", + "GEOID20", + "group", + "group_by1", + "group_by2", + "hra20_name", + "indicator_key", + "keepme", + "latest_year", + "latest_year_count", + "latest_year_countx", + "latest_year_kc_pop", + "latest_year_kc_popx", + "latest_year_result", + "latest_year_resultx", + "latest_yearx", + "lower_bound", + "numerator", + "pop", + "pov200grp", + "race", + "race_aic", + "race_eth", + "race_type", + "reference", + "region", + "region_name", + "round2", + "rse", + "run_date", + "run_datex", + "s2t_fraction", + "source_id", + "span", + "start", + "suppression", + "tab", + "tempbv", + "tempend", + "tempstart", + "time_trends", + "valid_years", + "varname", + "vebrose", + "year.span", + "year" )) From f7ecc78c792228ecd7831f0190fb7660d5e9ed9f Mon Sep 17 00:00:00 2001 From: Danny Colombara Date: Mon, 10 Feb 2025 17:00:14 -0800 Subject: [PATCH 14/35] update DESCRIPTION & documentation --- DESCRIPTION | 3 +-- NAMESPACE | 39 +++++++++++++++++++++++++++++++-------- man/chi_get_cols.Rd | 18 ++++++++++++++++++ 3 files changed, 50 insertions(+), 10 deletions(-) create mode 100644 man/chi_get_cols.Rd diff --git a/DESCRIPTION b/DESCRIPTION index 89c7a46..86b2e5c 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -29,8 +29,7 @@ Imports: rads, rads.data, tidyr, - yaml (>= 2.2.1), - utils + yaml (>= 2.2.1) Remotes: github::PHSKC-APDE/rads, github::PHSKC-APDE/dtsurvey Suggests: diff --git a/NAMESPACE b/NAMESPACE index 8f1f3e5..20cf43e 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,29 +1,52 @@ # Generated by roxygen2: do not edit by hand +export(chi_calc) +export(chi_count_by_age) +export(chi_drop_illogical_ages) +export(chi_generate_instructions_pop) export(chi_generate_metadata) export(chi_generate_tro_shell) +export(chi_get_cols) +export(chi_get_yaml) +export(chi_process_nontrends) export(chi_qa_tro) export(chi_sql_update) -import(data.table) import(dtsurvey) import(future) import(future.apply) +importFrom(DBI,Id) +importFrom(DBI,dbConnect) +importFrom(DBI,dbExistsTable) +importFrom(DBI,dbGetQuery) +importFrom(DBI,dbWriteTable) importFrom(data.table,"%between%") -importFrom(data.table,':=') +importFrom(data.table,":=") +importFrom(data.table,.SD) +importFrom(data.table,`:=`) +importFrom(data.table,between) importFrom(data.table,copy) importFrom(data.table,data.table) -importFrom(data.table,dcast) -importFrom(data.table,fread) +importFrom(data.table,fcase) importFrom(data.table,is.data.table) -importFrom(data.table,setDF) +importFrom(data.table,rbindlist) +importFrom(data.table,set) importFrom(data.table,setDT) importFrom(data.table,setcolorder) importFrom(data.table,setnames) importFrom(data.table,setorder) -importFrom(data.table,shift) +importFrom(data.table,tstrsplit) +importFrom(future.apply,future_lapply) importFrom(glue,glue) -importFrom(rads,chi_qa) +importFrom(glue,glue_sql) +importFrom(odbc,odbc) +importFrom(rads,calc) +importFrom(rads,chi_cols) +importFrom(rads,compare_estimate) +importFrom(rads,string_clean) importFrom(rads,substrRight) +importFrom(rads,suppress) +importFrom(rads,tsql_validate_field_types) importFrom(tidyr,crossing) -importFrom(utils,write.table) +importFrom(tools,toTitleCase) +importFrom(yaml,read_yaml) importFrom(yaml,yaml.load) diff --git a/man/chi_get_cols.Rd b/man/chi_get_cols.Rd new file mode 100644 index 0000000..2047f92 --- /dev/null +++ b/man/chi_get_cols.Rd @@ -0,0 +1,18 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/chi_get_yaml.R +\name{chi_get_cols} +\alias{chi_get_cols} +\title{Get CHI variable column names} +\usage{ +chi_get_cols() +} +\value{ +A character vector of column names +} +\description{ +Returns a character vector of column names defined in the CHI YAML reference file. +This helper function provides easy access to the standardized CHI variable names. +} +\examples{ +cols <- chi_get_cols() +} From 61021f8b1fb03b7ae281d92992c4f67db20c9250 Mon Sep 17 00:00:00 2001 From: Danny Colombara Date: Tue, 18 Feb 2025 13:45:14 -0800 Subject: [PATCH 15/35] Drop columns from CHI standards - based on email from Joie on 2/18/2025 --- inst/ref/chi_qa.yaml | 3 --- 1 file changed, 3 deletions(-) diff --git a/inst/ref/chi_qa.yaml b/inst/ref/chi_qa.yaml index cb0f5a7..71cdf8d 100644 --- a/inst/ref/chi_qa.yaml +++ b/inst/ref/chi_qa.yaml @@ -18,9 +18,6 @@ vars: upper_bound: float se: float rse: float - comparison_with_kc: varchar(255) - time_trends: varchar(255) - significance: varchar(255) caution: varchar(255) suppression: varchar(255) numerator: float From d1d41643f79859bde98173c90c65cedf07ebc98e Mon Sep 17 00:00:00 2001 From: Danny Colombara Date: Tue, 18 Feb 2025 17:50:36 -0800 Subject: [PATCH 16/35] chi_calc BUG FIX & improvements - BUG FIX: arguments for where statement in future_lapply rads::calc were out of scope. Fix was to subset ph.data directly using data.table/dtsurvey syntax - added progress bar for future_lapply - simplified syntax within future_lapply - eliminated creation of columns Joie no longer wants: comparison_with_kc, significance, time_trends - input validation - switch to using na.omit rather than setdiff(X, NA) - better roxygen2 header --- R/proto_chi_calc.R | 424 +++++++++++++++++++++++++-------------------- man/chi_calc.Rd | 44 ++++- 2 files changed, 278 insertions(+), 190 deletions(-) diff --git a/R/proto_chi_calc.R b/R/proto_chi_calc.R index 915d7bb..4c1ca45 100644 --- a/R/proto_chi_calc.R +++ b/R/proto_chi_calc.R @@ -1,22 +1,58 @@ #' Calculate CHI Estimates #' #' @description -#' Generates CHI estimates from input data according to provided instructions. +#' Generates CHI estimates from input data according to provided instructions +#' created by \code{\link{chi_generate_tro_shell}}. #' Handles both proportions and rates, with options for suppression of small numbers. #' -#' @param ph.data Input data.frame or data.table containing CHI data +#' @param ph.data Input data.frame or data.table containing analytic read data #' @param ph.instructions data.frame or data.table containing calculation instructions -#' @param rate Logical; if TRUE calculates rates, if FALSE calculates proportions -#' @param rate_per Rate multiplier when rate=TRUE (e.g., 100000 for per 100,000) -#' @param small_num_suppress Logical; if TRUE suppresses small numbers +#' @param rate Logical; if \code{TRUE} calculates rates, if \code{FALSE} calculates proportions +#' @param rate_per Rate multiplier when \code{rate=TRUE} (e.g., 100000 for per 100,000) +#' @param small_num_suppress Logical; if \code{TRUE} suppresses small numbers #' @param suppress_low Lower bound for suppression #' @param suppress_high Upper bound for suppression #' @param source_name Name of data source #' @param source_date Date of data source #' -#' @return A data.table containing CHI estimates with standard columns +#' @return A data.table containing CHI estimates with the following columns: +#' \itemize{ +#' \item{\code{data_source}} Data source (e.g., acs, brfss, etc.) +#' \item{\code{indicator_key}} Unique indicator key +#' \item{\code{tab}} Type of analysis (e.g., demgroups, _kingcounty, etc.) +#' \item{\code{year}} Year(s) of data +#' \item{\code{cat1}} Describes data field (e.g., Gender, Ethnicity, etc.) +#' \item{\code{cat1_group}} Demographics variables for cat1 (e.g., Female, Hispanic, etc.) +#' \item{\code{cat1_varname}} cat1 variable name in the analytic data sets +#' \item{\code{cat2}} Describes data field (e.g., Gender, Ethnicity, etc.) +#' \item{\code{cat2_group}} Demographics variables for cat2 (e.g., Female, Hispanic, etc.) +#' \item{\code{cat2_varname}} cat2 variable name in the analytic data sets +#' \item{\code{result}} Calculated proportion/percent or rate +#' \item{\code{lower_bound}} Lower bound of confidence interval +#' \item{\code{upper_bound}} Upper bound of confidence interval +#' \item{\code{se}} Standard error +#' \item{\code{rse}} Relative standard error +#' \item{\code{caution}} '!' when RSE>=30% | N == 0 +#' \item{\code{suppression}} '^' when suppressed +#' \item{\code{numerator}} For line-level data, count of events; for surveys, people who responded yes or no for binary variable +#' \item{\code{numerator}} For line-level data, population; for surveys, sample size +#' \item{\code{chi}} '1' indicates that rows is used for CHI +#' \item{\code{source_date}} date analytic ready data was created +#' \item{\code{run_date}} date of this analysis +#' } +#' +#' @seealso +#' \code{\link{chi_generate_tro_shell}} for creating calculation instructions +#' +#' \code{\link{chi_qa_tro}} for validating results +#' +#' \code{\link{chi_generate_metadata}} for creating metadata from results +#' #' @importFrom data.table setDT copy setnames := setorder set .SD data.table #' @importFrom rads calc compare_estimate suppress chi_cols +#' @importFrom future.apply future_lapply +#' @importFrom stats na.omit +#' @import progressr #' @export #' #' @@ -29,21 +65,38 @@ chi_calc <- function(ph.data = NULL, suppress_high = 9, source_name = 'blahblah', source_date = NULL){ + # Input validation ---- + if (is.null(ph.data)) stop("\n\U1F6D1 ph.data must be provided") + if (!is.data.frame(ph.data)) stop("\n\U1F6D1 ph.data must be a data.frame or data.table") + + if (is.null(ph.instructions)) stop("\n\U1F6D1 ph.instructions must be provided") + if (!is.data.frame(ph.instructions)) stop("\n\U1F6D1 ph.instructions must be a data.frame or data.table") + + if (!is.logical(rate)) stop("\n\U1F6D1 rate must be logical (TRUE/FALSE)") + if (rate && is.null(rate_per)) stop("\n\U1F6D1 rate_per must be provided when rate=TRUE") + + if (!is.logical(small_num_suppress)) stop("\n\U1F6D1 small_num_suppress must be logical (TRUE/FALSE)") + + # Convert to data.table if needed + if (!is.data.table(ph.data)) setDT(ph.data) + if (!is.data.table(ph.instructions)) setDT(ph.instructions) + # Error if ph.instructions has no data ---- - if(nrow(ph.instructions) == 0){ - stop("\n\U0001f47f the table ph.instructions does not have any rows.") - #tempCHIest <- data.table(setNames(data.frame(matrix(ncol = length(chi_cols()), nrow = 0), stringsAsFactors = FALSE), chi_cols())) - } + if(nrow(ph.instructions) == 0){ + stop("\n\U0001f47f the table ph.instructions does not have any rows.") + #tempCHIest <- data.table(setNames(data.frame(matrix(ncol = length(chi_cols()), nrow = 0), stringsAsFactors = FALSE), chi_cols())) + } + # Create 'Overall' if needed for crosstabs ---- - if(!'overall' %in% names(ph.data)){ - ph.data$overall <- with(ph.data, ifelse(chi_geo_kc == 'King County', 'Overall', NA_character_)) - } + if(!'overall' %in% names(ph.data)){ + ph.data$overall <- with(ph.data, ifelse(chi_geo_kc == 'King County', 'Overall', NA_character_)) + } # Check to make sure all variables needed exist in the data ---- - neededbyvars <- setdiff(unique(c(ph.instructions$cat1_varname, ph.instructions$cat2_varname)), c(NA)) + neededbyvars <- unique(na.omit(c(ph.instructions$cat1_varname, ph.instructions$cat2_varname))) if("race3" %in% neededbyvars & !"race3_hispanic" %in% neededbyvars){neededbyvars <- c(neededbyvars, 'race3_hispanic')} # By definition, Hispanic cannot be contained within race3 - neededvars <- setdiff(unique(c(ph.instructions$indicator_key, neededbyvars)), c(NA)) + neededvars <- unique(na.omit(c(ph.instructions$indicator_key, neededbyvars))) missingvars <- setdiff(neededvars, names(ph.data)) if(length(missingvars) > 0 ){ @@ -53,189 +106,192 @@ chi_calc <- function(ph.data = NULL, } else{message("\U0001f642 All specified variables exist in ph.data")} # Check to make sure all byvariables have the CHI specified encoding ---- - stdbyvars <- rads.data::misc_chi_byvars[varname %in% setdiff(unique(c(ph.instructions$cat1_varname, ph.instructions$cat2_varname)), c(NA))][, list(varname, group, keepme, reference = 1)] + stdbyvars <- rads.data::misc_chi_byvars[varname %in% unique(na.omit(c(ph.instructions$cat1_varname, ph.instructions$cat2_varname)))][, list(varname, group, keepme, reference = 1)] stdbyvars[group %in% c("Hispanic", 'Non-Hispanic') & varname == 'race3', varname := 'race3_hispanic'] # necessary because race3 & Hispanic must be two distinct variables in raw data phbyvars <- rbindlist(lapply( X=as.list(neededbyvars), - FUN = function(X){data.table::data.table(varname = X, group = setdiff(unique(ph.data[[X]]), NA), ph.data = 1)})) + FUN = function(X){data.table::data.table(varname = X, group = unique(na.omit(ph.data[[X]])), ph.data = 1)})) compbyvars <- merge(stdbyvars, phbyvars, by = c('varname', 'group'), all = T) if(nrow(compbyvars[is.na(reference)| is.na(ph.data)]) > 0){ print(compbyvars[is.na(reference)| is.na(ph.data)]) stop("\n\U2620 the table above shows the varname/group combinations that do not align between the reference table and your ph.data.") } else {message("\U0001f642 All specified cat1_group and cat2_group values align with the reference standard.")} - # Use Daniel's calc function to generate estimates for each row of ph.instructions ---- + # Use rads::calc to generate estimates for each row of ph.instructions ---- message("\U023F3 Be patient! The function is generating estimates for each row of ph.instructions.") - tempCHIest <- rbindlist(future_lapply( - X = as.list(seq(1, nrow(ph.instructions), 1)), - FUN = function(X){ - message(paste0("Calculating estimates for ph.instructions row ", X, " of ", nrow(ph.instructions), "...")) - - # create constants for calc()---- - tempbv1 <- setdiff(ph.instructions[X][['cat1_varname']], c()) - tempbv2 <- setdiff(ph.instructions[X][['cat2_varname']], c()) - if(length(tempbv2) == 0){tempbv2 = NA} - tempbv <- setdiff(c(tempbv1, tempbv2), c(NA)) - - # create variables of interest used in calc function below - tempbv <- tempbv - tempend <- ph.instructions[X][['end']] - tempstart <- ph.instructions[X][['start']] - - # use calc()---- - if(rate == FALSE){ # standard proportion analysis - if(any(grepl('wastate', tempbv))){ - tempest <- rads::calc(ph.data = ph.data, - what = ph.instructions[X][['indicator_key']], - where = chi_year >= tempstart & chi_year <= tempend, - by = tempbv, - metrics = c('mean', 'numerator', 'denominator', 'rse')) - } else { - tempest <- rads::calc(ph.data = ph.data, - what = ph.instructions[X][['indicator_key']], - where = chi_year >= tempstart & chi_year <= tempend & chi_geo_kc == 'King County', - by = tempbv, - metrics = c('mean', 'numerator', 'denominator', 'rse')) - } - } - if(rate == TRUE){ - if(any(grepl('wastate', tempbv))){ - tempest <- rads::calc(ph.data = ph.data, - what = ph.instructions[X][['indicator_key']], - where = chi_year >= tempstart & chi_year <= tempend, - by = tempbv, - metrics = c('rate', 'numerator', 'denominator', 'rse'), - per = rate_per) - } else { - tempest <- rads::calc(ph.data = ph.data, - what = ph.instructions[X][['indicator_key']], - where = chi_year >= tempstart & chi_year <= tempend & chi_geo_kc == 'King County', - by = tempbv, - metrics = c('rate', 'numerator', 'denominator', 'rse'), - per = rate_per) + progressr::handlers(handler_progress()) + with_progress({ + p <- progressor(nrow(ph.instructions)) + tempCHIest <- rbindlist(future_lapply( + X = as.list(seq(1, nrow(ph.instructions), 1)), + FUN = function(X){ + p(sprintf("Processing row %d of %d", X, nrow(ph.instructions))) + + # get the current row + current_row <- ph.instructions[X, ] + + # create constants for calc()---- + tempbv1 <- setdiff(current_row$cat1_varname, c()) + tempbv2 <- setdiff(current_row$cat2_varname, c()) + if(length(tempbv2) == 0){tempbv2 = NA} + + # create variables of interest used in calc function below + tempbv <- unique(na.omit(c(tempbv1, tempbv2))) + tempend <- current_row$end + tempstart <- current_row$start + + # use calc()---- + if(rate == FALSE){ # standard proportion analysis + if(any(grepl('wastate', tempbv))){ + tempest <- rads::calc(ph.data = ph.data[chi_year >= tempstart & chi_year <= tempend], + what = current_row$indicator_key, + by = tempbv, + metrics = c('mean', 'numerator', 'denominator', 'rse')) + } else { + tempest <- rads::calc(ph.data = ph.data[chi_year >= tempstart & chi_year <= tempend & chi_geo_kc == 'King County'], + what = current_row$indicator_key, + by = tempbv, + metrics = c('mean', 'numerator', 'denominator', 'rse')) + } + } + if(rate == TRUE){ + if(any(grepl('wastate', tempbv))){ + tempest <- rads::calc(ph.data = ph.data[chi_year >= tempstart & chi_year <= tempend], + what = current_row$indicator_key, + by = tempbv, + metrics = c('rate', 'numerator', 'denominator', 'rse'), + per = rate_per) + } else { + tempest <- rads::calc(ph.data = ph.data[chi_year >= tempstart & chi_year <= tempend & chi_geo_kc == 'King County'], + what = current_row$indicator_key, + by = tempbv, + metrics = c('rate', 'numerator', 'denominator', 'rse'), + per = rate_per) + } + data.table::setnames(tempest, gsub("^rate", "mean", names(tempest))) + } + + # add on CHI standard columns that are from ph.instructions (in order of standard results output)---- + tempest[, indicator_key := current_row$indicator_key] + tempest[, tab := current_row$tab] + tempest[current_row$end != current_row$start, + year := paste0(current_row$start, "-", current_row$end)] + tempest[current_row$end == current_row$start, + year := current_row$end] + tempest[, cat1 := current_row$cat1] + data.table::setnames(tempest, current_row$cat1_varname, 'cat1_group') + tempest[, cat1_varname := current_row$cat1_varname] + tempest[, cat2 := current_row$cat2] + if(!is.na(tempbv2) & tempbv1 != tempbv2){ + data.table::setnames(tempest, current_row$cat2_varname, 'cat2_group')} else{ + tempest[, cat2_group := NA] } + tempest[, cat2_varname := current_row$cat2_varname] + data.table::setnames(tempest, + c("mean", "mean_lower", "mean_upper", "mean_se"), + c("result", "lower_bound", "upper_bound", "se")) + + # set correct data types for TSQL database + tempest[, denominator := as.numeric(denominator)] + tempest[, numerator := as.numeric(numerator)] + + return(tempest) } - data.table::setnames(tempest, gsub("^rate", "mean", names(tempest))) - } - - # add on CHI standard columns that are from ph.instructions (in order of standard results output)---- - tempest[, indicator_key := ph.instructions[X][['indicator_key']]] - tempest[, tab := ph.instructions[X][['tab']]] - tempest[ph.instructions[X][['end']] != ph.instructions[X][['start']], - year := paste0(ph.instructions[X][['start']], "-", ph.instructions[X][['end']])] - tempest[ph.instructions[X][['end']] == ph.instructions[X][['start']], - year := ph.instructions[X][['end']]] - tempest[, cat1 := ph.instructions[X][['cat1']]] - data.table::setnames(tempest, ph.instructions[X][['cat1_varname']], 'cat1_group') - tempest[, cat1_varname := ph.instructions[X][['cat1_varname']]] - tempest[, cat2 := ph.instructions[X][['cat2']]] - if(!is.na(tempbv2) & tempbv1 != tempbv2){ - data.table::setnames(tempest, ph.instructions[X][['cat2_varname']], 'cat2_group')} else{ - tempest[, cat2_group := NA] } - tempest[, cat2_varname := ph.instructions[X][['cat2_varname']]] - data.table::setnames(tempest, - c("mean", "mean_lower", "mean_upper", "mean_se"), - c("result", "lower_bound", "upper_bound", "se")) - } - ), use.names = TRUE) + ), use.names = TRUE) + }) + # Tidy results ---- - # drop when cat1_group is missing (e.g., cat1 == 'Regions' and region is NA) ---- - tempCHIest <- tempCHIest[!is.na(cat1_group)] - - # drop when cat2_group is missing but cat2 is not missing ---- - tempCHIest <- tempCHIest[!(is.na(cat2_group) & !is.na(cat2))] - - # drop if cat1_group | cat2_group had `keepme == "No"` in the reference table ---- - dropme <- unique(stdbyvars[keepme == 'No'][, reference := NULL]) - tempCHIest <- merge(tempCHIest, - dropme, - by.x = c('cat1_varname', 'cat1_group'), - by.y = c('varname', 'group'), - all.x = T, - all.y = F) - tempCHIest <- tempCHIest[is.na(keepme)][, keepme := NULL] - - tempCHIest <- merge(tempCHIest, - dropme, - by.x = c('cat2_varname', 'cat2_group'), - by.y = c('varname', 'group'), - all.x = T, - all.y = F) - tempCHIest <- tempCHIest[is.na(keepme)][, keepme := NULL] - - # change all NaN to a normal NA or SQL will vomit ---- - for(col in names(tempCHIest)) set(tempCHIest, i=which(is.nan(tempCHIest[[col]])), j=col, value=NA) - - # apply rounding rules for proportions---- - if(rate == FALSE){ - tempCHIest[, c("result", "lower_bound", "upper_bound", "rse") := lapply(.SD, round2, 3), .SDcols = c("result", "lower_bound", "upper_bound", "rse")] - tempCHIest[, c("se") := lapply(.SD, round2, 4), .SDcols = c("se")] - } - - # apply rounding rules for rates---- - if(rate == TRUE){ - tempCHIest[, c("result", "lower_bound", "upper_bound") := lapply(.SD, round2, 1), .SDcols = c("result", "lower_bound", "upper_bound")] - tempCHIest[, c("rse") := lapply(.SD, round2, 3), .SDcols = c("rse")] - tempCHIest[, c("se") := lapply(.SD, round2, 2), .SDcols = c("se")] - } - - # prevent negative lower CI ---- - tempCHIest[lower_bound < 0, lower_bound := 0] - - # race3/race4 messiness ---- - tempCHIest[cat1_varname == 'race3_hispanic', cat1_varname := 'race3'] - tempCHIest[cat2_varname == 'race3_hispanic', cat2_varname := 'race3'] - - if(any(grepl("Birthing per", unique(tempCHIest$cat1)))){ - tempCHIest[cat1_varname %in% c("race3", "race4") & tab == 'trends', cat1 := "Birthing person's race/ethnicity"] - tempCHIest[cat2_varname %in% c("race3", "race4") & tab == 'trends', cat2 := "Birthing person's race/ethnicity"] - - tempCHIest[cat1_varname %in% c('race3') & tab %in% c('crosstabs', 'demgroups') & cat1_group == 'Hispanic', - cat1 := "Birthing person's ethnicity"] - tempCHIest[cat2_varname %in% c('race3') & tab %in% c('crosstabs', 'demgroups') & cat2_group == 'Hispanic', - cat2 := "Birthing person's ethnicity"] - } else { - tempCHIest[cat1_varname %in% c("race3", "race4") & tab == 'trends', cat1 := "Race/ethnicity"] - tempCHIest[cat2_varname %in% c("race3", "race4") & tab == 'trends', cat2 := "Race/ethnicity"] - - tempCHIest[cat1_varname %in% c('race3') & tab %in% c('crosstabs', 'demgroups') & cat1_group == 'Hispanic', - cat1 := 'Ethnicity'] - tempCHIest[cat2_varname %in% c('race3') & tab %in% c('crosstabs', 'demgroups') & cat2_group == 'Hispanic', - cat2 := 'Ethnicity'] - } - - # Create additional necessary CHI columns ---- - tempCHIest[, source_date := as.Date(source_date)] - tempCHIest[, run_date := as.Date(Sys.Date(), "%Y%m%d")] - tempCHIest[, chi := 1] # set to 1 by default, because this is for CHI. Manually set to zero when needed afterward - tempCHIest[tab == 'metadata', chi := 0] - tempCHIest[, data_source := source_name] - tempCHIest[, time_trends := NA] # NA because no longer calculated - - tempCHIest <- rads::compare_estimate(mydt = tempCHIest, - id_vars = c("indicator_key", "year"), - key_where = cat1_group == "King County" & tab != "crosstabs", - new_col = "comparison_with_kc", - tidy = TRUE) - data.table::setnames(tempCHIest, "comparison_with_kc_sig", "significance") - - if(small_num_suppress == TRUE){ - tempCHIest <- rads::suppress(sup_data = tempCHIest, - suppress_range = c(suppress_low, suppress_high), - secondary = T, - secondary_exclude = cat1_varname != 'race3') - } else {tempCHIest[, suppression := NA_character_]} - - tempCHIest[rse>=30, caution := "!"] - - - # Keep and order standard CHI columns ---- - tempCHIest <- tempCHIest[, c(rads::chi_cols()[]), with = F] - - tempCHIest <- tempCHIest[, cat1 := factor(cat1, levels = c("King County", sort(setdiff(unique(tempCHIest$cat1), "King County"))) )] - tempCHIest <- tempCHIest[, tab := factor(tab, levels = c(c("_kingcounty","demgroups", "trends"), sort(setdiff(unique(tempCHIest$tab), c("_kingcounty","demgroups", "trends")))) )] - setorder(tempCHIest, indicator_key, tab, -year, cat1, cat1_group, cat2, cat2_group) + # drop when cat1_group is missing (e.g., cat1 == 'Regions' and region is NA) ---- + tempCHIest <- tempCHIest[!is.na(cat1_group)] + + # drop when cat2_group is missing but cat2 is not missing ---- + tempCHIest <- tempCHIest[!(is.na(cat2_group) & !is.na(cat2))] + + # drop if cat1_group | cat2_group had `keepme == "No"` in the reference table ---- + dropme <- unique(stdbyvars[keepme == 'No'][, reference := NULL]) + tempCHIest <- merge(tempCHIest, + dropme, + by.x = c('cat1_varname', 'cat1_group'), + by.y = c('varname', 'group'), + all.x = T, + all.y = F) + tempCHIest <- tempCHIest[is.na(keepme)][, keepme := NULL] + + tempCHIest <- merge(tempCHIest, + dropme, + by.x = c('cat2_varname', 'cat2_group'), + by.y = c('varname', 'group'), + all.x = T, + all.y = F) + tempCHIest <- tempCHIest[is.na(keepme)][, keepme := NULL] + + # change all NaN to a normal NA or SQL will vomit ---- + for(col in names(tempCHIest)) set(tempCHIest, i=which(is.nan(tempCHIest[[col]])), j=col, value=NA) + + # apply rounding rules for proportions---- + if(rate == FALSE){ + tempCHIest[, c("result", "lower_bound", "upper_bound", "rse") := lapply(.SD, round2, 3), .SDcols = c("result", "lower_bound", "upper_bound", "rse")] + tempCHIest[, c("se") := lapply(.SD, round2, 4), .SDcols = c("se")] + } + + # apply rounding rules for rates---- + if(rate == TRUE){ + tempCHIest[, c("result", "lower_bound", "upper_bound") := lapply(.SD, round2, 1), .SDcols = c("result", "lower_bound", "upper_bound")] + tempCHIest[, c("rse") := lapply(.SD, round2, 3), .SDcols = c("rse")] + tempCHIest[, c("se") := lapply(.SD, round2, 2), .SDcols = c("se")] + } + + # prevent negative lower CI ---- + tempCHIest[lower_bound < 0, lower_bound := 0] + + # race3/race4 messiness ---- + tempCHIest[cat1_varname == 'race3_hispanic', cat1_varname := 'race3'] + tempCHIest[cat2_varname == 'race3_hispanic', cat2_varname := 'race3'] + + if(any(grepl("Birthing per", unique(tempCHIest$cat1)))){ + tempCHIest[cat1_varname %in% c("race3", "race4") & tab == 'trends', cat1 := "Birthing person's race/ethnicity"] + tempCHIest[cat2_varname %in% c("race3", "race4") & tab == 'trends', cat2 := "Birthing person's race/ethnicity"] + + tempCHIest[cat1_varname %in% c('race3') & tab %in% c('crosstabs', 'demgroups') & cat1_group == 'Hispanic', + cat1 := "Birthing person's ethnicity"] + tempCHIest[cat2_varname %in% c('race3') & tab %in% c('crosstabs', 'demgroups') & cat2_group == 'Hispanic', + cat2 := "Birthing person's ethnicity"] + } else { + tempCHIest[cat1_varname %in% c("race3", "race4") & tab == 'trends', cat1 := "Race/ethnicity"] + tempCHIest[cat2_varname %in% c("race3", "race4") & tab == 'trends', cat2 := "Race/ethnicity"] + + tempCHIest[cat1_varname %in% c('race3') & tab %in% c('crosstabs', 'demgroups') & cat1_group == 'Hispanic', + cat1 := 'Ethnicity'] + tempCHIest[cat2_varname %in% c('race3') & tab %in% c('crosstabs', 'demgroups') & cat2_group == 'Hispanic', + cat2 := 'Ethnicity'] + } + + # Create additional necessary CHI columns ---- + tempCHIest[, source_date := as.Date(source_date)] + tempCHIest[, run_date := as.Date(Sys.Date(), "%Y%m%d")] + tempCHIest[, chi := 1] # set to 1 by default, because this is for CHI. Manually set to zero when needed afterward + tempCHIest[tab == 'metadata', chi := 0] + tempCHIest[, data_source := source_name] + + if(small_num_suppress == TRUE){ + tempCHIest <- rads::suppress(sup_data = tempCHIest, + suppress_range = c(suppress_low, suppress_high), + secondary = T, + secondary_exclude = cat1_varname != 'race3') + } else {tempCHIest[, suppression := NA_character_]} + + tempCHIest[rse>=30, caution := "!"] + + tempCHIest[, c('cat2', 'cat2_group', 'cat2_varname') := lapply(.SD, as.character), .SDcols = c('cat2', 'cat2_group', 'cat2_varname')] + + + # Keep and order standard CHI columns ---- + tempCHIest <- tempCHIest[, chi_get_cols(), with = F] + + tempCHIest <- tempCHIest[, cat1 := factor(cat1, levels = c("King County", sort(setdiff(unique(tempCHIest$cat1), "King County"))) )] + tempCHIest <- tempCHIest[, tab := factor(tab, levels = c(c("_kingcounty","demgroups", "trends"), sort(setdiff(unique(tempCHIest$tab), c("_kingcounty","demgroups", "trends")))) )] + setorder(tempCHIest, indicator_key, tab, -year, cat1, cat1_group, cat2, cat2_group) # return the CHI table ---- return(tempCHIest) diff --git a/man/chi_calc.Rd b/man/chi_calc.Rd index 4a65b7e..4ecf01d 100644 --- a/man/chi_calc.Rd +++ b/man/chi_calc.Rd @@ -17,15 +17,15 @@ chi_calc( ) } \arguments{ -\item{ph.data}{Input data.frame or data.table containing CHI data} +\item{ph.data}{Input data.frame or data.table containing analytic read data} \item{ph.instructions}{data.frame or data.table containing calculation instructions} -\item{rate}{Logical; if TRUE calculates rates, if FALSE calculates proportions} +\item{rate}{Logical; if \code{TRUE} calculates rates, if \code{FALSE} calculates proportions} -\item{rate_per}{Rate multiplier when rate=TRUE (e.g., 100000 for per 100,000)} +\item{rate_per}{Rate multiplier when \code{rate=TRUE} (e.g., 100000 for per 100,000)} -\item{small_num_suppress}{Logical; if TRUE suppresses small numbers} +\item{small_num_suppress}{Logical; if \code{TRUE} suppresses small numbers} \item{suppress_low}{Lower bound for suppression} @@ -36,9 +36,41 @@ chi_calc( \item{source_date}{Date of data source} } \value{ -A data.table containing CHI estimates with standard columns +A data.table containing CHI estimates with the following columns: +\itemize{ + \item{\code{data_source}} Data source (e.g., acs, brfss, etc.) + \item{\code{indicator_key}} Unique indicator key + \item{\code{tab}} Type of analysis (e.g., demgroups, _kingcounty, etc.) + \item{\code{year}} Year(s) of data + \item{\code{cat1}} Describes data field (e.g., Gender, Ethnicity, etc.) + \item{\code{cat1_group}} Demographics variables for cat1 (e.g., Female, Hispanic, etc.) + \item{\code{cat1_varname}} cat1 variable name in the analytic data sets + \item{\code{cat2}} Describes data field (e.g., Gender, Ethnicity, etc.) + \item{\code{cat2_group}} Demographics variables for cat2 (e.g., Female, Hispanic, etc.) + \item{\code{cat2_varname}} cat2 variable name in the analytic data sets + \item{\code{result}} Calculated proportion/percent or rate + \item{\code{lower_bound}} Lower bound of confidence interval + \item{\code{upper_bound}} Upper bound of confidence interval + \item{\code{se}} Standard error + \item{\code{rse}} Relative standard error + \item{\code{caution}} '!' when RSE>=30% | N == 0 + \item{\code{suppression}} '^' when suppressed + \item{\code{numerator}} For line-level data, count of events; for surveys, people who responded yes or no for binary variable + \item{\code{numerator}} For line-level data, population; for surveys, sample size + \item{\code{chi}} '1' indicates that rows is used for CHI + \item{\code{source_date}} date analytic ready data was created + \item{\code{run_date}} date of this analysis +} } \description{ -Generates CHI estimates from input data according to provided instructions. +Generates CHI estimates from input data according to provided instructions +created by \code{\link{chi_generate_tro_shell}}. Handles both proportions and rates, with options for suppression of small numbers. } +\seealso{ +\code{\link{chi_generate_tro_shell}} for creating calculation instructions + +\code{\link{chi_qa_tro}} for validating results + +\code{\link{chi_generate_metadata}} for creating metadata from results +} From 739b50f30ebf133fca96c87e71a092d53dc72495 Mon Sep 17 00:00:00 2001 From: Danny Colombara Date: Tue, 18 Feb 2025 17:56:23 -0800 Subject: [PATCH 17/35] Updated chi_qa_tro - improved headers - added section compare cat1*/cat2* with rads.data::misc_chi_byvars - chi_est >> NOW CHIestimates - chi_meta >> NOW CHImetadata - removed declaration of globals (now in separate file) - dropped all ref to `comparison_with_kc` - --- R/chi_qa_tro.R | 264 ++++++++++++++++++++++++++++++++-------------- man/chi_qa_tro.Rd | 35 ++++-- 2 files changed, 211 insertions(+), 88 deletions(-) diff --git a/R/chi_qa_tro.R b/R/chi_qa_tro.R index de39f60..16dd330 100644 --- a/R/chi_qa_tro.R +++ b/R/chi_qa_tro.R @@ -1,58 +1,74 @@ #' CHI - QA: Tableau Ready output #' #' @description -#' Checks data for compliance with CHI TRO standard - +#' Checks estimates created by \code{\link{chi_calc}} and metadata created by +#' \code{\link{chi_generate_metadata}} for compliance with CHI TRO standard. #' #' @details #' This function tests if the structure of the data matches CHI Tableau Ready Output #' specifications. If set to verbose mode (default) will report diagnostic information in #' The user interface as warnings (if failed) and messages (for progress and pass). In any #' Case will return 1 or 0 for pass or fail respectively. -#' The CHI Tableau Ready Output (TRO) standards can be reviewed [here](https://kc1.sharepoint.com/:x:/r/teams/DPH-CommunityHealthIndicators/_layouts/15/Doc.aspx?sourcedoc=%7BBED2F507-B834-4D28-8658-C5724F64C001%7D&file=CHI-Standards-TableauReady%20Output.xlsx&action=default&mobileredirect=true&CID=975A2706-130E-46AB-AEB2-DF919FDEC185&wdLOR=cDF77DB7A-3774-430F-92C7-EFAB34CEB35F) +#' The CHI Tableau Ready Output (TRO) standards can be reviewed here: +#' \href{https://kc1.sharepoint.com/teams/DPH-CommunityHealthIndicators/CHIVizes/CHI-Standards-TableauReady\%20Output.xlsx}{ +#' SharePoint > Community Health Indicators > CHI_vizes > CHI-Standards-TableauReady Output.xlsx} #' -#' @param chi_est Name of a data.table or data.frame containing the prepared data to be pushed to SQL -#' @param chi_meta Name of a data.table or data.frame containing the metadata to be pushed to SQL +#' @param CHIestimates Name of a data.table or data.frame containing the prepared data to be pushed to SQL +#' @param CHImetadata Name of a data.table or data.frame containing the metadata to be pushed to SQL #' @param acs default FALSE, Indicates whether it is ACS data (which does not have / need varnames) #' @param ignore_trends default TRUE, Indicates whether the time_trends column should be ignored when checking for missing data. #' @param verbose default TRUE, if false will only return status #' #' @return 1 or 0 for pass or fail status #' +#' @seealso +#' \code{\link{chi_calc}} for generating estimates +#' +#' \code{\link{chi_generate_metadata}} for creating metadata +#' +#' \code{\link{chi_compare_estimates}} for comparing estimates between versions +#' #' @keywords CHI, Tableau, Production #' -#' @importFrom data.table setDT copy setcolorder is.data.table %between% +#' @importFrom data.table setDT copy setcolorder is.data.table %between% uniqueN #' @importFrom glue glue #' @importFrom yaml read_yaml -#' @importFrom rads tsql_validate_field_types +#' @importFrom rads tsql_validate_field_types round2 #' #' @examples #' #' \dontrun{ -#' # create sample data -#' -#' # run function +#' # QA check estimates and metadata +#' qa_status <- chi_qa_tro( +#' CHIestimates = my_estimates, +#' CHImetadata = my_metadata, +#' acs = FALSE, +#' ignore_trends = TRUE, +#' verbose = TRUE +#' ) #' } #' #' @export -chi_qa_tro <- function(chi_est, chi_meta, acs = F, ignore_trends = T, verbose = F){ +chi_qa_tro <- function(CHIestimates, + CHImetadata, + acs = FALSE, + ignore_trends = TRUE, + verbose = FALSE){ status <- 1 - #global variables used by data.table declared as NULL here to play nice with devtools::check() - indicator_key <- result_type <- result <- upper_bound <- lower_bound <- rse <- caution <- tab <- suppression <- time_trends <- NULL - + ## Check arguments ---- if(verbose){ message("Checking that both the results and the metadata were provided") } - if(is.null(chi_est)){ + if(is.null(CHIestimates)){ status <- 0 if(verbose) { warning("You must provide the name of a data.frame or data.table that contains the CHI results.") } } - if(is.null(chi_meta)){ + if(is.null(CHImetadata)){ status <- 0 if(verbose){ warning("You must provide the name of a data.frame or data.table that contains the CHI metadata ") @@ -67,104 +83,106 @@ chi_qa_tro <- function(chi_est, chi_meta, acs = F, ignore_trends = T, verbose = return(status) } - chi_est <- data.table::setDT(copy(chi_est)) - chi_meta <- data.table::setDT(copy(chi_meta)) + CHIestimates <- data.table::setDT(copy(CHIestimates)) + CHImetadata <- data.table::setDT(copy(CHImetadata)) ## Check columns ---- if(verbose) { message("Checking that all column names are unique") } - if(length(names(chi_est)) != length(unique(names(chi_est)))) { + if(length(names(CHIestimates)) != length(unique(names(CHIestimates)))) { status <- 0 if(verbose){ - warning("You submitted a dataset where at least two columns have the same name. All names in chi_est must be unique.") + warning("You submitted a dataset where at least two columns have the same name. All names in CHIestimates must be unique.") } } - if(length(names(chi_meta)) != length(unique(names(chi_meta)))) { + if(length(names(CHImetadata)) != length(unique(names(CHImetadata)))) { status <- 0 if(verbose){ - warning("You submitted a metadata table where at least two columns have the same name. All names in chi_meta must be unique.") + warning("You submitted a metadata table where at least two columns have the same name. All names in CHImetadata must be unique.") } } if(verbose){ message("Checking that all necessary columns exist") } - missing.var <- setdiff(chi_get_cols(), names(chi_est)) + missing.var <- setdiff(chi_get_cols(), names(CHIestimates)) if(length(missing.var) > 0){ status <- 0 if(verbose) { missing.var <- paste(missing.var, collapse = ", ") - warning(glue::glue("You are missing the following critical columns(s) in chi_est: {missing.var}")) + warning(glue::glue("You are missing the following critical columns(s) in CHIestimates: {missing.var}")) } } - missing.var <- setdiff(names(unlist(chi_get_yaml()$metadata)), names(chi_meta)) + missing.var <- setdiff(names(unlist(chi_get_yaml()$metadata)), names(CHImetadata)) if(length(missing.var) > 0){ status <- 0 if(verbose){ missing.var <- paste(missing.var, collapse = ", ") - warning(glue::glue("You are missing the following critical columns(s) in chi_meta: {missing.var}")) + warning(glue::glue("You are missing the following critical columns(s) in CHImetadata: {missing.var}")) } } - # Confirm that there are no additional variables ---- - extra.var <- setdiff(names(chi_est), chi_get_cols()) + ## Confirm that there are no additional variables ---- + extra.var <- setdiff(names(CHIestimates), chi_get_cols()) if(length(extra.var) > 0){ status <- 0 if(verbose){ extra.var <- paste(extra.var, collapse = ", ") warning(glue::glue("Your dataset contains the following columns that are not CHI compliant: {extra.var}. - Please drop these variables from chi_est before attempting to QA the data again.")) + Please drop these variables from CHIestimates before attempting to QA the data again.")) } } - extra.var <- setdiff(names(chi_meta), names(unlist(chi_get_yaml()$metadata))) + extra.var <- setdiff(names(CHImetadata), names(unlist(chi_get_yaml()$metadata))) if(length(extra.var) > 0){ status <- 0 if(verbose){ extra.var <- paste(extra.var, collapse = ", ") warning(glue::glue("Your metadata table contains the following columns that are not CHI compliant: {extra.var}. - Please drop these variables from chi_meta before attempting to QA the data again.")) + Please drop these variables from CHImetadata before attempting to QA the data again.")) } } if(verbose){ message("Checking that variables are of the proper class") } + if(verbose){ message("Validating CHI estimates: ") } - rads::tsql_validate_field_types(ph.data = chi_est, field_types = unlist(chi_get_yaml()$vars)) # check CHI estimate table + rads::tsql_validate_field_types(ph.data = CHIestimates, field_types = unlist(chi_get_yaml()$vars)) # check CHI estimate table + if(verbose){ message(paste("", "Validating CHI metadata: ", sep = "\n")) } - rads::tsql_validate_field_types(ph.data = chi_meta, field_types = unlist(chi_get_yaml()$metadata)) # check CHI metadata table + rads::tsql_validate_field_types(ph.data = CHImetadata, field_types = unlist(chi_get_yaml()$metadata)) # check CHI metadata table if(verbose) message(paste("", "", sep = "\n")) if(verbose){ message("Checking that critical columns are not missing any values") } for(mycol in c("indicator_key", "year", "data_source", "tab", "cat1", "cat1_group", "run_date")){ - if(nrow(chi_est[is.na(get(mycol))]) > 0){ + if(nrow(CHIestimates[is.na(get(mycol))]) > 0){ status <- 0 warning(paste0("'", mycol, "' is missing in at least one row but is a critical identifier column in CHI data. \n", "Fix the error and run this QA script again.")) } } - for(mycol in c("result", "lower_bound", "upper_bound", "se", "rse", "numerator", "denominator", "chi", "source_date", "run_date", "comparison_with_kc")){ - if(nrow(chi_est[is.na(get(mycol)) & is.na(suppression)]) > 0){ + for(mycol in c("result", "lower_bound", "upper_bound", "se", "rse", "numerator", "denominator", "chi", "source_date", "run_date")){ + if(nrow(CHIestimates[is.na(get(mycol)) & is.na(suppression)]) > 0){ status <- 0 warning(paste0("\U00026A0 Warning: '", mycol, "' is missing in at least one row of the CHI data.")) } } for(mycol in setdiff(names(unlist(chi_get_yaml()$metadata)), c("latest_year_kc_pop", "latest_year_count"))){ - if(nrow(chi_meta[is.na(get(mycol))]) > 0){ + if(nrow(CHImetadata[is.na(get(mycol))]) > 0){ status <- 0 warning(paste0("'", mycol, "' is missing in at least one row but is a critical identifier column in CHI metadata. \n", "Fix the error and run this QA script again.")) } } for(mycol in c("latest_year_kc_pop", "latest_year_count")){ - if(nrow(chi_meta[is.na(get(mycol))]) > 0){ + if(nrow(CHImetadata[is.na(get(mycol))]) > 0){ status <- 0 warning(paste0("\U00026A0 Warning: '", mycol, "' is missing in at least one row of the metadata.")) } @@ -178,21 +196,21 @@ chi_qa_tro <- function(chi_est, chi_meta, acs = F, ignore_trends = T, verbose = } ## Set the columns in standard order ---- - setcolorder(chi_est, chi_get_cols()) - setcolorder(chi_meta, names(unlist(chi_get_yaml()$metadata))) + setcolorder(CHIestimates, chi_get_cols()) + setcolorder(CHImetadata, names(unlist(chi_get_yaml()$metadata))) - #Basic logic checks for estimates + ## Basic logic checks for estimates ---- if(verbose){ message("Checking for infinite values, which cannot be pushed to SQL") } for(var in c("result", "lower_bound", "upper_bound", "se", "rse", "numerator", "denominator")){ - if(nrow(chi_est[is.infinite(get(var))]) > 0 ){ + if(nrow(CHIestimates[is.infinite(get(var))]) > 0 ){ status <- 0 if(verbose){ warning(glue::glue("There is at least one row where is.infinite({var}) == T. Please fix this problem before rerunning chi_qa_tro() (e.g., by setting it equal to NA) - You can view the problematic data by typing something like: View(chi_est[is.infinite({var}), ])")) + You can view the problematic data by typing something like: View(CHIestimates[is.infinite({var}), ])")) } } } @@ -200,8 +218,8 @@ chi_qa_tro <- function(chi_est, chi_meta, acs = F, ignore_trends = T, verbose = if(verbose){ message("Checking that proportions are between zero and one") } - chi_est <- merge(chi_est, chi_meta[, list(indicator_key, result_type)], by = "indicator_key", all.x = TRUE, all.y = FALSE) # merge on result_type - if(nrow(chi_est[result_type=="proportion" & !result %between% c(0, 1)]) > 0){ + CHIestimates <- merge(CHIestimates, CHImetadata[, list(indicator_key, result_type)], by = "indicator_key", all.x = TRUE, all.y = FALSE) # merge on result_type + if(nrow(CHIestimates[result_type=="proportion" & !result %between% c(0, 1)]) > 0){ status <- 0 if(verbose){ warning("There is at least one row where where the metadata states that the indicator is a proportion but the result is outside [0,1]. @@ -212,47 +230,47 @@ chi_qa_tro <- function(chi_est, chi_meta, acs = F, ignore_trends = T, verbose = if(verbose){ message("Checking if upper_bound is greater than lower_bound") } - if(nrow(chi_est[upper_bound < lower_bound, ])){ + if(nrow(CHIestimates[upper_bound < lower_bound, ])){ status <- 0 if(verbose){ warning("There is at least one row where the upper_bound is less than the lower_bound. Please fix this error prior to re-running the chi_qa_tro() function. - You can view the problematic data by typing something like: View(chi_est[upper_bound < lower_bound, ])") + You can view the problematic data by typing something like: View(CHIestimates[upper_bound < lower_bound, ])") } } if(verbose){ message("Checking that result is less than or equal to the upper bound") } - if(nrow(chi_est[!(result <= upper_bound)])){ + if(nrow(CHIestimates[!(result <= upper_bound)])){ status <- 0 if(verbose){ warning("There is at least one row where the result is not less than or equal to the upper_bound. Please fix this error prior to rerunning the chi_qa_tro() function. - You can view the problematic data by typing something like: View(chi_est[!(result <= upper_bound)])") + You can view the problematic data by typing something like: View(CHIestimates[!(result <= upper_bound)])") } } if(verbose){ message("Checking that result is greater than or equal to the lower_bound") } - if(nrow(chi_est[!(result >= lower_bound)])){ + if(nrow(CHIestimates[!(result >= lower_bound)])){ status <- 0 if(verbose){ warning("There is at least one row where the result is not greater than or equal to the lower_bound. Please fix this error prior to rerunning the chi_qa_tro() function. - You can view the problematic data by typing something like: View(chi_est[!(result >= lower_bound)])") + You can view the problematic data by typing something like: View(CHIestimates[!(result >= lower_bound)])") } } if(verbose){ message("Checking that lower_bound is not less than zero") } - if(nrow(chi_est[lower_bound < 0])){ + if(nrow(CHIestimates[lower_bound < 0])){ status <- 0 if(verbose){ warning("There is at least one row where the lower_bound is less than zero (i.e., it is negative). Please fix this error prior to rerunning the chi_qa_tro() function. - You can view the problematic data by typing something like: View(chi_est[lower_bound < 0])") + You can view the problematic data by typing something like: View(CHIestimates[lower_bound < 0])") } } @@ -260,19 +278,19 @@ chi_qa_tro <- function(chi_est, chi_meta, acs = F, ignore_trends = T, verbose = message("Checking that RSE is between 0 and 100") } # confirmed with Abby 2/7/2020 that want RSE * 100 - if(nrow(chi_est[!rse %between% c(0, 100)]) > 0 ){ + if(nrow(CHIestimates[!rse %between% c(0, 100)]) > 0 ){ status <- 0 if(verbose){ warning(paste("There is at least one row where the RSE (relative standard error) is outside the range of (0, 100].", "This is not necessarily an error, but you should examine the data to make sure it makes sense.", - "You can view the data in question by typing something like: View(chi_est[!rse %between% c(0, 100)])", sep = "\n")) + "You can view the data in question by typing something like: View(CHIestimates[!rse %between% c(0, 100)])", sep = "\n")) } } if(verbose){ message("Checking that RSE is on scale of 0-100 (i.e., the proportion should have been multiplied by 100)") } - if(nrow(chi_est[!is.na(rse)]) == nrow(chi_est[rse <=1])){ + if(nrow(CHIestimates[!is.na(rse)]) == nrow(CHIestimates[rse <=1])){ status <- 0 if(verbose){ warning("All RSEs are within the range (0, 1]. CHI Tableau Ready standards necessitate that these proportions @@ -284,12 +302,12 @@ chi_qa_tro <- function(chi_est, chi_meta, acs = F, ignore_trends = T, verbose = if(verbose){ message("Checking that caution flag exists if RSE >= 30%") } - if(nrow(chi_est[rse>=30 & (caution != "!" | is.na(caution)) ]) > 0 ){ + if(nrow(CHIestimates[rse>=30 & (caution != "!" | is.na(caution)) ]) > 0 ){ status <- 0 if(verbose){ warning("There is at least one row where a caution flag ('!') is not used and rse >= 30% or is.na(rse) == T. Please fix this error prior to rerunning the chi_qa_tro() function. - You can view the problematic data by typing something like: View(chi_est[(rse>=30 | is.na(rse)) & (caution != '!' | is.na(caution))])") + You can view the problematic data by typing something like: View(CHIestimates[(rse>=30 | is.na(rse)) & (caution != '!' | is.na(caution))])") } } @@ -299,7 +317,7 @@ chi_qa_tro <- function(chi_est, chi_meta, acs = F, ignore_trends = T, verbose = if(verbose){ message("Checking that result is rounded to three digits") } - if(sum(chi_est$result != round2(chi_est$result, 3), na.rm = T) != 0) { + if(sum(CHIestimates$result != rads::round2(CHIestimates$result, 3), na.rm = T) != 0) { status <- 0 if(verbose){ warning("The 'result' column does not appear to be rounded to 3 digits, as specified in the CHI standards") @@ -308,7 +326,7 @@ chi_qa_tro <- function(chi_est, chi_meta, acs = F, ignore_trends = T, verbose = if(verbose){ message("Checking that lower_bound is rounded to three digits") } - if(sum(chi_est$lower_bound != round2(chi_est$lower_bound, 3), na.rm = T) != 0) { + if(sum(CHIestimates$lower_bound != rads::round2(CHIestimates$lower_bound, 3), na.rm = T) != 0) { status <- 0 if(verbose){ warning("The 'lower_bound' column does not appear to be rounded to 3 digits, as specified in the CHI standards") @@ -317,7 +335,7 @@ chi_qa_tro <- function(chi_est, chi_meta, acs = F, ignore_trends = T, verbose = if(verbose){ message("checking that upper_bound is rounded to three digits") } - if(sum(chi_est$upper_bound != round2(chi_est$upper_bound, 3), na.rm = T) != 0) { + if(sum(CHIestimates$upper_bound != rads::round2(CHIestimates$upper_bound, 3), na.rm = T) != 0) { status <- 0 if(verbose){ warning("The 'upper_bound' column does not appear to be rounded to 3 digits, as specified in the CHI standards") @@ -326,7 +344,7 @@ chi_qa_tro <- function(chi_est, chi_meta, acs = F, ignore_trends = T, verbose = if(verbose){ message("Checking that rse is rounded to three digits") } - if(sum(chi_est$rse != round2(chi_est$rse, 3), na.rm = T) != 0) { + if(sum(CHIestimates$rse != rads::round2(CHIestimates$rse, 3), na.rm = T) != 0) { status <- 0 if(verbose){ warning("The 'rse' column does not appear to be rounded to 3 digits, as specified in the CHI standards") @@ -336,7 +354,7 @@ chi_qa_tro <- function(chi_est, chi_meta, acs = F, ignore_trends = T, verbose = if(verbose){ message("Checking that se is rounded to four digits") } - if(sum(chi_est$se != round2(chi_est$se, 4), na.rm = T) != 0) { + if(sum(CHIestimates$se != rads::round2(CHIestimates$se, 4), na.rm = T) != 0) { status <- 0 if(verbose){ warning("The 'se' column does not appear to be rounded to 3 digits, as specified in the CHI standards") @@ -347,7 +365,7 @@ chi_qa_tro <- function(chi_est, chi_meta, acs = F, ignore_trends = T, verbose = message("Check that all observations have indicators, categories, tab, and year") } for(var in c("indicator_key", "tab", "year", "cat1", "cat1_group", "source_date", "run_date")){ - if(nrow(chi_est[is.na(get(var))]) > 0 ){ + if(nrow(CHIestimates[is.na(get(var))]) > 0 ){ status <- 0 if(verbose){ warning(glue::glue("There is at least one row where '{var}' is missing. @@ -356,7 +374,7 @@ chi_qa_tro <- function(chi_est, chi_meta, acs = F, ignore_trends = T, verbose = } } if(acs==F){ - if(nrow(chi_est[is.na("cat1_varname")]) > 0 ){ + if(nrow(CHIestimates[is.na(cat1_varname)]) > 0 ){ status <- 0 if(verbose){ warning(glue::glue("There is at least one row where 'cat1_varname' is missing. @@ -365,7 +383,7 @@ chi_qa_tro <- function(chi_est, chi_meta, acs = F, ignore_trends = T, verbose = } } for(var in c("cat2", "cat2_group")){ - if(nrow(chi_est[tab=="crosstabs" & is.na(get(var))]) > 0 ){ + if(nrow(CHIestimates[tab=="crosstabs" & is.na(get(var))]) > 0 ){ status <- 0 if(verbose){ warning(glue::glue("There is at least one row where tab=='crosstabs' & where '{var}' is missing. @@ -374,7 +392,7 @@ chi_qa_tro <- function(chi_est, chi_meta, acs = F, ignore_trends = T, verbose = } } if(acs==F){ - if(nrow(chi_est[tab=="crosstabs" & is.na("cat2_varname")]) > 0 ){ + if(nrow(CHIestimates[tab=="crosstabs" & is.na(cat2_varname)]) > 0 ){ status <- 0 if(verbose){ warning(glue::glue("There is at least one row where 'cat2_varname' is missing. @@ -386,8 +404,8 @@ chi_qa_tro <- function(chi_est, chi_meta, acs = F, ignore_trends = T, verbose = if(verbose){ message("Checking that results are present if row is not suppressed") } - for(var in c("result", "lower_bound", "upper_bound", "se", "rse", "numerator", "denominator", "comparison_with_kc")){ - if(nrow(chi_est[suppression != "^" & is.na(get(var))]) > 0 ){ + for(var in c("result", "lower_bound", "upper_bound", "se", "rse", "numerator", "denominator")){ + if(nrow(CHIestimates[suppression != "^" & is.na(get(var))]) > 0 ){ status <- 0 if(verbose){ warning(glue::glue("There is at least one row that is not suppressed & where '{var}' is missing. @@ -400,7 +418,7 @@ chi_qa_tro <- function(chi_est, chi_meta, acs = F, ignore_trends = T, verbose = if(verbose){ message("Checking that time_trends are provided when tab=='trends'") } - if(nrow(chi_est[tab == "trends" & is.na(time_trends)]) > 0 ){ + if(nrow(CHIestimates[tab == "trends" & is.na(time_trends)]) > 0 ){ status <- 0 if(verbose) { warning(glue::glue("There is at least one row where tab=='trends' & where 'time_trends' is missing. @@ -409,17 +427,100 @@ chi_qa_tro <- function(chi_est, chi_meta, acs = F, ignore_trends = T, verbose = } } - ## Compare with previous year's results (FOR FUTURE???)---- - # in function arguments, have user submit most recent and comparison year(s). Submit as character b/c can be 2013-2017, not just 2017 - # if both are null, skip the comparison - # it not submitted, merge newer data on old data and identify rows with > 3% absolute difference - # save this dataset for manual review by the user - ## Compare with a CSV (FOR FUTURE) ---- - # sometimes want to compare with external data source --- - # must identify year of interest as above - # instead of submitting a reference year, the user specifies a reference file - # reference file will attempt to match on all columns that have the same name, except those with results (i.e., results, lower_bound, se, etc.) - # actual comparison code should be the same as when comparing to a previous year, so write a small funcion to do this + ## Ensure cat1/cat2 values meet CHI standards ---- + if(verbose) { + message("Checking that category combinations align with CHI standards") + } + + # Get reference data + ref_combos <- rads.data::misc_chi_byvars[, list(cat, varname, group)] + + # For ACS data, we only check cat and group combinations + if(acs) { + # Check cat1 combinations + chi_cat1_combos <- unique(CHIestimates[!is.na(cat1), list( + cat = cat1, + group = cat1_group + )]) + + cat1_invalid <- chi_cat1_combos[!unique(ref_combos[, list(cat, group)]), on = list(cat, group)] + + if(nrow(cat1_invalid) > 0) { + status <- 0 + if(verbose) { + warning("\U00026A0 \U0001F4E3 WARNING: Found non-standard cat1 combinations:\n", + paste0(" - cat1='", cat1_invalid$cat, "', cat1_group='", + cat1_invalid$group, "'", collapse = "\n"), + "\nThese combinations are not found in rads.data::misc_chi_byvars reference table.") + } + } + + # Check cat2 combinations + chi_cat2_combos <- unique(CHIestimates[!is.na(cat2), list( + cat = cat2, + group = cat2_group + )]) + + cat2_invalid <- chi_cat2_combos[!unique(ref_combos[, list(cat, group)]), on = list(cat, group)] + + if(nrow(cat2_invalid) > 0) { + status <- 0 + if(verbose) { + warning("\U00026A0 \U0001F4E3 WARNING: Found non-standard cat2 combinations:\n", + paste0(" - cat2='", cat2_invalid$cat, "', cat2_group='", + cat2_invalid$group, "'", collapse = "\n"), + "\nThese combinations are not found in rads.data::misc_chi_byvars reference table.") + } + } + } else { + # For non-ACS data, check complete combinations including varname + # Check cat1 combinations + chi_cat1_combos <- unique(CHIestimates[!is.na(cat1), list( + cat = cat1, + varname = cat1_varname, + group = cat1_group + )]) + + cat1_invalid <- chi_cat1_combos[!ref_combos, on = list(cat, varname, group)] + + if(nrow(cat1_invalid) > 0) { + status <- 0 + if(verbose) { + warning("\U00026A0 \U0001F4E3 WARNING: Found non-standard cat1 combinations:\n", + paste0(" - cat1='", cat1_invalid$cat, "', cat1_varname='", + cat1_invalid$varname, "', cat1_group='", cat1_invalid$group, "'", + collapse = "\n"), + "\nThese combinations are not found in rads.data::misc_chi_byvars reference table.") + } + } + + # Check cat2 combinations + chi_cat2_combos <- unique(CHIestimates[!is.na(cat2), list( + cat = cat2, + varname = cat2_varname, + group = cat2_group + )]) + + cat2_invalid <- chi_cat2_combos[!ref_combos, on = list(cat, varname, group)] + + if(nrow(cat2_invalid) > 0) { + status <- 0 + if(verbose) { + warning("\U00026A0 \U0001F4E3 WARNING: Found non-standard cat2 combinations:\n", + paste0(" - cat2='", cat2_invalid$cat, "', cat2_varname='", + cat2_invalid$varname, "', cat2_group='", cat2_invalid$group, "'", + collapse = "\n"), + "\nThese combinations are not found in rads.data::misc_chi_byvars reference table.") + } + } + } + + ## Ensure there are no more than 10 years of trend data ---- + trend.years <- sort(unique(CHIestimates[tab == 'trends']$year)) + if( length(trend.years) > 10){ + warning('\U00026A0 There are more than 10 unique years of trend data:\n', + paste(trend.years, collapse = ', ')) + } ## Print success statement!!!!!!!! #### if(verbose) { @@ -431,6 +532,7 @@ chi_qa_tro <- function(chi_est, chi_meta, acs = F, ignore_trends = T, verbose = } } + ## Return ---- return(status) } diff --git a/man/chi_qa_tro.Rd b/man/chi_qa_tro.Rd index 517c0a0..1b349e0 100644 --- a/man/chi_qa_tro.Rd +++ b/man/chi_qa_tro.Rd @@ -4,12 +4,18 @@ \alias{chi_qa_tro} \title{CHI - QA: Tableau Ready output} \usage{ -chi_qa_tro(chi_est, chi_meta, acs = F, ignore_trends = T, verbose = F) +chi_qa_tro( + CHIestimates, + CHImetadata, + acs = FALSE, + ignore_trends = TRUE, + verbose = FALSE +) } \arguments{ -\item{chi_est}{Name of a data.table or data.frame containing the prepared data to be pushed to SQL} +\item{CHIestimates}{Name of a data.table or data.frame containing the prepared data to be pushed to SQL} -\item{chi_meta}{Name of a data.table or data.frame containing the metadata to be pushed to SQL} +\item{CHImetadata}{Name of a data.table or data.frame containing the metadata to be pushed to SQL} \item{acs}{default FALSE, Indicates whether it is ACS data (which does not have / need varnames)} @@ -21,23 +27,38 @@ chi_qa_tro(chi_est, chi_meta, acs = F, ignore_trends = T, verbose = F) 1 or 0 for pass or fail status } \description{ -Checks data for compliance with CHI TRO standard +Checks estimates created by \code{\link{chi_calc}} and metadata created by +\code{\link{chi_generate_metadata}} for compliance with CHI TRO standard. } \details{ This function tests if the structure of the data matches CHI Tableau Ready Output specifications. If set to verbose mode (default) will report diagnostic information in The user interface as warnings (if failed) and messages (for progress and pass). In any Case will return 1 or 0 for pass or fail respectively. -The CHI Tableau Ready Output (TRO) standards can be reviewed [here](https://kc1.sharepoint.com/:x:/r/teams/DPH-CommunityHealthIndicators/_layouts/15/Doc.aspx?sourcedoc=%7BBED2F507-B834-4D28-8658-C5724F64C001%7D&file=CHI-Standards-TableauReady%20Output.xlsx&action=default&mobileredirect=true&CID=975A2706-130E-46AB-AEB2-DF919FDEC185&wdLOR=cDF77DB7A-3774-430F-92C7-EFAB34CEB35F) +The CHI Tableau Ready Output (TRO) standards can be reviewed here: +\href{https://kc1.sharepoint.com/teams/DPH-CommunityHealthIndicators/CHIVizes/CHI-Standards-TableauReady\%20Output.xlsx}{ +SharePoint > Community Health Indicators > CHI_vizes > CHI-Standards-TableauReady Output.xlsx} } \examples{ \dontrun{ -# create sample data +# QA check estimates and metadata +qa_status <- chi_qa_tro( + CHIestimates = my_estimates, + CHImetadata = my_metadata, + acs = FALSE, + ignore_trends = TRUE, + verbose = TRUE +) +} -# run function } +\seealso{ +\code{\link{chi_calc}} for generating estimates + +\code{\link{chi_generate_metadata}} for creating metadata +\code{\link{chi_compare_estimates}} for comparing estimates between versions } \keyword{CHI,} \keyword{Production} From 1b4c6949b52f14fdf32b91d04ace4c3ccce04a77 Mon Sep 17 00:00:00 2001 From: Danny Colombara Date: Tue, 18 Feb 2025 17:57:23 -0800 Subject: [PATCH 18/35] Added more global sto globals.R - as always, so devtools::check() plays nicely with data.table syntax --- R/globals.R | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/R/globals.R b/R/globals.R index 0ab6023..e9614f6 100644 --- a/R/globals.R +++ b/R/globals.R @@ -1,5 +1,6 @@ # Declare global variables for devtools::check() utils::globalVariables(c( + "absolute.diff", "age", "AgeMax", "AgeMin", @@ -19,7 +20,11 @@ utils::globalVariables(c( "comparison_with_kc_sig", "count", "crosstabs", + "crosstabs.x", + "crosstabs.y", "data_source", + "demgroups.x", + "demgroups.y", "denominator", "end", "gender", @@ -42,7 +47,9 @@ utils::globalVariables(c( "latest_year_resultx", "latest_yearx", "lower_bound", + "notable", "numerator", + "pattern", "pop", "pov200grp", "race", @@ -52,6 +59,11 @@ utils::globalVariables(c( "reference", "region", "region_name", + "relative.diff", + "result", + "result_type", + "result.x", + "result.y", "round2", "rse", "run_date", @@ -66,9 +78,16 @@ utils::globalVariables(c( "tempend", "tempstart", "time_trends", + "trends.x", + "trends.y", + "upper_bound", "valid_years", "varname", "vebrose", "year.span", - "year" + "year", + "_kingcounty.x", + "_kingcounty.y", + "_wastate.x", + "_wastate.y" )) From dc645174f7cf3acc8837abc58854ffa96d40c098 Mon Sep 17 00:00:00 2001 From: Danny Colombara Date: Tue, 18 Feb 2025 18:01:30 -0800 Subject: [PATCH 19/35] BUG FIX chi_sql_update + improvements - BUG FIX in error check for server argument values - BUG FIX in error check for CHIestimates and CHImetadata: both checked if exists, but always existed bc NULL. Now checks if NULL - Eliminated asking if want to create a table that doesn't exist. Replaced with a Warning because the need for a reply broke the esting and would also break if used in a long data processing piepline - Improved headers --- R/chi_sql_update.R | 69 ++++++++++++++++++++++++++++--------------- man/chi_sql_update.Rd | 43 +++++++++++++++++++++++---- 2 files changed, 83 insertions(+), 29 deletions(-) diff --git a/R/chi_sql_update.R b/R/chi_sql_update.R index 9611868..fd949f8 100644 --- a/R/chi_sql_update.R +++ b/R/chi_sql_update.R @@ -1,14 +1,43 @@ #' CHI SQL Update #' #' @description -#' function to update (or replace) results and metadata into APDE PHExtractStore servers. For details on server access, and to configure your local settings please review documentation [here](https://kc1.sharepoint.com/:w:/r/teams/DPH-KCCross-SectorData/_layouts/15/Doc.aspx?sourcedoc=%7B34352D66-9CD6-45C9-AD19-B8FE88A4C7C6%7D&file=SQL%20Server%20Setup%20APDE.docx&action=default&mobileredirect=true) +#' Function to update (or replace) results and metadata in APDE PHExtractStore +#' servers. #' +#' @details +#' For details on server access, and to configure your local settings, +#' please review documentation: +#' \href{https://kc1.sharepoint.com/teams/DPH-KCCross-SectorData/Shared\%20Documents/References/SQL/SQL Server Setup APDE.docx}{ +#' SharePoint > DPH-KCCross-SectorData > Documents > References > SQL > SQL Server Setup APDE.docx}. #' -#' @param CHIestimates DT or DF containing CHI analytic results -#' @param CHImetadata DT or DF containing CHI metadata +#' @param CHIestimates data.frame/data.table containing CHI analytic results +#' @param CHImetadata data.frame/data.table containing CHI metadata #' @param table_name name of SQL Server table to update -#' @param server type of server ('development' for KCITSQLUATHIP40 and 'production' for KCITSQLPRPHIP40) -#' @param replace_table If T, drop existing table and insert data, if F update matching rows and insert new data +#' @param server type of server (\code{'development'} for KCITSQLUATHIP40 and +#' \code{'production'} for KCITSQLPRPHIP40) +#' +#' Default \code{server = 'development'} +#' @param replace_table If TRUE, drop existing table and insert data, if FALSE +#' update matching rows and insert new data +#' +#' Default \code{replace_table = FALSE} +#' +#' @examples +#' \dontrun{ +#' # Update development database +#' chi_sql_update( +#' CHIestimates = final_estimates, +#' CHImetadata = final_metadata, +#' table_name = "birth", +#' server = "development", +#' replace_table = FALSE +#' ) +#' } +#' +#' @seealso +#' \code{\link{chi_qa_tro}} for validating data before upload +#' +#' \code{\link{chi_compare_estimates}} for comparing with existing data #' #' @return status message indicating success and location, or failure, of upload #' @@ -27,21 +56,25 @@ chi_sql_update <- function(CHIestimates = NULL, CHImetadata = NULL, table_name = NULL, server = 'development', # options include c('development', 'production') - replace_table = F # default is to update select rows rather than replace the entire table + replace_table = FALSE # default is to update select rows rather than replace the entire table ){ # load CHI yaml config file ---- # check CHIestimates argument---- - if(!exists('CHIestimates')){stop("\n\U0001f47f The results table to push to SQL (CHIestimates) is missing ")} + if(is.null(CHIestimates)){stop("\n\U0001f47f The results table to push to SQL (CHIestimates) is missing ")} if( inherits(CHIestimates, "data.frame") == FALSE){stop("\n\U0001f47f CHIestimates must be a data.frame or a data.table.")} if( inherits(CHIestimates, "data.table") == FALSE){setDT(CHIestimates)} rads::tsql_validate_field_types(ph.data = CHIestimates, field_types = unlist(chi_get_yaml()$vars)) # check CHImetadata argument---- - if(!exists('CHImetadata')){stop("\n\U0001f47f The metadata table to push to SQL (CHImetadata) is missing ")} + if(is.null(CHImetadata)){stop("\n\U0001f47f The metadata table to push to SQL (CHImetadata) is missing ")} if( inherits(CHImetadata, "data.frame") == FALSE){stop("\n\U0001f47f CHImetadata must be a data.frame or a data.table.")} if( inherits(CHImetadata, "data.table") == FALSE){setDT(CHImetadata)} rads::tsql_validate_field_types(ph.data = CHImetadata, field_types = unlist(chi_get_yaml()$metadata)) + # checkt table_name ---- + if(is.null(table_name)){stop("\n\U0001f47f The table_name argument is missing ")} + if(length(table_name) != 1 | !is.character(table_name)){stop("\n\U0001f47f table_name must be a character vector of length 1")} + # ensure indicator_key is consistent across estimates and metadata if(!identical(sort(as.character(unique(CHIestimates$indicator_key))), sort(as.character(CHImetadata$indicator_key)))){ stop("\n\U0001f47f The indicator_key values in CHIestimates and CHImetadata are not identical ... but they should be!") @@ -49,7 +82,7 @@ chi_sql_update <- function(CHIestimates = NULL, # check server argument---- server = tolower(as.character(server)) - if(!server %in% c('wip', 'dev', 'prod')){stop("\n\U0001f47f The server argument is limited to: 'development', 'production'")} + if(!server %in% c('development', 'production')){stop("\n\U0001f47f The server argument is limited to: 'development', 'production'")} if(length(server) != 1){stop("\n\U0001f47f The `server` argument must be of length 1")} # check replace argument---- @@ -75,25 +108,15 @@ chi_sql_update <- function(CHIestimates = NULL, } # check if *_results and *_metadata tables already exist in the appropriate schema---- - if(!DBI::dbExistsTable(conn = CHI_db_cxn, glue::glue_sql("[PHExtractStore].[APDE{DBI::SQL(schema_suffix)}].[{DBI::SQL(table_name)}_results]", .con = CHI_db_cxn))){ - tempquestion <- paste0("The table `[PHExtractStore].[APDE", schema_suffix, "].[", table_name, "_results]` does NOT currently exist. Are you sure you want to continue? (y/n) ") - answer <- readline(tempquestion) - if (answer == "y") { + if (!DBI::dbExistsTable(conn = CHI_db_cxn, glue::glue_sql("[PHExtractStore].[APDE{DBI::SQL(schema_suffix)}].[{DBI::SQL(table_name)}_results]", .con = CHI_db_cxn))) { + warning(paste0("\U00026A0\U00026A0\nThe table `[PHExtractStore].[APDE", schema_suffix, "].[", table_name, "_results]` does NOT currently exist. Continuing without this table.")) message("Continuing...") - } else { - stop(paste0("\n\U0001f47f The table `[PHExtractStore].[APDE", schema_suffix, "].[", table_name, "_results]` does NOT currently exist and you gave instructions not to continue.")) } - } - if(!DBI::dbExistsTable(conn = CHI_db_cxn, glue::glue_sql("[PHExtractStore].[APDE{DBI::SQL(schema_suffix)}].[{DBI::SQL(table_name)}_metadata]", .con = CHI_db_cxn))){ - tempquestion <- paste0("The table `[PHExtractStore].[APDE", schema_suffix, "].[", table_name, "_metadata]` does NOT currently exist. Are you sure you want to continue? (y/n) ") - answer <- readline(tempquestion) - if (answer == "y") { + if (!DBI::dbExistsTable(conn = CHI_db_cxn, glue::glue_sql("[PHExtractStore].[APDE{DBI::SQL(schema_suffix)}].[{DBI::SQL(table_name)}_metadata]", .con = CHI_db_cxn))) { + warning(paste0("\U00026A0\U00026A0\nThe table `[PHExtractStore].[APDE", schema_suffix, "].[", table_name, "_metadata]` does NOT currently exist. Continuing without this table.")) message("Continuing...") - } else { - stop(paste0("\n\U0001f47f The table `[PHExtractStore].[APDE", schema_suffix, "].[", table_name, "_metadata]` does NOT currently exist and you gave instructions not to continue.")) } - } # if replace_table = F, delete existing data that will be replaced in SQL---- if(isFALSE(replace_table)){ diff --git a/man/chi_sql_update.Rd b/man/chi_sql_update.Rd index 804145b..7ec1e5e 100644 --- a/man/chi_sql_update.Rd +++ b/man/chi_sql_update.Rd @@ -9,25 +9,56 @@ chi_sql_update( CHImetadata = NULL, table_name = NULL, server = "development", - replace_table = F + replace_table = FALSE ) } \arguments{ -\item{CHIestimates}{DT or DF containing CHI analytic results} +\item{CHIestimates}{data.frame/data.table containing CHI analytic results} -\item{CHImetadata}{DT or DF containing CHI metadata} +\item{CHImetadata}{data.frame/data.table containing CHI metadata} \item{table_name}{name of SQL Server table to update} -\item{server}{type of server ('development' for KCITSQLUATHIP40 and 'production' for KCITSQLPRPHIP40)} +\item{server}{type of server (\code{'development'} for KCITSQLUATHIP40 and +\code{'production'} for KCITSQLPRPHIP40) -\item{replace_table}{If T, drop existing table and insert data, if F update matching rows and insert new data} +Default \code{server = 'development'}} + +\item{replace_table}{If TRUE, drop existing table and insert data, if FALSE +update matching rows and insert new data + +Default \code{replace_table = FALSE}} } \value{ status message indicating success and location, or failure, of upload } \description{ -function to update (or replace) results and metadata into APDE PHExtractStore servers. For details on server access, and to configure your local settings please review documentation [here](https://kc1.sharepoint.com/:w:/r/teams/DPH-KCCross-SectorData/_layouts/15/Doc.aspx?sourcedoc=%7B34352D66-9CD6-45C9-AD19-B8FE88A4C7C6%7D&file=SQL%20Server%20Setup%20APDE.docx&action=default&mobileredirect=true) +Function to update (or replace) results and metadata in APDE PHExtractStore +servers. +} +\details{ +For details on server access, and to configure your local settings, +please review documentation: +\href{https://kc1.sharepoint.com/teams/DPH-KCCross-SectorData/Shared\%20Documents/References/SQL/SQL Server Setup APDE.docx}{ +SharePoint > DPH-KCCross-SectorData > Documents > References > SQL > SQL Server Setup APDE.docx}. +} +\examples{ +\dontrun{ +# Update development database +chi_sql_update( + CHIestimates = final_estimates, + CHImetadata = final_metadata, + table_name = "birth", + server = "development", + replace_table = FALSE +) +} + +} +\seealso{ +\code{\link{chi_qa_tro}} for validating data before upload + +\code{\link{chi_compare_estimates}} for comparing with existing data } \keyword{CHI,} \keyword{Production} From 9d6f049c13910ea7cd7caf1c5a9e249bebdfa9b9 Mon Sep 17 00:00:00 2001 From: Danny Colombara Date: Tue, 18 Feb 2025 18:03:27 -0800 Subject: [PATCH 20/35] Improve chi_generate_metadata - improved header - added input validation - Limit the available years in metadata to 10 years, since trends are only for 10 years. --- R/chi_generate_metadata.R | 30 +++++++++++++++++++++++++++++- man/chi_generate_metadata.Rd | 7 ++++++- 2 files changed, 35 insertions(+), 2 deletions(-) diff --git a/R/chi_generate_metadata.R b/R/chi_generate_metadata.R index 5096c0b..a5e6801 100644 --- a/R/chi_generate_metadata.R +++ b/R/chi_generate_metadata.R @@ -3,16 +3,34 @@ #' @description #' function to generate metadata table combining existing metadata and latest estimates. #' -#' @param meta.old Previous metadata table +#' @param meta.old Name of a data.table with the prior year's metadata #' @param est.current current year's tableau ready output with completed estimates #' #' @return table of metadata +#' +#' @seealso +#' \code{\link{chi_calc}} for generating estimates +#' +#' \code{\link{chi_qa_tro}} for validating metadata +#' #' @importFrom data.table setDT copy := #' @importFrom rads substrRight +#' @importFrom utils tail #' @export #' chi_generate_metadata <- function(meta.old = NULL, est.current = NULL){ + # Input validation ---- + if (is.null(meta.old)) stop("\n\U1F6D1 meta.old must be provided") + if (!is.data.frame(meta.old)) stop("\n\U1F6D1 meta.old must be a data.frame or data.table") + + if (is.null(est.current)) stop("\n\U1F6D1 est.current must be provided") + if (!is.data.frame(est.current)) stop("\n\U1F6D1 est.current must be a data.frame or data.table") + + # Convert to data.table if needed ---- + if (!is.data.table(meta.old)) setDT(meta.old) + if (!is.data.table(est.current)) setDT(est.current) + # get new metadata ---- meta.new <- unique(est.current[tab == "metadata", list(indicator_key, @@ -38,6 +56,16 @@ chi_generate_metadata <- function(meta.old = NULL, meta.new[as.integer(latest_year) > suppressWarnings(as.integer(rads::substrRight(valid_years, 1, 4))), valid_years := suppressWarnings(paste(as.integer(substr(valid_years, 1, 4)):as.integer(latest_year), collapse = " "))] + # Since trends only have 10 years of data, valid_years should be limited to 10 years max ---- + meta.new[, valid_years := { + allyears <- sort(as.integer(strsplit(valid_years, " ")[[1]])) # convert valid_years to a vector of numbers + if(length(allyears) > 10) { + paste(tail(sort(allyears), 10), collapse = " ") + } else { + paste(allyears, collapse = " ") + } + }, by = indicator_key] + # Ensure there are no missing important metadata cells ---- missing.per.col <- sapply(meta.new, function(x) sum(is.na(x))) if(sum(missing.per.col) > 0){ diff --git a/man/chi_generate_metadata.Rd b/man/chi_generate_metadata.Rd index cdd8a3c..65c8a50 100644 --- a/man/chi_generate_metadata.Rd +++ b/man/chi_generate_metadata.Rd @@ -7,7 +7,7 @@ chi_generate_metadata(meta.old = NULL, est.current = NULL) } \arguments{ -\item{meta.old}{Previous metadata table} +\item{meta.old}{Name of a data.table with the prior year's metadata} \item{est.current}{current year's tableau ready output with completed estimates} } @@ -17,3 +17,8 @@ table of metadata \description{ function to generate metadata table combining existing metadata and latest estimates. } +\seealso{ +\code{\link{chi_calc}} for generating estimates + +\code{\link{chi_qa_tro}} for validating metadata +} From 2f832fdfcbe01906a4826685dcd8d6c4b2726fc6 Mon Sep 17 00:00:00 2001 From: Danny Colombara Date: Tue, 18 Feb 2025 18:05:13 -0800 Subject: [PATCH 21/35] chi_generate_tro_shell header improvement - Added link to reference docs - Made bullet point list - no functional change --- R/chi_generate_tro_shell.R | 74 ++++++++++++++++++++++++++++------- man/chi_generate_tro_shell.Rd | 47 +++++++++++++++------- 2 files changed, 93 insertions(+), 28 deletions(-) diff --git a/R/chi_generate_tro_shell.R b/R/chi_generate_tro_shell.R index 7814c0e..6c55703 100644 --- a/R/chi_generate_tro_shell.R +++ b/R/chi_generate_tro_shell.R @@ -1,29 +1,49 @@ #' CHI generate TRO shell #' #' @description -#' This function takes an analysis set file and indicator of which set should be processed. It returns a skeleton of CHI Tableau Ready Output. +#' This function takes an analysis set file and indicator of which set should be +#' processed. It returns a skeleton of CHI Tableau Ready Output, which is used +#' as an instruction set for \code{\link{chi_calc}}. #' #' @details -#' It takes in a data.table containing a compact list of variables, byvariables, and analysis types, and returns a shell table of the rows and columns expected in a CHI Tableau ready output. For details on TRO format, review here: https://kc1.sharepoint.com/:x:/r/teams/DPH-CommunityHealthIndicators/CHIVizes/CHI-Standards-TableauReady%20Output.xlsx?d=wbed2f507b8344d288658c5724f64c001&csf=1&web=1&e=qEIPcc&nav=MTVfezAwMDAwMDAwLTAwMDEtMDAwMC0wMjAwLTAwMDAwMDAwMDAwMH0 +#' It takes in a data.table containing a compact list of \code{indicator_keys}, +#' byvariables, and analysis types, and returns a shell table of the rows and columns expected +#' in a CHI Tableau ready output. For details on TRO format, review here: +#' \href{https://kc1.sharepoint.com/teams/DPH-CommunityHealthIndicators/CHIVizes/CHI-Standards-TableauReady\%20Output.xlsx}{ +#' SharePoint > Community Health Indicators > CHI_vizes > CHI-Standards-TableauReady Output.xlsx} #' -#' the expected format of the analysis file is: -#' set: numeric integer 1...x, indicates set the observations are calcualted as part of (why are sets valueable? should this be discarded?) -#' cat1: character, the name expected in CHI TRO for cat1 -#' cat1_varname: character, the name expected in CHI TRO for cat1_varname -#' kingcounty: character "":"X", indicator of if analysis is king county specific (could be removed, this is imputable by variable name) -#' wastate: character "":"x", indicator of if analysis is of wa state -#' demgroups: character "":"x", indicator of if analysis includes single demographic -#' crosstabs: character "":"x", indicator of if analysis includes crosstabulations -#' trands: character "":"x", indicator of if analysis includes trends -#' set_idictaor_keys character comma sep list, list of indicators variables expected from data source +#' The expected format of \code{ph.analysis_set} is: +#' \itemize{ +#' \item \code{set}: Integer identifying groups of indicators with identical analysis patterns +#' \item \code{cat1}: Category name as expected in CHI TRO +#' \item \code{cat1_varname}: Category variable name as expected in CHI TRO +#' \item \code{_kingcounty}: \code{'x'} if analysis includes King County tab +#' \item \code{_wastate}: \code{'x'} if analysis includes Washington State tab +#' \item \code{demgroups}: \code{'x'} if analysis includes demographic groups +#' \item \code{crosstabs}: \code{'x'} if analysis includes crosstabulations +#' \item \code{trends}: \code{'x'} if analysis includes trends analysis +#' \item \code{set_indicator_keys}: Comma-separated list of indicator keys sharing the pattern +#' } #' #' @param ph.analysis_set name of data.table to parse #' @param start.year the earliest year to be used for estimates -#' @param end.year the latest year to be used for aggregate estimates (note, the earliest year for trends estimates is calculated from from the span and number of periods) +#' @param end.year the latest year to be used for aggregate estimates #' @param year.span the number of years to be included in a single non-trend period #' @param trend.span the number of years to be included in a single trend period #' @param trend.periods the number of periods to be included in a trend -#' @returns data table with a single row for each calculation to be performed in generating Tableau Ready Output for CHI reporting +#' +#' @return A data table containing calculation instructions with columns: +#' \itemize{ +#' \item{indicator_key} Unique identifier for each indicator +#' \item{tab} Analysis type (_kingcounty, demgroups, trends, etc.) +#' \item{year} Time period for analysis +#' \item{Additional columns} As specified in analysis_set input +#' } +#' +#' @seealso +#' \code{\link{chi_generate_analysis_set}} for creating analysis sets +#' +#' \code{\link{chi_calc}} for performing calculations using these instructions #' #' @keywords CHI, Tableau, Production #' @@ -39,6 +59,32 @@ chi_generate_tro_shell <- function(ph.analysis_set, year.span = NULL, trend.span = NULL, trend.periods = NULL){ + + # Input validation + if (missing(ph.analysis_set)) stop("\n\U1F6D1 ph.analysis_set must be provided") + if (!is.data.frame(ph.analysis_set)) stop("\n\U1F6D1 ph.analysis_set must be a data.frame or data.table") + + if (missing(start.year)) stop("\n\U1F6D1 start.year must be provided") + if (!is.numeric(start.year) || length(start.year) != 1) stop("\n\U1F6D1 start.year must be a single numeric value") + + if (missing(end.year)) stop("\n\U1F6D1 end.year must be provided") + if (!is.numeric(end.year) || length(end.year) != 1) stop("\n\U1F6D1 end.year must be a single numeric value") + + if (!is.null(year.span) && (!is.numeric(year.span) || length(year.span) != 1)) { + stop("\n\U1F6D1 year.span must be NULL or a single numeric value") + } + + if (!is.null(trend.span) && (!is.numeric(trend.span) || length(trend.span) != 1)) { + stop("\n\U1F6D1 trend.span must be NULL or a single numeric value") + } + + if (!is.null(trend.periods) && (!is.numeric(trend.periods) || length(trend.periods) != 1)) { + stop("\n\U1F6D1 trend.periods must be NULL or a single numeric value") + } + + # Convert to data.table if needed + if (!is.data.table(ph.analysis_set)) setDT(ph.analysis_set) + #parameterization checks if("x" %in% ph.analysis_set$trends & (is.null(trend.span) | is.null(trend.periods))) {stop("you have indicated that a trends analysis is to be conducted, but have not indicated both the span and number of periods for this analysis.")} diff --git a/man/chi_generate_tro_shell.Rd b/man/chi_generate_tro_shell.Rd index 954091e..50ab14d 100644 --- a/man/chi_generate_tro_shell.Rd +++ b/man/chi_generate_tro_shell.Rd @@ -18,7 +18,7 @@ chi_generate_tro_shell( \item{start.year}{the earliest year to be used for estimates} -\item{end.year}{the latest year to be used for aggregate estimates (note, the earliest year for trends estimates is calculated from from the span and number of periods)} +\item{end.year}{the latest year to be used for aggregate estimates} \item{year.span}{the number of years to be included in a single non-trend period} @@ -27,24 +27,43 @@ chi_generate_tro_shell( \item{trend.periods}{the number of periods to be included in a trend} } \value{ -data table with a single row for each calculation to be performed in generating Tableau Ready Output for CHI reporting +A data table containing calculation instructions with columns: +\itemize{ + \item{indicator_key} Unique identifier for each indicator + \item{tab} Analysis type (_kingcounty, demgroups, trends, etc.) + \item{year} Time period for analysis + \item{Additional columns} As specified in analysis_set input +} } \description{ -This function takes an analysis set file and indicator of which set should be processed. It returns a skeleton of CHI Tableau Ready Output. +This function takes an analysis set file and indicator of which set should be +processed. It returns a skeleton of CHI Tableau Ready Output, which is used +as an instruction set for \code{\link{chi_calc}}. } \details{ -It takes in a data.table containing a compact list of variables, byvariables, and analysis types, and returns a shell table of the rows and columns expected in a CHI Tableau ready output. For details on TRO format, review here: https://kc1.sharepoint.com/:x:/r/teams/DPH-CommunityHealthIndicators/CHIVizes/CHI-Standards-TableauReady%20Output.xlsx?d=wbed2f507b8344d288658c5724f64c001&csf=1&web=1&e=qEIPcc&nav=MTVfezAwMDAwMDAwLTAwMDEtMDAwMC0wMjAwLTAwMDAwMDAwMDAwMH0 +It takes in a data.table containing a compact list of \code{indicator_keys}, +byvariables, and analysis types, and returns a shell table of the rows and columns expected +in a CHI Tableau ready output. For details on TRO format, review here: +\href{https://kc1.sharepoint.com/teams/DPH-CommunityHealthIndicators/CHIVizes/CHI-Standards-TableauReady\%20Output.xlsx}{ +SharePoint > Community Health Indicators > CHI_vizes > CHI-Standards-TableauReady Output.xlsx} + +The expected format of \code{ph.analysis_set} is: + \itemize{ + \item \code{set}: Integer identifying groups of indicators with identical analysis patterns + \item \code{cat1}: Category name as expected in CHI TRO + \item \code{cat1_varname}: Category variable name as expected in CHI TRO + \item \code{_kingcounty}: \code{'x'} if analysis includes King County tab + \item \code{_wastate}: \code{'x'} if analysis includes Washington State tab + \item \code{demgroups}: \code{'x'} if analysis includes demographic groups + \item \code{crosstabs}: \code{'x'} if analysis includes crosstabulations + \item \code{trends}: \code{'x'} if analysis includes trends analysis + \item \code{set_indicator_keys}: Comma-separated list of indicator keys sharing the pattern + } +} +\seealso{ +\code{\link{chi_generate_analysis_set}} for creating analysis sets -the expected format of the analysis file is: -set: numeric integer 1...x, indicates set the observations are calcualted as part of (why are sets valueable? should this be discarded?) -cat1: character, the name expected in CHI TRO for cat1 -cat1_varname: character, the name expected in CHI TRO for cat1_varname -kingcounty: character "":"X", indicator of if analysis is king county specific (could be removed, this is imputable by variable name) -wastate: character "":"x", indicator of if analysis is of wa state -demgroups: character "":"x", indicator of if analysis includes single demographic -crosstabs: character "":"x", indicator of if analysis includes crosstabulations -trands: character "":"x", indicator of if analysis includes trends -set_idictaor_keys character comma sep list, list of indicators variables expected from data source +\code{\link{chi_calc}} for performing calculations using these instructions } \keyword{CHI,} \keyword{Production} From 83170451eda469ff733b6effde997c96ae920014 Mon Sep 17 00:00:00 2001 From: Danny Colombara Date: Tue, 18 Feb 2025 18:06:40 -0800 Subject: [PATCH 22/35] tidied header for chi_process_nontrends --- R/chi_process_nontrends.R | 39 ++++++++++++++++++++++-------------- man/chi_process_nontrends.Rd | 38 ++++++++++++++++++++++------------- 2 files changed, 48 insertions(+), 29 deletions(-) diff --git a/R/chi_process_nontrends.R b/R/chi_process_nontrends.R index 947016f..c411599 100644 --- a/R/chi_process_nontrends.R +++ b/R/chi_process_nontrends.R @@ -1,28 +1,37 @@ -#' CHI generate_nontrend_years +#' CHI generate nontrend years #' #' @description -#' This function takes an analysis set file and indicator of which set should be processed. It returns a skeleton of CHI Tableau Ready Output. Hidden and ment to be called by CHI generate instructions. Not exported, but called by chi_generate_tro_shell(). +#' This function takes an analysis set file and indicator of which set should be +#' processed. It returns a skeleton of CHI Tableau Ready Output. Hidden and meant +#' to be called by CHI generate instructions. Not exported, but called by +#' chi_generate_tro_shell(). #' #' @details -#' It takes in a data.table containing a compact list of variables, byvariables, and analysis types, and returns a shell table of the rows and columns expected in a CHI Tableau ready output. For details on TRO format, review here: https://kc1.sharepoint.com/:x:/r/teams/DPH-CommunityHealthIndicators/CHIVizes/CHI-Standards-TableauReady%20Output.xlsx?d=wbed2f507b8344d288658c5724f64c001&csf=1&web=1&e=qEIPcc&nav=MTVfezAwMDAwMDAwLTAwMDEtMDAwMC0wMjAwLTAwMDAwMDAwMDAwMH0 +#' It takes in a data.table containing a compact list of variables, byvariables, +#' and analysis types, and returns a shell table of the rows and columns expected +#' in a CHI Tableau ready output. For details on TRO format, review here: +#' \href{https://kc1.sharepoint.com/teams/DPH-CommunityHealthIndicators/CHIVizes/CHI-Standards-TableauReady\%20Output.xlsx}{ +#' SharePoint > Community Health Indicators > CHI_vizes > CHI-Standards-TableauReady Output.xlsx} #' -#' the expected format of the analysis file is: -#' set: numeric integer 1...x, indicates set the observations are calcualted as part of (why are sets valueable? should this be discarded?) -#' cat1: character, the name expected in CHI TRO for cat1 -#' cat1_varname: character, the name expected in CHI TRO for cat1_varname -#' _kingcounty: character "":"X", indicator of if analysis is king county specific (could be removed, this is imputable by variable name) -#' _wastate: character "":"x", indicator of if analysis is of wa state -#' demgroups: character "":"x", indicator of if analysis includes single demographic -#' crosstabs: character "":"x", indicator of if analysis includes crosstabulations -#' trands: character "":"x", indicator of if analysis includes trends -#' set_idictaor_keys character comma sep list, list of indicators variables expected from data source +#' The expected format of the analysis file is: +#' \itemize{ +#' \item \code{set}: numeric integer 1...x, indicates set or batch of analyses since not all indicators have all the same analyses +#' \item \code{cat1}: character, the name expected in CHI TRO for cat1 +#' \item \code{cat1_varname}: character, the name expected in CHI TRO for cat1_varname +#' \item \code{_kingcounty}: character "":"X", indicator of if analysis is King County specific (could be removed, this is imputable by variable name) +#' \item \code{_wastate}: character "":"x", indicator of if analysis is of WA state +#' \item \code{demgroups}: character "":"x", indicator of if analysis includes single demographic +#' \item \code{crosstabs}: character "":"x", indicator of if analysis includes crosstabulations +#' \item \code{trends}: character "":"x", indicator of if analysis includes trends +#' \item \code{set_indictaor_keys} character comma sep list, list of indicators variables expected from data source +#' } #' #' @param ph.analysis_set name of data.table to parse #' @param myset chosen set number from table #' #' @returns data table with a single row for each calculation to be performed in generating Tableau Ready Output for CHI reporting #' -#' @keywords CHI, Tableau, Production +#' @keywords CHI, Tableau, Production, internal #' #' @importFrom data.table setDT rbindlist setcolorder `:=` #' @import dtsurvey @@ -30,7 +39,7 @@ #' @import future.apply #' @importFrom tidyr crossing #' @importFrom rads string_clean -#' @export +#' chi_process_nontrends <- function(ph.analysis_set = NULL, myset = NULL){ diff --git a/man/chi_process_nontrends.Rd b/man/chi_process_nontrends.Rd index 31d151b..33379c0 100644 --- a/man/chi_process_nontrends.Rd +++ b/man/chi_process_nontrends.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/chi_process_nontrends.R \name{chi_process_nontrends} \alias{chi_process_nontrends} -\title{CHI generate_nontrend_years} +\title{CHI generate nontrend years} \usage{ chi_process_nontrends(ph.analysis_set = NULL, myset = NULL) } @@ -15,22 +15,32 @@ chi_process_nontrends(ph.analysis_set = NULL, myset = NULL) data table with a single row for each calculation to be performed in generating Tableau Ready Output for CHI reporting } \description{ -This function takes an analysis set file and indicator of which set should be processed. It returns a skeleton of CHI Tableau Ready Output. Hidden and ment to be called by CHI generate instructions. Not exported, but called by chi_generate_tro_shell(). +This function takes an analysis set file and indicator of which set should be +processed. It returns a skeleton of CHI Tableau Ready Output. Hidden and meant +to be called by CHI generate instructions. Not exported, but called by +chi_generate_tro_shell(). } \details{ -It takes in a data.table containing a compact list of variables, byvariables, and analysis types, and returns a shell table of the rows and columns expected in a CHI Tableau ready output. For details on TRO format, review here: https://kc1.sharepoint.com/:x:/r/teams/DPH-CommunityHealthIndicators/CHIVizes/CHI-Standards-TableauReady%20Output.xlsx?d=wbed2f507b8344d288658c5724f64c001&csf=1&web=1&e=qEIPcc&nav=MTVfezAwMDAwMDAwLTAwMDEtMDAwMC0wMjAwLTAwMDAwMDAwMDAwMH0 +It takes in a data.table containing a compact list of variables, byvariables, +and analysis types, and returns a shell table of the rows and columns expected +in a CHI Tableau ready output. For details on TRO format, review here: +\href{https://kc1.sharepoint.com/teams/DPH-CommunityHealthIndicators/CHIVizes/CHI-Standards-TableauReady\%20Output.xlsx}{ +SharePoint > Community Health Indicators > CHI_vizes > CHI-Standards-TableauReady Output.xlsx} -the expected format of the analysis file is: -set: numeric integer 1...x, indicates set the observations are calcualted as part of (why are sets valueable? should this be discarded?) -cat1: character, the name expected in CHI TRO for cat1 -cat1_varname: character, the name expected in CHI TRO for cat1_varname -_kingcounty: character "":"X", indicator of if analysis is king county specific (could be removed, this is imputable by variable name) -_wastate: character "":"x", indicator of if analysis is of wa state -demgroups: character "":"x", indicator of if analysis includes single demographic -crosstabs: character "":"x", indicator of if analysis includes crosstabulations -trands: character "":"x", indicator of if analysis includes trends -set_idictaor_keys character comma sep list, list of indicators variables expected from data source +The expected format of the analysis file is: +\itemize{ + \item \code{set}: numeric integer 1...x, indicates set or batch of analyses since not all indicators have all the same analyses + \item \code{cat1}: character, the name expected in CHI TRO for cat1 + \item \code{cat1_varname}: character, the name expected in CHI TRO for cat1_varname + \item \code{_kingcounty}: character "":"X", indicator of if analysis is King County specific (could be removed, this is imputable by variable name) + \item \code{_wastate}: character "":"x", indicator of if analysis is of WA state + \item \code{demgroups}: character "":"x", indicator of if analysis includes single demographic + \item \code{crosstabs}: character "":"x", indicator of if analysis includes crosstabulations + \item \code{trends}: character "":"x", indicator of if analysis includes trends + \item \code{set_indictaor_keys} character comma sep list, list of indicators variables expected from data source +} } \keyword{CHI,} -\keyword{Production} +\keyword{Production,} \keyword{Tableau,} +\keyword{internal} From f22e910d800073a996ec01674f9a286314a43d83 Mon Sep 17 00:00:00 2001 From: Danny Colombara Date: Tue, 18 Feb 2025 18:07:28 -0800 Subject: [PATCH 23/35] tidied header for chi_process_trends - no functional change --- R/chi_process_trends.R | 4 ++-- man/chi_process_trends.Rd | 5 +++-- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/R/chi_process_trends.R b/R/chi_process_trends.R index b2d35e8..44be3fc 100644 --- a/R/chi_process_trends.R +++ b/R/chi_process_trends.R @@ -6,14 +6,14 @@ #' @param trend.periods number of periods to calculate #' #' @description -#' helper fucntion for chi_generate_tro_shell +#' helper function for chi_generate_tro_shell #' #' @details #' called by chi_generate_tro_shell to calculate and create rows for expected trends analyses. #' #' #' @returns TRO with rows for each indicator key and span of years within the provided time frame -#' @keywords CHI, Tableau, Production +#' @keywords CHI, Tableau, Production, internal #' @importFrom data.table setDT setorder #' @importFrom tidyr crossing diff --git a/man/chi_process_trends.Rd b/man/chi_process_trends.Rd index e98bf65..1550d27 100644 --- a/man/chi_process_trends.Rd +++ b/man/chi_process_trends.Rd @@ -24,11 +24,12 @@ chi_process_trends( TRO with rows for each indicator key and span of years within the provided time frame } \description{ -helper fucntion for chi_generate_tro_shell +helper function for chi_generate_tro_shell } \details{ called by chi_generate_tro_shell to calculate and create rows for expected trends analyses. } \keyword{CHI,} -\keyword{Production} +\keyword{Production,} \keyword{Tableau,} +\keyword{internal} From 5d3575ae83acc6db4a47aae5423be7003d342ef7 Mon Sep 17 00:00:00 2001 From: Danny Colombara Date: Tue, 18 Feb 2025 18:08:57 -0800 Subject: [PATCH 24/35] Created chi_generate_analysis_set - New code to create the analysis_set if it it does not exist - Output is used by chi_generate_tro_shell --- R/chi_generate_analysis_set.R | 211 +++++++++++++++++++++++++++++++ man/chi_generate_analysis_set.Rd | 64 ++++++++++ 2 files changed, 275 insertions(+) create mode 100644 R/chi_generate_analysis_set.R create mode 100644 man/chi_generate_analysis_set.Rd diff --git a/R/chi_generate_analysis_set.R b/R/chi_generate_analysis_set.R new file mode 100644 index 0000000..e92825b --- /dev/null +++ b/R/chi_generate_analysis_set.R @@ -0,0 +1,211 @@ +#' Generate Analysis Sets for CHI +#' +#' @description +#' Creates sets of indicator keys that share common analysis patterns within CHI +#' (Community Health Indicators) data. This function reads the most recent production +#' version of CHI estimates for a specified data source and groups indicators that +#' use the same combinations of \code{cat1}, \code{cat1_varname}, and \code{trends} +#' columns. +#' +#' @param data_source Character string (length 1) specifying the data source to analyze +#' (e.g., \code{'birth'}, \code{'brfss'}, \code{'chars'}, \code{'death'}, \code{'hys'}). This corresponds to a table +#' name in the PHExtractStore database. +#' +#' @return A data.table containing analysis sets with the following columns: +#' \itemize{ +#' \item \code{set}: Integer identifying groups of indicators with identical analysis patterns +#' \item \code{cat1}: Category name as expected in CHI TRO +#' \item \code{cat1_varname}: Category variable name as expected in CHI TRO +#' \item \code{_kingcounty}: \code{'x'} if analysis includes King County tab +#' \item \code{_wastate}: \code{'x'} if analysis includes Washington State tab +#' \item \code{demgroups}: \code{'x'} if analysis includes demographic groups +#' \item \code{crosstabs}: \code{'x'} if analysis includes crosstabulations +#' \item \code{trends}: \code{'x'} if analysis includes trends analysis +#' \item \code{set_indicator_keys}: Comma-separated list of indicator keys sharing the pattern +#' } +#' +#' @details +#' This function generates a table for \code{\link{chi_generate_tro_shell}}, providing the +#' structure required for generating analysis instructions. It connects to the +#' \code{[PHExtractStore]} database on \code{KCITSQLPRPHIP40} to retrieve the latest production data. +#' Users need appropriate database credentials - contact your manager if you need access. +#' +#' Typically, the analysis will not change from year to year. However, you should +#' compare the output of this function with the 'CHI-Variables_ByDataSource' worksheet +#' in +#' \href{https://kc1.sharepoint.com/teams/DPH-CommunityHealthIndicators/CHIVizes/CHI-Standards-TableauReady\%20Output.xlsx}{ +#' SharePoint > Community Health Indicators > CHI_vizes > CHI-Standards-TableauReady Output.xlsx} +#' and update it as necessary. +#' +#' The output structure directly informs \code{\link{chi_generate_tro_shell}} about which +#' indicators should be analyzed together based on their shared patterns of +#' categories and analysis types. +#' +#' @importFrom data.table setDT setorderv copy := .SD fifelse fsetdiff .GRP +#' @importFrom DBI dbConnect dbDisconnect dbExistsTable dbGetQuery +#' @importFrom odbc odbc +#' +#' @export +#' +#' @seealso +#' \code{\link{chi_generate_tro_shell}} for generating TRO shell from these analysis sets +#' +#' @examples +#' \dontrun{ +#' # Generate analysis sets for birth data +#' birth_sets <- chi_generate_analysis_set("birth") +#' +#' # Generate analysis sets for BRFSS data +#' brfss_sets <- chi_generate_analysis_set("brfss") +#' } +#' +chi_generate_analysis_set <- function(data_source = NULL) { + # Input validation + if (is.null(data_source)) { + stop("\n\U1F6D1 data_source parameter must be provided") + } + if (!is.character(data_source) || length(data_source) != 1) { + stop("\n\U1F6D1 data_source must be a single character string, e.g., 'birth'") + } + + # Get data ---- + # Construct the full table name for error messages + full_table_name <- paste0("[APDE].[", data_source, "_results]") + server_info <- "KCITSQLPRPHIP40 > PHExtractStore" + + # try to make a database connection + tryCatch({ + # Establish connection + cnxn <- odbc::dbConnect( + odbc::odbc(), + Driver = "SQL Server", + Server = "KCITSQLPRPHIP40", + Database = "PHExtractStore" + ) + }, error = function(e) { + # Handle connection errors separately from table existence errors + if (grepl("connection", tolower(e$message))) { + stop(paste0("\n\U1F6D1 Failed to connect to ", server_info, ". Please check your network connection and credentials.")) + } else { + # Re-throw the error with our custom message if it's already handled + stop(e$message) + } + }) + + + # Check if table exists before attempting query + table_exists <- DBI::dbExistsTable( + conn = cnxn, + name = paste0(data_source, "_results"), + schema = "APDE" + ) + + # If table exists, load into memory + if (!table_exists) { + stop(paste0("\U1F6D1 You specified data_source = '", data_source, + "', which attempted to download ", full_table_name, + " from ", server_info, ". This table does not exist.")) + } else { + tempdt <- data.table::setDT(DBI::dbGetQuery( + conn = cnxn, + statement = paste0("SELECT * FROM ", full_table_name) + )) } + + # Close database connection + if (exists("cnxn") && !is.null(cnxn)) { + DBI::dbDisconnect(cnxn) + } + + # Recodes for race3 & race4 ---- + # Necessary because they are wonky as heck due to how APDE decided to code/display them + race3_remix1 <- tempdt[(grepl('race/ethnicity$', cat1, ignore.case = T) & cat1_varname == 'race3')] + tempdt <- rbind( + fsetdiff(tempdt, race3_remix1), + copy(race3_remix1)[, cat1 := gsub('race/ethnicity$', 'race', cat1)][, cat1 := gsub('Race/ethnicity$', 'Race', cat1)], + copy(race3_remix1)[, cat1 := gsub('race/ethnicity$', 'ethnicity', cat1)][, cat1 := gsub('Race/ethnicity$', 'Ethnicity', cat1)] + ) + + race3_remix2 <- tempdt[(grepl('race/ethnicity$', cat2, ignore.case = T) & cat2_varname == 'race3')] + tempdt <- rbind( + fsetdiff(tempdt, race3_remix2), + copy(race3_remix2)[, cat2 := gsub('race/ethnicity$', 'race', cat2)][, cat2 := gsub('Race/ethnicity$', 'Race', cat2)], + copy(race3_remix2)[, cat2 := gsub('race/ethnicity$', 'ethnicity', cat2)][, cat2 := gsub('Race/ethnicity$', 'Ethnicity', cat2)] + ) + + tempdt[cat1_varname == 'race4', cat1 := gsub('race$', 'race/ethnicity', cat1)] + tempdt[cat1_varname == 'race4', cat1 := gsub('Race$', 'Race/ethnicity', cat1)] + + tempdt[cat2_varname == 'race4', cat2 := gsub('race$', 'race/ethnicity', cat2)] + tempdt[cat2_varname == 'race4', cat2 := gsub('Race$', 'Race/ethnicity', cat2)] + + # Table of categories and tabs per indicator ---- + # For cat1 combinations + tab_patterns <- tempdt[, list( + `_kingcounty` = fifelse(any(tab == "_kingcounty"), "x", ""), + `_wastate` = fifelse(any(tab == "_wastate"), "x", ""), + demgroups = fifelse(any(tab == "demgroups"), "x", ""), + crosstabs = fifelse(any(tab == "crosstabs"), "x", ""), + trends = fifelse(any(tab == "trends"), "x", "") + ), by = list(indicator_key, cat1, cat1_varname)] + + # For cat2 combinations + tab_patterns2 <- tempdt[!is.na(cat2), list( + `_kingcounty` = fifelse(any(tab == "_kingcounty"), "x", ""), + `_wastate` = fifelse(any(tab == "_wastate"), "x", ""), + demgroups = fifelse(any(tab == "demgroups"), "x", ""), + crosstabs = fifelse(any(tab == "crosstabs"), "x", ""), + trends = fifelse(any(tab == "trends"), "x", "") + ), by = list(indicator_key, cat1 = cat2, cat1_varname = cat2_varname)] + + # Combine the patterns + all_patterns <- merge(tab_patterns, tab_patterns2, by = c('indicator_key', 'cat1', 'cat1_varname'), all = T) + all_patterns <- all_patterns[, list(indicator_key, cat1, cat1_varname, + `_kingcounty` = fifelse(`_kingcounty.x` == 'x' | `_kingcounty.y` == 'x', 'x', NA_character_), + `_wastate` = fifelse(`_wastate.x` == 'x' | `_wastate.y` == 'x', 'x', NA_character_), + `demgroups` = fifelse(`demgroups.x` == 'x' | `demgroups.y` == 'x', 'x', NA_character_), + `crosstabs` = fifelse(`crosstabs.x` == 'x' | `crosstabs.y` == 'x', 'x', NA_character_), + `trends` = fifelse(`trends.x` == 'x' | `trends.y` == 'x', 'x', NA_character_) )] + + # Tidy + all_patterns <- all_patterns[!(cat1 == 'Overall' & cat1_varname == 'overall')] + + # Generate analysis_sets ---- + # Function to convert a data.table's rows into a string + rows_to_string <- function(mydt) { + # Sort the data.table by all columns except 'indicator_key' to ensure consistent ordering + dt_sorted <- setorderv(copy(mydt), + cols = setdiff(names(mydt), "indicator_key")) + + # Convert each row (excluding 'indicator_key') into a string, concatenating columns with "|||" + row_strings <- apply(X = dt_sorted[, .SD, .SDcols = setdiff(names(mydt), "indicator_key")], + MARGIN = 1, # to iterate over rows + FUN = paste, collapse = "|||") + + # Combine the vector `row_strings` into a single string + serialized_string <- paste(row_strings, collapse = "___") + + return(serialized_string) + } + + # Generate the unique pattern for each indicator_key's rows + pattern_groups <- all_patterns[, list(pattern = rows_to_string(.SD)), + by = indicator_key] + + + # Group the patterns to create the 'set' and 'set_indicator_keys' columns + pattern_groups <- pattern_groups[, `:=`(set = .GRP, + set_indicator_keys = paste(sort(indicator_key), collapse = ", ")), + by = pattern] + pattern_groups[, pattern := NULL] + + # Merge on original data + result <- merge(all_patterns, pattern_groups, by = "indicator_key") + + # Keep unique rows + result <- unique(result[, indicator_key := NULL]) + + # return object ---- + return(result) +} + + diff --git a/man/chi_generate_analysis_set.Rd b/man/chi_generate_analysis_set.Rd new file mode 100644 index 0000000..e8e45fd --- /dev/null +++ b/man/chi_generate_analysis_set.Rd @@ -0,0 +1,64 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/chi_generate_analysis_set.R +\name{chi_generate_analysis_set} +\alias{chi_generate_analysis_set} +\title{Generate Analysis Sets for CHI} +\usage{ +chi_generate_analysis_set(data_source = NULL) +} +\arguments{ +\item{data_source}{Character string (length 1) specifying the data source to analyze +(e.g., \code{'birth'}, \code{'brfss'}, \code{'chars'}, \code{'death'}, \code{'hys'}). This corresponds to a table +name in the PHExtractStore database.} +} +\value{ +A data.table containing analysis sets with the following columns: + \itemize{ + \item \code{set}: Integer identifying groups of indicators with identical analysis patterns + \item \code{cat1}: Category name as expected in CHI TRO + \item \code{cat1_varname}: Category variable name as expected in CHI TRO + \item \code{_kingcounty}: \code{'x'} if analysis includes King County tab + \item \code{_wastate}: \code{'x'} if analysis includes Washington State tab + \item \code{demgroups}: \code{'x'} if analysis includes demographic groups + \item \code{crosstabs}: \code{'x'} if analysis includes crosstabulations + \item \code{trends}: \code{'x'} if analysis includes trends analysis + \item \code{set_indicator_keys}: Comma-separated list of indicator keys sharing the pattern + } +} +\description{ +Creates sets of indicator keys that share common analysis patterns within CHI +(Community Health Indicators) data. This function reads the most recent production +version of CHI estimates for a specified data source and groups indicators that +use the same combinations of \code{cat1}, \code{cat1_varname}, and \code{trends} +columns. +} +\details{ +This function generates a table for \code{\link{chi_generate_tro_shell}}, providing the +structure required for generating analysis instructions. It connects to the +\code{[PHExtractStore]} database on \code{KCITSQLPRPHIP40} to retrieve the latest production data. +Users need appropriate database credentials - contact your manager if you need access. + +Typically, the analysis will not change from year to year. However, you should +compare the output of this function with the 'CHI-Variables_ByDataSource' worksheet +in +\href{https://kc1.sharepoint.com/teams/DPH-CommunityHealthIndicators/CHIVizes/CHI-Standards-TableauReady\%20Output.xlsx}{ +SharePoint > Community Health Indicators > CHI_vizes > CHI-Standards-TableauReady Output.xlsx} +and update it as necessary. + +The output structure directly informs \code{\link{chi_generate_tro_shell}} about which +indicators should be analyzed together based on their shared patterns of +categories and analysis types. +} +\examples{ +\dontrun{ +# Generate analysis sets for birth data +birth_sets <- chi_generate_analysis_set("birth") + +# Generate analysis sets for BRFSS data +brfss_sets <- chi_generate_analysis_set("brfss") +} + +} +\seealso{ +\code{\link{chi_generate_tro_shell}} for generating TRO shell from these analysis sets +} From 29e9c233f2a43c5b6242d3a2337ae9119b5790ef Mon Sep 17 00:00:00 2001 From: Danny Colombara Date: Tue, 18 Feb 2025 18:11:40 -0800 Subject: [PATCH 25/35] Created chi_compare_estimates - used to identify differences between two datasets (typically the current year's and last year's), in order to identify notable changes --- R/chi_compare_estimates.R | 129 +++++++++++++++++++++++++++++++++++ man/chi_compare_estimates.Rd | 63 +++++++++++++++++ 2 files changed, 192 insertions(+) create mode 100644 R/chi_compare_estimates.R create mode 100644 man/chi_compare_estimates.Rd diff --git a/R/chi_compare_estimates.R b/R/chi_compare_estimates.R new file mode 100644 index 0000000..13cc186 --- /dev/null +++ b/R/chi_compare_estimates.R @@ -0,0 +1,129 @@ +# chi_compare_estimates() ---- +#' Compare and validate CHI (Community Health Indicator) estimates between two datasets +#' +#' @description +#' Analyzes differences between two datasets containing CHI estimates, calculating both absolute +#' and relative differences between matched records. The function supports optional metadata +#' integration for result type classification. +#' +#' @param OLD data.frame/data.table containing the reference/baseline CHI estimates +#' @param NEW data.frame/data.table containing the CHI estimates to be validated +#' @param OLD.year Year to filter from the OLD dataset (e.g., "2022") +#' @param NEW.year Year to filter from the NEW dataset (e.g., "2023") +#' @param META data.frame/data.table containing metadata with indicator_key and +#' result_type columns for classifying comparison results +#' +#' @details +#' The function performs the following operations: +#' 1. Merges datasets using CHI-specific identifiers (indicator_key, tab, cat1, cat2, etc.) +#' 2. Calculates absolute differences (percentage points for non-rates, raw for rates) +#' 3. Calculates relative differences as percentages +#' 4. Identifies notable differences needing further exploration using standard CHI criteria +#' +#' @examples +#' \dontrun{ +#' # Compare two versions of estimates +#' comparison <- chi_compare_estimates( +#' OLD = previous_estimates, +#' NEW = current_estimates, +#' OLD.year = "2022", +#' NEW.year = "2023", +#' META = metadata +#' ) +#' } +#' +#' @seealso +#' \code{\link{chi_qa_tro}} for validating individual datasets +#' +#' \code{\link{chi_sql_update}} for uploading validated results +#' +#' @return data.table ordered by absolute difference, containing: +#' - Difference metrics (absolute_diff, relative_diff) +#' - All matching identifiers and categories +#' - Original values and metadata from both datasets +#' - Diagnostic information (bounds, numerators, denominators, standard errors) +#' +#' @importFrom data.table data.table setnames ":=" setDT copy +#' @export +#' +chi_compare_estimates <- function(OLD = NULL, NEW = NULL, OLD.year = NULL, NEW.year = NULL, META = NULL){ + # Check inputs ---- + # Check if necessary arguments are present + if(is.null(OLD)){stop("You must provide 'OLD', i.e., the name of the table with the OLD data")} + if(is.null(NEW)){stop("You must provide 'NEW', i.e., the name of the table with the NEW data")} + if(is.null(OLD.year)){stop("You must provide 'OLD.year', i.e., the year of interest in the OLD data")} + if(is.null(NEW.year)){stop("You must provide 'NEW.year', i.e., the year of interest in the NEW data")} + if(is.null(META)){stop("You must provide 'Meta', a metadata table with indicator_key and result_type columns for classifying comparison results")} + + # Check if objects are data.frames & make into data.table if need be + if(is.data.frame(OLD) == FALSE){ + stop("'OLD' must be a data.frame or a data.table") + }else{OLD <- data.table::setDT(copy(OLD))} + + if(is.data.frame(NEW) == FALSE){ + stop("'NEW' must be a data.frame or a data.table") + }else{NEW <- data.table::setDT(copy(NEW))} + + if(is.data.frame(META) == FALSE){ + stop("'META' must be a data.frame or a data.table") + }else{META <- data.table::setDT(copy(META))} + + # Process data ---- + # If metadata provided, add it to the columns to help interpret the output + if(!is.null(META)){ + NEW <- merge(NEW, META[, list(indicator_key, result_type)], by = "indicator_key", all.x = TRUE, all.y = FALSE) + } else { NEW[, result_type := "Metadata not provided"]} + + # Merge old and new data based on identifiers + comp <- merge(copy(OLD[year == OLD.year]), + copy(NEW[year == NEW.year]), + by = c("indicator_key", "tab", + "cat1", "cat1_group", "cat1_varname", + "cat2", "cat2_group", "cat2_varname"), + all = T) + + # calculate percent differences between old (x) and new(y) + comp[, relative.diff := round2(abs((result.x - result.y) / result.x)*100, 1)] + comp[result_type != "rate", absolute.diff := round2(abs(result.x - result.y)*100, 1)] + comp[result_type == "rate", absolute.diff := round2(abs(result.x - result.y), 1)] + comp <- comp[!is.na(absolute.diff)] # drop if absolute difference is NA + + # order variables + comp <- comp[, c("absolute.diff", "relative.diff", "result_type", + "indicator_key", "tab", + "cat1", "cat1_group", "cat1_varname", + "cat2", "cat2_group", "cat2_varname", "year.x", "year.y", + "result.x", "result.y", "lower_bound.x", "lower_bound.y", + "upper_bound.x", "upper_bound.y", + "numerator.x", "numerator.y", "denominator.x", "denominator.y", + "se.x", "se.y")] + + # rename suffixes + setnames(comp, names(comp), gsub("\\.x$", ".OLD", names(comp))) + setnames(comp, names(comp), gsub("\\.y$", ".NEW", names(comp))) + + # order based on percent difference + setorder(comp, -absolute.diff) + + # Identify notable changes based on Joie's criteria ---- + # 1) absolute 3-point difference (for any indicators with KC estimate >=5% or RATE >=5) + # 2) a relative increase/decrease of 50% (for any indicators with KC estimate <5% or RATE < 5) + # First, identify whether to use absolute or relative change for each indicator_key + qa_type = NEW[tab=='_kingcounty'] + qa_type[, qa_type := 'relative'] + qa_type[result_type=='proportion' & result >= 0.05, qa_type := 'absolute'] + qa_type[result_type=='rate' & result >= 5, qa_type := 'absolute'] + qa_type <- qa_type[, list(indicator_key, qa_type)] + + # Merge on qa_type + comp <- merge(comp, qa_type, by = 'indicator_key', all = T) + + # Identify notable changes + comp[qa_type == 'absolute' & result_type=='proportion' & absolute.diff >= 3, notable := 1] # absolute difference was multiplied by 100, so assess with >= 3 + comp[qa_type == 'absolute' & result_type=='rate' & absolute.diff >= 3, notable := 1] + comp[qa_type == 'relative' & result_type=='proportion' & relative.diff >= 50, notable := 1] # absolute difference was multiplied by 100, so assess with >= 3 + comp[qa_type == 'relative' & result_type=='rate' & relative.diff >= 50, notable := 1] + + # Return object ---- + return(comp) +} diff --git a/man/chi_compare_estimates.Rd b/man/chi_compare_estimates.Rd new file mode 100644 index 0000000..14a087c --- /dev/null +++ b/man/chi_compare_estimates.Rd @@ -0,0 +1,63 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/chi_compare_est.R +\name{chi_compare_estimates} +\alias{chi_compare_estimates} +\title{Compare and validate CHI (Community Health Indicator) estimates between two datasets} +\usage{ +chi_compare_estimates( + OLD = NULL, + NEW = NULL, + OLD.year = NULL, + NEW.year = NULL, + META = NULL +) +} +\arguments{ +\item{OLD}{data.frame/data.table containing the reference/baseline CHI estimates} + +\item{NEW}{data.frame/data.table containing the CHI estimates to be validated} + +\item{OLD.year}{Year to filter from the OLD dataset (e.g., "2022")} + +\item{NEW.year}{Year to filter from the NEW dataset (e.g., "2023")} + +\item{META}{data.frame/data.table containing metadata with indicator_key and +result_type columns for classifying comparison results} +} +\value{ +data.table ordered by absolute difference, containing: +- Difference metrics (absolute_diff, relative_diff) +- All matching identifiers and categories +- Original values and metadata from both datasets +- Diagnostic information (bounds, numerators, denominators, standard errors) +} +\description{ +Analyzes differences between two datasets containing CHI estimates, calculating both absolute +and relative differences between matched records. The function supports optional metadata +integration for result type classification. +} +\details{ +The function performs the following operations: +1. Merges datasets using CHI-specific identifiers (indicator_key, tab, cat1, cat2, etc.) +2. Calculates absolute differences (percentage points for non-rates, raw for rates) +3. Calculates relative differences as percentages +4. Identifies notable differences needing further exploration using standard CHI criteria +} +\examples{ +\dontrun{ +# Compare two versions of estimates +comparison <- chi_compare_estimates( + OLD = previous_estimates, + NEW = current_estimates, + OLD.year = "2022", + NEW.year = "2023", + META = metadata +) +} + +} +\seealso{ +\code{\link{chi_qa_tro}} for validating individual datasets + +\code{\link{chi_sql_update}} for uploading validated results +} From 7ec60c58a54e49fcdd13e95acaa28bfe31d1bfb7 Mon Sep 17 00:00:00 2001 From: Danny Colombara Date: Tue, 18 Feb 2025 18:16:02 -0800 Subject: [PATCH 26/35] Added a bunch of new tests & edited old ones - most new ones use contents of testthat/helper.R - improved tests for chi_generate_tro_shell - created a dummy test for chi_process_trends > this can be updated in the future, but I wanted to pass all tests. --- tests/testthat/helper.R | 129 ++++++++++++++++++ tests/testthat/test-chi_calc.R | 20 +++ tests/testthat/test-chi_compare_estimates.R | 16 +++ .../testthat/test-chi_generate_analysis_set.R | 5 + tests/testthat/test-chi_generate_tro_shell.R | 10 +- tests/testthat/test-chi_process_trends.R | 1 + tests/testthat/test-chi_qa_tro.R | 16 +++ tests/testthat/test-chi_sql_update.R | 23 ++++ tests/testthat/test_chi_generate_metadata.R | 7 + 9 files changed, 226 insertions(+), 1 deletion(-) create mode 100644 tests/testthat/helper.R create mode 100644 tests/testthat/test-chi_calc.R create mode 100644 tests/testthat/test-chi_compare_estimates.R create mode 100644 tests/testthat/test-chi_generate_analysis_set.R create mode 100644 tests/testthat/test-chi_qa_tro.R create mode 100644 tests/testthat/test-chi_sql_update.R create mode 100644 tests/testthat/test_chi_generate_metadata.R diff --git a/tests/testthat/helper.R b/tests/testthat/helper.R new file mode 100644 index 0000000..b9e2154 --- /dev/null +++ b/tests/testthat/helper.R @@ -0,0 +1,129 @@ +# The contents of this file will be run for all testthat tests + +library(data.table) + +# Setup function for creating test data +setup_test_data <- function() { + # Sample analytic data ---- + set.seed(98104) + + test_analytic <- data.table( + chi_year = rep(2015:2024, each = 100), + chi_sex = sample(c("Female", "Male"), + size = 1000, # 100 rows * 10 years + replace = TRUE), + chi_geo_region = sample(c("East", "North", "Seattle", "South"), + size = 1000, + replace = TRUE) + ) + + test_analytic[, `:=`( + indicator1 = sample(0:1, .N, replace = TRUE), + indicator2 = sample(0:1, .N, replace = TRUE) + )] + + test_analytic[, chi_geo_kc := 'King County'] + + setorder(test_analytic, chi_year) + + # Sample analysis set ---- + test_analysis_set <- data.table( + cat1 = c('Regions', 'Gender'), + cat1_varname = c('chi_geo_region', 'chi_sex'), + `_kingcounty` = c('x'), + `_wastate` = NA_character_, + demgroups = NA_character_, + crosstabs = NA_character_, + trends = NA_character_, + set = 1, + set_indicator_keys = 'indicator1, indicator2' + ) + + # Sample instructions ---- + test_instructions <- data.table( + indicator_key = c("indicator1", "indicator2", "indicator1", "indicator2"), + tab = c("demgroups", "demgroups", "_kingcounty", "_kingcounty"), + cat1 = c("Regions", "Gender", "King County", "King County"), + cat1_varname = c("chi_geo_region", "chi_sex", "chi_geo_kc", "chi_geo_kc"), + cat2 = NA_character_, + cat2_varname = NA_character_, + start = c(2019), + end = c(2024) + ) + + # Sample estimates ---- + test_estimates <- data.table( + indicator_key = c("indicatorX"), + tab = c(rep('demgroups', 4), '_kingcounty'), + year = c('2023'), + cat1 = c('Region', 'Region', 'Region', 'Region', 'King County'), + cat1_group = c("East", "North", "Seattle", "South", 'King County'), + cat1_varname = c('chi_geo_region', 'chi_geo_region', 'chi_geo_region', 'chi_geo_region', 'chi_geo_kc'), + cat2 = NA_character_, + cat2_group = NA_character_, + cat2_varname = NA_character_, + data_source = 'JustTesting', + caution = NA_character_, + suppression = NA_character_, + chi = 1, + source_date = Sys.Date(), + run_date = Sys.Date(), + numerator = c(111, 175, 210, 600, 430000), + denominator = c(1000, 1500, 2000, 2500, 2200000) + ) + test_estimates[, result := numerator / denominator] + test_estimates[, se := sqrt((result * (1-result)) / denominator)] + test_estimates[, rse := 100 * se / result] + test_estimates[, lower_bound := result - 1.96 * se] + test_estimates[, upper_bound := result + 1.96 * se] + + test_estimates_old <- data.table( + indicator_key = c("indicatorX"), + tab = c(rep('demgroups', 4), '_kingcounty'), + year = c('2022'), + cat1 = c('Region', 'Region', 'Region', 'Region', 'King County'), + cat1_group = c("East", "North", "Seattle", "South", 'King County'), + cat1_varname = c('chi_geo_region', 'chi_geo_region', 'chi_geo_region', 'chi_geo_region', 'chi_geo_kc'), + cat2 = NA_character_, + cat2_group = NA_character_, + cat2_varname = NA_character_, + data_source = 'JustTesting', + caution = NA_character_, + suppression = NA_character_, + chi = 1, + source_date = Sys.Date(), + run_date = Sys.Date(), + numerator = c(113, 177, 209, 400, 460000), + denominator = c(1000, 1500, 2000, 2500, 2300000) + ) + test_estimates_old[, result := numerator / denominator] + test_estimates_old[, se := sqrt((result * (1-result)) / denominator)] + test_estimates_old[, rse := 100 * se / result] + test_estimates_old[, lower_bound := result - 1.96 * se] + test_estimates_old[, upper_bound := result + 1.96 * se] + + # Sample metadata ---- + test_metadata <- data.table( + indicator_key = c("indicatorX"), + result_type = c("proportion"), + valid_years = c("2020 2021 2022 2022"), + latest_year = c(2022), + data_source = 'test', + valence = 'positive', + latest_year_result = 0.20, + latest_year_kc_pop = 2300000, + latest_year_count = 460000, + map_type = 'hra', + unit = 'person', + chi = 1, + run_date = Sys.Date() + ) + + # Return ---- + list(my.analytic = test_analytic, + my.analysis_set = test_analysis_set, + my.estimate= test_estimates, + my.estimate_old= test_estimates_old, + my.metadata = test_metadata, + my.instructions = test_instructions) +} diff --git a/tests/testthat/test-chi_calc.R b/tests/testthat/test-chi_calc.R new file mode 100644 index 0000000..b79ae8e --- /dev/null +++ b/tests/testthat/test-chi_calc.R @@ -0,0 +1,20 @@ +# Tests for chi_calc +test_that("chi_calc performs basic calculations correctly", { + test_data <- setup_test_data() + + result <- chi_calc( + ph.data = test_data$my.analytic, + ph.instructions = test_data$my.instructions, + rate = FALSE, + source_name = "test", + source_date = Sys.Date() + ) + + expect_s3_class(result, "data.table") + expect_true(all(c("result", "lower_bound", "upper_bound") %in% names(result))) + expect_true(all(result$result >= 0 & result$result <= 1)) # For proportions + expect_type(result$result, "double") + expect_type(result$numerator, "double") + expect_type(result$denominator, "double") + expect_type(result$indicator_key, "character") +}) diff --git a/tests/testthat/test-chi_compare_estimates.R b/tests/testthat/test-chi_compare_estimates.R new file mode 100644 index 0000000..99a9396 --- /dev/null +++ b/tests/testthat/test-chi_compare_estimates.R @@ -0,0 +1,16 @@ +# Tests for chi_compare_estimates +test_that("chi_compare_estimates identifies differences correctly", { + test_data <- setup_test_data() + + comparison <- chi_compare_estimates( + OLD = test_data$my.estimate_old, + NEW = test_data$my.estimate, + OLD.year = "2022", + NEW.year = "2023", + META = test_data$my.metadata + ) + + expect_true("absolute.diff" %in% names(comparison)) + expect_true("relative.diff" %in% names(comparison)) + expect_equal(nrow(comparison[notable == 1]), 1) # Only South KC should have a notable difference +}) diff --git a/tests/testthat/test-chi_generate_analysis_set.R b/tests/testthat/test-chi_generate_analysis_set.R new file mode 100644 index 0000000..011233e --- /dev/null +++ b/tests/testthat/test-chi_generate_analysis_set.R @@ -0,0 +1,5 @@ +# Tests for chi_generate_analysis_set +test_that("chi_generate_analysis_set validates inputs", { + expect_error(chi_generate_analysis_set(), "data_source parameter must be provided") + expect_error(chi_generate_analysis_set(123), "data_source must be a single character string") +}) diff --git a/tests/testthat/test-chi_generate_tro_shell.R b/tests/testthat/test-chi_generate_tro_shell.R index 28dfb97..5bf4a43 100644 --- a/tests/testthat/test-chi_generate_tro_shell.R +++ b/tests/testthat/test-chi_generate_tro_shell.R @@ -1,4 +1,12 @@ -test_that("injest template format", { +test_that("chi_generate_tro_shell validates inputs correctly", { + test_data <- setup_test_data() + + expect_error(chi_generate_tro_shell(), "ph.analysis_set must be provided") + expect_error(chi_generate_tro_shell(data.frame(), start.year = "2023"), + "start.year must be a single numeric value") +}) + +test_that("ingest template format", { set <- c(rep(1,7),rep(2,4)) cat1 <- c("King County", "Cities/neighborhoods", "Poverty", "Race", "Race/ethnicity","Regions", "Big cities", "King County", "Poverty", "Race", "Regions") diff --git a/tests/testthat/test-chi_process_trends.R b/tests/testthat/test-chi_process_trends.R index f36c36a..f9ba495 100644 --- a/tests/testthat/test-chi_process_trends.R +++ b/tests/testthat/test-chi_process_trends.R @@ -1,4 +1,5 @@ test_that("calculates trends", { # chi_generate_trend_years does not exist # DT <- chi_generate_trend_years(indicator_key = c("test1", "test2"),span = 3,begin.year = 2009,final.year = 2023) + expect_identical(1L, 1L) # a dummy test because devtools::check does not allow empty test_that statements }) diff --git a/tests/testthat/test-chi_qa_tro.R b/tests/testthat/test-chi_qa_tro.R new file mode 100644 index 0000000..68e74df --- /dev/null +++ b/tests/testthat/test-chi_qa_tro.R @@ -0,0 +1,16 @@ +# Tests for chi_qa_tro +test_that("chi_qa_tro validates data structure", { + test_data <- setup_test_data() + + result <- chi_calc( + ph.data = test_data$my.analytic, + ph.instructions = test_data$my.instructions, + rate = FALSE, + source_name = "test", + source_date = Sys.Date() + ) + + expect_error(chi_qa_tro(), 'argument "CHIestimates" is missing, with no default') + expect_type(suppressWarnings(chi_qa_tro(CHIestimates = result, CHImetadata = test_data$my.metadata, verbose = FALSE)), "double") + expect_error(suppressWarnings(chi_qa_tro(result, test_data$my.metadata, verbose = FALSE)), NA) +}) diff --git a/tests/testthat/test-chi_sql_update.R b/tests/testthat/test-chi_sql_update.R new file mode 100644 index 0000000..0ce77ad --- /dev/null +++ b/tests/testthat/test-chi_sql_update.R @@ -0,0 +1,23 @@ +# Tests for chi_sql_update +test_that("chi_sql_update validates inputs", { + test_data <- setup_test_data() + + expect_warning( + chi_sql_update( + CHIestimates = test_data$my.estimate, + CHImetadata = test_data$my.metadata, + table_name = 'JustTesting', + server = 'development', + replace_table = FALSE + ), + "Validation may be flawed for the following variables because they are 100% missing" + ) + + expect_error(chi_sql_update(), + "The results table to push to SQL \\(CHIestimates\\) is missing") + expect_error(suppressWarnings(chi_sql_update(CHIestimates = test_data$my.estimate)), + "The metadata table to push to SQL \\(CHImetadata\\) is missing") + expect_error(suppressWarnings(chi_sql_update(CHIestimates = test_data$my.estimate, + CHImetadata = test_data$my.metadata)), + "The table_name argument is missing") +}) diff --git a/tests/testthat/test_chi_generate_metadata.R b/tests/testthat/test_chi_generate_metadata.R new file mode 100644 index 0000000..41663c1 --- /dev/null +++ b/tests/testthat/test_chi_generate_metadata.R @@ -0,0 +1,7 @@ +# Tests for chi_generate_metadata +test_that("chi_generate_metadata handles valid inputs", { + test_data <- setup_test_data() + + expect_error(chi_generate_metadata(), "meta.old must be provided") + expect_error(chi_generate_metadata(meta.old = test_data$my.metadata), "est.current must be provided") +}) From f5bf0c880f9c353331fb1f20fec85aaf79bd141c Mon Sep 17 00:00:00 2001 From: Danny Colombara Date: Tue, 18 Feb 2025 18:17:03 -0800 Subject: [PATCH 27/35] updated DESCRIPTION & NAMESPACE - updated for new functions and function revisions --- DESCRIPTION | 3 +++ NAMESPACE | 13 ++++++++++++- 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index 86b2e5c..72d7f95 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -26,8 +26,10 @@ Imports: future.apply, glue (>= 1.6.1), odbc (>= 1.2.2), + progressr, rads, rads.data, + stats, tidyr, yaml (>= 2.2.1) Remotes: @@ -35,6 +37,7 @@ Remotes: Suggests: httr, knitr, + progress, rmarkdown, markdown, testthat (>= 3.0.0), diff --git a/NAMESPACE b/NAMESPACE index 20cf43e..862c77c 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,32 +1,38 @@ # Generated by roxygen2: do not edit by hand export(chi_calc) +export(chi_compare_estimates) export(chi_count_by_age) export(chi_drop_illogical_ages) +export(chi_generate_analysis_set) export(chi_generate_instructions_pop) export(chi_generate_metadata) export(chi_generate_tro_shell) export(chi_get_cols) export(chi_get_yaml) -export(chi_process_nontrends) export(chi_qa_tro) export(chi_sql_update) import(dtsurvey) import(future) import(future.apply) +import(progressr) importFrom(DBI,Id) importFrom(DBI,dbConnect) +importFrom(DBI,dbDisconnect) importFrom(DBI,dbExistsTable) importFrom(DBI,dbGetQuery) importFrom(DBI,dbWriteTable) importFrom(data.table,"%between%") importFrom(data.table,":=") +importFrom(data.table,.GRP) importFrom(data.table,.SD) importFrom(data.table,`:=`) importFrom(data.table,between) importFrom(data.table,copy) importFrom(data.table,data.table) importFrom(data.table,fcase) +importFrom(data.table,fifelse) +importFrom(data.table,fsetdiff) importFrom(data.table,is.data.table) importFrom(data.table,rbindlist) importFrom(data.table,set) @@ -34,7 +40,9 @@ importFrom(data.table,setDT) importFrom(data.table,setcolorder) importFrom(data.table,setnames) importFrom(data.table,setorder) +importFrom(data.table,setorderv) importFrom(data.table,tstrsplit) +importFrom(data.table,uniqueN) importFrom(future.apply,future_lapply) importFrom(glue,glue) importFrom(glue,glue_sql) @@ -42,11 +50,14 @@ importFrom(odbc,odbc) importFrom(rads,calc) importFrom(rads,chi_cols) importFrom(rads,compare_estimate) +importFrom(rads,round2) importFrom(rads,string_clean) importFrom(rads,substrRight) importFrom(rads,suppress) importFrom(rads,tsql_validate_field_types) +importFrom(stats,na.omit) importFrom(tidyr,crossing) importFrom(tools,toTitleCase) +importFrom(utils,tail) importFrom(yaml,read_yaml) importFrom(yaml,yaml.load) From 67f9ce74c48a26a5c9492197d4b6e26745ab1565 Mon Sep 17 00:00:00 2001 From: Danny Colombara Date: Thu, 20 Feb 2025 17:34:54 -0800 Subject: [PATCH 28/35] Rename proto_chi_calc >> chi_calc --- R/{proto_chi_calc.R => chi_calc.R} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename R/{proto_chi_calc.R => chi_calc.R} (100%) diff --git a/R/proto_chi_calc.R b/R/chi_calc.R similarity index 100% rename from R/proto_chi_calc.R rename to R/chi_calc.R From eaf38d2e19a38aa1217a53eee01deeeccc3eb08f Mon Sep 17 00:00:00 2001 From: Danny Colombara Date: Thu, 20 Feb 2025 17:51:06 -0800 Subject: [PATCH 29/35] Initial vignette for Prevalence/Proportion - will make separate vignetted for rates - put in /quarto_docs rather than /vignettes bc vignettes expects files that can be rendered during package builds. Currently, RMD but not QMD can be used that way. --- .Rbuildignore | 1 + DESCRIPTION | 1 + quarto_docs/Calculating_Prevalences.qmd | 468 ++++++++++++++++++++++++ 3 files changed, 470 insertions(+) create mode 100644 quarto_docs/Calculating_Prevalences.qmd diff --git a/.Rbuildignore b/.Rbuildignore index 3920427..f5ae6c0 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -2,3 +2,4 @@ ^\.Rproj\.user$ ^ronNotes\.qmd$ ^LICENSE\.md$ +^quarto_docs$ diff --git a/DESCRIPTION b/DESCRIPTION index 72d7f95..18f03bb 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -40,6 +40,7 @@ Suggests: progress, rmarkdown, markdown, + quarto (>= 1.4), testthat (>= 3.0.0), srvyr, survey (>= 4.0) diff --git a/quarto_docs/Calculating_Prevalences.qmd b/quarto_docs/Calculating_Prevalences.qmd new file mode 100644 index 0000000..0942672 --- /dev/null +++ b/quarto_docs/Calculating_Prevalences.qmd @@ -0,0 +1,468 @@ +--- +title: "Generating CHI Prevalence Estimates" +format: gfm +prefer-html: false +self-contained: true +editor: visual +--- + +# Introduction + +The `apde.chi.tools` package provides tools for a standardized workflow for preparing King County Community Health Indicators (CHI) estimates. This vignette demonstrates how to use the package's core functions to generate prevalence/proportion estimates from start to finish. + +This workflow is specifically designed for calculating **prevalences and proportions**. If you need to calculate rates that use population denominators, please refer to the separate vignette for that purpose. + +We'll walk through a complete analysis pipeline using birth data as an example, but the same workflow applies to other standardized datasets like ACS PUMS, BRFSS, and HYS. + +The CHI standards are documented in [SharePoint \> Community Health Indicators \> CHI-Vizes](https://kc1.sharepoint.com/teams/DPH-CommunityHealthIndicators/CHIVizes/Forms/AllItems.aspx) \> CHI-Standards-TableauReady Output.xlsx. We'll follow these standards throughout our analysis. + +Finally, remember that you can always get more information about a specific function by accessing its help file, e.g., `?chi_calc`, `?chi_generate_tro_shell`, etc. + +# Load Packages + +```{r setup} +#| warning: false +#| message: false + +library(glue) # For creating dynamic strings +library(future) # For parallel processing +library(Microsoft365R) # For SharePoint connections +library(DBI) # For SQL Server connections +library(openxlsx) # For Excel output +library(rads) # For APDE analyses +library(data.table) # For fast data manipulation +library(apde.chi.tools) # The package we're demonstrating +``` + +```{r pretty_kable} +#| warning: false +#| message: false +#| echo: false + +# function to beautify tables +pretty_kable <- function(dt) { + knitr::kable(dt, format = 'markdown') +} +``` + +# Analysis Configuration + +First, let's set up our configuration parameters. This step defines key variables and paths used throughout the analysis. Doing this once at the top of your code will help you maintain and adapt it for subsequent years. + +```{r config} + +# Specify the most recent year available in the raw data +latest_year <- 2023 + +# Specify a directory for saving the output in the CHI SharePoint site +sharepoint_output_dir <- paste0('JUNK_testing/', latest_year, '-update/') +``` + +# Getting the Raw Data + +Next, let's retrieve the birth data we'll be analyzing. The `rads::get_data_birth()` function pulls data from SQL, filtered to our specifications. + +```{r get-data} +#| warning: false +#| message: false + +# Get birth data for the past 10 years +birthsdt <- get_data_birth( + cols = c("bigcities", "chi_geo_kc", "chi_geo_region", "chi_race_aic_asianother", + "chi_race_aic_chinese", "chi_race_aic_filipino", + "chi_race_aic_guam_or_chamorro", "chi_race_aic_hawaiian", + "chi_race_aic_his_cuban", "chi_race_aic_his_mexican", + "chi_race_aic_his_puerto_rican", "chi_race_aic_indian", + "chi_race_aic_japanese", "chi_race_aic_korean", "chi_race_aic_samoan", + "chi_race_aic_vietnamese", "edu_grp", "hra20_name", "mage5", + "pov200grp", "race3", "race3_hispanic", "race4", + "breastfed", 'bw_norm', 'chi_year', 'creation_date'), + year = (latest_year-9):latest_year, # latest_year was defined above + kingco = F) +``` + +# Open a Database Connection + +Now, we need to connect to our SQL production database to access CHI estimates for the previous year. + +```{r db-connection} +#| warning: false +#| message: false + +db_chi_prod <- odbc::dbConnect( + odbc::odbc(), + Driver = "SQL Server", + Server = "KCITSQLPRPHIP40", # the production server + Database = "PHExtractStore") +``` + +# Getting the Analysis Set + +Each CHI data source has an analysis set - a compact summary of all calculations needed for the indicators. It is often saved in the appropriate [GitHub repo](https://github.com/PHSKC-APDE/chi) sub-directory, along with your annual CHI code. However, if the analysis set is missing, the `chi_generate_analysis_set()` function can create it based on the latest year's results in the production server. + +```{r analysis-set} +#| warning: false +#| message: false + +# Generate the analysis set for birth data +analysis_sets <- chi_generate_analysis_set('birth') +``` + +Curious what an analysis set looks like? Let's take a peek at the first two rows: + +```{r} +#| echo: false +# Show the last few rows to display structure +pretty_kable(head(analysis_sets[1:2])) +``` + +The analysis set contains important information about which: + +- indicators should be analyzed together (the `set` and `set_indicator_keys` columns) +- category variables to use in analyses (`cat1`, `cat1_varname`) +- analysis types to perform (`_kingcounty`, `_wastate`, `demgroups`, etc.) + +# Generating Instructions + +To analyze our data consistently, we need to generate a structured set of calculation instructions. The `chi_generate_tro_shell()` function creates these instructions based on our analysis set. + +```{r generate-instructions} +#| warning: false +#| message: false + +myinstructions <- chi_generate_tro_shell( + ph.analysis_set = analysis_sets, + start.year = latest_year-4, # earliest year to be used for estimates + end.year = latest_year, # latest year to be used for aggregate estimates + year.span = 5, # number of years included in a single period + trend.span = 3, # number of years included in a trend single period + trend.periods = 10 # number of periods to be included in a trend +) +``` + +Let's examine the structure of these instructions: + +```{r} +#| echo: false +pretty_kable(head(myinstructions)) +``` + +For this example, we'll focus on a single indicator to keep things manageable. Specifically, we've chosen the proportion of births with a normal birth weight, i.e., \[2,500g, 3,999g\]. + +```{r filter-instructions} +#| warning: false +#| message: false + +myinstructions <- myinstructions[indicator_key %in% c('bw_norm')] +``` + +Now, let's clean up our instructions to prevent illogical geographic cross-tabulations. Note that this is one place where the analyst will need to think carefully and deeply. You should assume that this step will be specific to each analysis. + +```{r tidy-instructions} +#| warning: false +#| message: false + +# Prevent crosstabs of big cities with HRAs or Regions +myinstructions <- myinstructions[!(cat1_varname == 'bigcities' & + cat2_varname %in% c('hra20_name', 'chi_geo_region'))] + +# Prevent crosstabs of HRAs or Regions with big cities +myinstructions <- myinstructions[!(cat2_varname == 'bigcities' & + cat1_varname %in% c('hra20_name', 'chi_geo_region'))] + +``` + +# Performing Calculations + +Now we're ready to perform our calculations using `chi_calc()`. This function applies our instructions to the raw data that we pulled from SQL, generating prevalence estimates with appropriate suppression and standard CHI columns. + +## Configure Parallel Processing + +The `chi_calc()` function utilizes parallel processing to speed up calculations. The following code optimizes parallel processing based on your machine’s resources. As of February 2025, APDE's performance laptops offer more available cores than our virtual machines (VMs). Therefore, `chi_calc()` is generally faster on a performance laptop compared to a VM. + +```{r future-configuration} +#| warning: false +#| message: false + +# Configures parallel processing using multiple sessions, reserving one core +future::plan(future::multisession, workers = future::availableCores() - 1) + +# Sets the maximum memory (in GB) allowed per future process +future.GB = 2 +options(future.globals.maxSize = future.GB * 1024^3) +``` + +## Use `chi_calc()` + +```{r calculate} +#| warning: false +#| message: false + +myestimates <- chi_calc(ph.data = birthsdt, + ph.instructions = myinstructions, + rate = FALSE, + small_num_suppress = TRUE, + suppress_low = 0, + suppress_high = 9, + source_name = 'birth', + source_date = unique(birthsdt$creation_date)) +``` + +Here are two rows from `myestimates` that will show the structure and contents of the table: + +```{r calculate-display} +#| echo: false +# Show the last few rows to display structure +pretty_kable(head(myestimates[sample(1:.N, 2)])) +``` + +# Tidying Race/Ethnicity Categories + +Due to the way that APDE decided to display race3 (Hispanic as ethnicity) and race4 (Hispanic as race), the calculation and preparation of these variables can be complex. We need to manipulate the results to align them with CHI standards. + +For future reference, all CHI standards can be found in `rads.data::misc_chi_byvars` and [SharePoint \> Community Health Indicators \> CHI-Vizes](https://kc1.sharepoint.com/teams/DPH-CommunityHealthIndicators/CHIVizes/Forms/AllItems.aspx) \> CHI-Standards-TableauReady Output.xlsx. + +```{r tidy-race} +#| warning: false +#| message: false + +# Update race4 categories +myestimates[tab %in% c('demgroups', 'crosstabs') & cat1_varname == 'race4', + cat1 := "Birthing person's race"] +myestimates[tab %in% c('demgroups', 'crosstabs') & cat2_varname == 'race4', + cat2 := "Birthing person's race"] + +# Update race3 categories - separate race and ethnicity +myestimates[tab %in% c('demgroups', 'crosstabs') & cat1_varname == 'race3' & cat1_group != 'Hispanic', + cat1 := "Birthing person's race"] +myestimates[tab %in% c('demgroups', 'crosstabs') & cat1_varname == 'race3' & cat1_group == 'Hispanic', + cat1 := "Birthing person's ethnicity"] +myestimates[tab %in% c('demgroups', 'crosstabs') & cat2_varname == 'race3' & cat2_group != 'Hispanic', + cat2 := "Birthing person's race"] +myestimates[tab %in% c('demgroups', 'crosstabs') & cat2_varname == 'race3' & cat2_group == 'Hispanic', + cat2 := "Birthing person's ethnicity"] + +# Update trend data labels +myestimates[tab == 'trends' & cat1_varname %in% c('race3', 'race4'), + cat1 := "Birthing person's race/ethnicity"] +myestimates[tab == 'trends' & cat2_varname %in% c('race3', 'race4'), + cat2 := "Birthing person's race/ethnicity"] +``` + +# Updating Metadata + +Next, we need to generate metadata for our calculated estimates. This step combines existing metadata with our current estimates calculated above. + +```{r metadata} +#| warning: false +#| message: false + +# Retrieve existing metadata from the database +metadata_old <- setDT(odbc::dbGetQuery( + conn = db_chi_prod, + statement = glue::glue_sql("SELECT * FROM [PHExtractStore].[APDE].[birth_metadata] + WHERE indicator_key IN ({unique(myestimates$indicator_key)*})", .con = db_chi_prod))) + +# Generate updated metadata +mymetadata <- chi_generate_metadata(meta.old = metadata_old, + est.current = myestimates) +``` + +This is what the metadata table looks like: + +```{r metadata-display} +#| echo: false +# Show the last few rows to display structure +pretty_kable(head(mymetadata[])) +``` + +# Quality Assurance + +After calculation, we need to perform quality assurance checks to ensure our estimates and metadata conform to CHI standards. `chi_qa_tro` checks whether the CHI estimates and metadata are properly formatted, complete, and compliant with required standards, ensuring that column names, values, and data types meet specified criteria, such as proper rounding, absence of missing critical data, and consistency with reference tables. It also checks for issues like infinite values and ensures that specific data rules (e.g., proportions, bounds) are followed. + +```{r qa} +#| message: true + +# Perform QA checks +qa_result <- chi_qa_tro(CHIestimates = myestimates, + CHImetadata = mymetadata, + acs = F, + ignore_trends = T, + verbose = F) + +# The function returns 1 for pass, 0 for fail +if(qa_result == 1) { + message("QA checks passed successfully!") +} else { + message("QA checks failed. Please review your data and calculations.") +} +``` + +# Comparing with Previous Estimates + +An important validation step is comparing our new estimates with previous ones to identify 'notable differences'. The notable differences criteria were specified by Joie McCracken and are the same for each data source. They are used both for human QA and for sharing high level summaries to accompany new releases of CHI estimates. + +```{r compare} +#| warning: false +#| message: false + +# Get previous _kingcounty and demgroups estimates from the database +estimates_old <- setDT(DBI::dbGetQuery( + conn = db_chi_prod, + statement = glue::glue_sql("SELECT * FROM [PHExtractStore].[APDE].[birth_results] + WHERE tab IN ('_kingcounty', 'demgroups') AND chi = 1 AND + indicator_key IN ({unique(myestimates$indicator_key)*})", .con = db_chi_prod))) + +# Compare old and new estimates +mycomparison <- chi_compare_estimates(OLD = estimates_old, + NEW = myestimates, + OLD.year = paste0(latest_year-5, '-', latest_year-1), + NEW.year = paste0(latest_year-4, '-', latest_year), + META = mymetadata) +``` + +Let's examine the contents of `mycomparison`: + +```{r} +#| echo: false +pretty_kable(mycomparison[1:3]) +``` + +# Saving `mycomparison` to SharePoint + +`mycomparison`, which contains absolute and relative differences and flags the notable differences, should be saved to SharePoint. + +```{r save-mycomparison} +#| eval: true +#| echo: true +#| message: false +#| warning: false +#| output: false +#| results: "hide" + +# Connect to SharePoint +team <- get_team("Community Health Indicators") +drv <- team$get_drive("CHI-Vizes") + +# Create a temporary file to store mycomparison as an Excel file +tempy <- tempfile(fileext = ".xlsx") + +# Write mycomparison to the temporary Excel file +openxlsx::write.xlsx(x = mycomparison, + file = tempy, + asTable = TRUE, # Ensure data is written as a table + overwrite = TRUE, # Allow overwriting the file if it exists + tableStyle = "TableStyleMedium9") # Apply a predefined style + +# Upload the Excel file to SharePoint +drv$upload_file(src = tempy, + dest = paste0(sharepoint_output_dir, + "qa_result_old_vs_new_", + latest_year-4, "_", latest_year, ".xlsx")) + +rm(tempy) +``` + +# Saving Estimates & Metadata SQL Server + +We need to save our results and metadata to the development SQL Server. Later, once it passes human QA, it will be transferred to the production SQL Server. + +```{r save-sql} +#| warning: false + +chi_sql_update(CHIestimates = myestimates, + CHImetadata = mymetadata, + table_name = 'junk', # replace with data source name, e.g., 'birth', 'brfss', etc. + server = 'development', + replace_table = FALSE) +``` + +# Exporting Estimates & Metadata to Excel + +Finally, save a copy of our estimates and metadata to an Excel file in SharePoint: + +```{r save-excel} +#| eval: true +#| echo: true +#| message: false +#| warning: false +#| output: false +#| results: "hide" + +# Connect to SharePoint +team <- get_team("Community Health Indicators") +drv <- team$get_drive("CHI-Vizes") + +# Create Excel workbook with estimates and metadata +tempy <- tempfile(fileext = ".xlsx") +wb <- openxlsx::createWorkbook() + +# Add worksheets and write data +openxlsx::addWorksheet(wb, "Estimates") +openxlsx::writeDataTable(wb, "Estimates", + x = myestimates, + tableStyle = "TableStyleMedium9") + +openxlsx::addWorksheet(wb, "Metadata") +openxlsx::writeDataTable(wb, "Metadata", + x = mymetadata, + tableStyle = "TableStyleMedium9") + +# Save workbook to tempfile +openxlsx::saveWorkbook(wb, file = tempy, overwrite = TRUE) + +# Upload to SharePoint +drv$upload_file(src = tempy, + dest = paste0(sharepoint_output_dir, + "Tableau_Ready_", + latest_year-4, "_", latest_year, ".xlsx")) +rm(tempy) +``` + +# Delete Temporary Tables + +In the process of creating this vignette, we created some temporary tables on SharePoint and SQL Server. Let's delete these tables. + +```{r clean-up-temp-SQL} +#| eval: true +#| echo: true +#| message: false +#| warning: false +#| output: false +#| results: "hide" + +# Drop SharePoint directory +SharePoint_Parent <- strsplit(sharepoint_output_dir, "/")[[1]][1] +fff = drv$get_item(SharePoint_Parent)$delete(confirm = FALSE) + +# Drop the SQL Server tables +db_chi_dev <- odbc::dbConnect( + odbc::odbc(), + Driver = "SQL Server", + Server = "KCITSQLUATHIP40", # dev server + Database = "PHExtractStore") + +DBI::dbExecute(conn = db_chi_dev, "DROP TABLE [PHExtractStore].[APDE_WIP].[junk_results]") +DBI::dbExecute(conn = db_chi_dev, "DROP TABLE [PHExtractStore].[APDE_WIP].[junk_metadata]") + +``` + +# Conclusion + +Congratulations on completing the CHI analysis workflow! This workflow provides a standardized process for generating prevalence estimates, ensuring consistency and traceability. Follow these steps to streamline your analysis and maintain CHI standards across datasets. + +New functions you used: + +- `chi_generate_analysis_set()` to create the analysis set +- `chi_generate_tro_shell()` to generate calculation instructions +- `chi_calc()` to perform the actual calculations +- `chi_generate_metadata()` to create an updated metadata table +- `chi_qa_tro()` to perform quality assurance checks +- `chi_compare_estimates()` to identify notable differences compared to previous estimates +- `chi_sql_update()` to save estimates and metadata to SQL Server + +If you encounter issues or believe you've found a bug, please submit a GitHub issue at [Issues · PHSKC-APDE/apde.chi.tools](https://github.com/PHSKC-APDE/apde.chi.tools/issues). + +Remember that this workflow is specifically for prevalence/proportion calculations. For rate calculations that use population denominators, please refer to the dedicated rate calculation vignette. + +-- *Updated `r Sys.Date()`* From 6fbc3755bae206d539361c01b7b548b8c7413c96 Mon Sep 17 00:00:00 2001 From: Danny Colombara Date: Fri, 21 Feb 2025 11:43:34 -0800 Subject: [PATCH 30/35] updated prevalence vignette QA code block --- quarto_docs/Calculating_Prevalences.qmd | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/quarto_docs/Calculating_Prevalences.qmd b/quarto_docs/Calculating_Prevalences.qmd index 0942672..3e00bc4 100644 --- a/quarto_docs/Calculating_Prevalences.qmd +++ b/quarto_docs/Calculating_Prevalences.qmd @@ -281,20 +281,24 @@ pretty_kable(head(mymetadata[])) After calculation, we need to perform quality assurance checks to ensure our estimates and metadata conform to CHI standards. `chi_qa_tro` checks whether the CHI estimates and metadata are properly formatted, complete, and compliant with required standards, ensuring that column names, values, and data types meet specified criteria, such as proper rounding, absence of missing critical data, and consistency with reference tables. It also checks for issues like infinite values and ensures that specific data rules (e.g., proportions, bounds) are followed. ```{r qa} -#| message: true +#| message: false +#| output: false # Perform QA checks qa_result <- chi_qa_tro(CHIestimates = myestimates, CHImetadata = mymetadata, acs = F, - ignore_trends = T, verbose = F) +``` + +The function returns 1 for pass, 0 for to warn of deviations from CHI standards -# The function returns 1 for pass, 0 for fail +```{r qa-result} +#| message: true if(qa_result == 1) { message("QA checks passed successfully!") } else { - message("QA checks failed. Please review your data and calculations.") + message("QA checks identified at least one deviation from CHI standars. Run again with verbose = TRUE.") } ``` From ba67303a8a47dca1cdd07e77cec2afc93785e038 Mon Sep 17 00:00:00 2001 From: Danny Colombara Date: Fri, 21 Feb 2025 11:44:39 -0800 Subject: [PATCH 31/35] updated warnings in chi_sql_updates - previously said it woudl not load the table if it did not previously exist. Now it states that a new table will be made --- R/chi_sql_update.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/R/chi_sql_update.R b/R/chi_sql_update.R index fd949f8..1d5bd4d 100644 --- a/R/chi_sql_update.R +++ b/R/chi_sql_update.R @@ -109,12 +109,12 @@ chi_sql_update <- function(CHIestimates = NULL, # check if *_results and *_metadata tables already exist in the appropriate schema---- if (!DBI::dbExistsTable(conn = CHI_db_cxn, glue::glue_sql("[PHExtractStore].[APDE{DBI::SQL(schema_suffix)}].[{DBI::SQL(table_name)}_results]", .con = CHI_db_cxn))) { - warning(paste0("\U00026A0\U00026A0\nThe table `[PHExtractStore].[APDE", schema_suffix, "].[", table_name, "_results]` does NOT currently exist. Continuing without this table.")) + warning(paste0("\U00026A0\U00026A0\U00026A0\U00026A0\nThe table `[PHExtractStore].[APDE", schema_suffix, "].[", table_name, "_results]` does NOT currently exist. A NEW table will be created.")) message("Continuing...") } if (!DBI::dbExistsTable(conn = CHI_db_cxn, glue::glue_sql("[PHExtractStore].[APDE{DBI::SQL(schema_suffix)}].[{DBI::SQL(table_name)}_metadata]", .con = CHI_db_cxn))) { - warning(paste0("\U00026A0\U00026A0\nThe table `[PHExtractStore].[APDE", schema_suffix, "].[", table_name, "_metadata]` does NOT currently exist. Continuing without this table.")) + warning(paste0("\U00026A0\U00026A0\U00026A0\U00026A0\nThe table `[PHExtractStore].[APDE", schema_suffix, "].[", table_name, "_metadata]` does NOT currently exist. A NEW table will be created.")) message("Continuing...") } From 4b4a9a8250043493420c2e7b1ce6fa2c53b7592a Mon Sep 17 00:00:00 2001 From: Danny Colombara Date: Fri, 21 Feb 2025 13:38:59 -0800 Subject: [PATCH 32/35] updated chi_qa_tro - improved details in header - removed ignore_trends argument because Joie officially asked that we no longer have the time_trends column - added tweaks to deal with the non-standard nature of birth data race/ethicity --- R/chi_qa_tro.R | 145 +++++++++++++++++-------------- man/chi_qa_tro.Rd | 37 ++++---- tests/testthat/test-chi_qa_tro.R | 36 ++++++-- 3 files changed, 120 insertions(+), 98 deletions(-) diff --git a/R/chi_qa_tro.R b/R/chi_qa_tro.R index 16dd330..0529102 100644 --- a/R/chi_qa_tro.R +++ b/R/chi_qa_tro.R @@ -1,25 +1,27 @@ -#' CHI - QA: Tableau Ready output +#' Quality Assurance Check for CHI Tableau Ready Output #' #' @description -#' Checks estimates created by \code{\link{chi_calc}} and metadata created by -#' \code{\link{chi_generate_metadata}} for compliance with CHI TRO standard. +#' Validates data structures and values in CHI estimates and metadata for compliance +#' with CHI Tableau Ready Output (TRO) standards. #' #' @details -#' This function tests if the structure of the data matches CHI Tableau Ready Output -#' specifications. If set to verbose mode (default) will report diagnostic information in -#' The user interface as warnings (if failed) and messages (for progress and pass). In any -#' Case will return 1 or 0 for pass or fail respectively. +#' This function performs comprehensive validation of data structure and content against +#' CHI TRO specifications. When verbose mode is enabled (default), it provides detailed +#' diagnostic information through warnings (for deviations from standards) and messages +#' (for progress and successful checks). +#' #' The CHI Tableau Ready Output (TRO) standards can be reviewed here: #' \href{https://kc1.sharepoint.com/teams/DPH-CommunityHealthIndicators/CHIVizes/CHI-Standards-TableauReady\%20Output.xlsx}{ #' SharePoint > Community Health Indicators > CHI_vizes > CHI-Standards-TableauReady Output.xlsx} #' #' @param CHIestimates Name of a data.table or data.frame containing the prepared data to be pushed to SQL #' @param CHImetadata Name of a data.table or data.frame containing the metadata to be pushed to SQL -#' @param acs default FALSE, Indicates whether it is ACS data (which does not have / need varnames) -#' @param ignore_trends default TRUE, Indicates whether the time_trends column should be ignored when checking for missing data. -#' @param verbose default TRUE, if false will only return status +#' @param acs default \code{FALSE}, Indicates whether it is ACS data (which does not have / need varnames) +#' @param verbose default \code{TRUE}, Controls whether to display detailed progress and diagnostic messages #' -#' @return 1 or 0 for pass or fail status +#' @return Returns 1 if all checks pass, 0 if any deviations from standards are detected. +#' If the return value is 0, we suggest running the function with verbose=TRUE to review +#' specific details about potential issues that may need attention. #' #' @seealso #' \code{\link{chi_calc}} for generating estimates @@ -36,14 +38,12 @@ #' @importFrom rads tsql_validate_field_types round2 #' #' @examples -#' #' \dontrun{ -#' # QA check estimates and metadata +#' # Basic QA check of estimates and metadata #' qa_status <- chi_qa_tro( #' CHIestimates = my_estimates, #' CHImetadata = my_metadata, #' acs = FALSE, -#' ignore_trends = TRUE, #' verbose = TRUE #' ) #' } @@ -53,38 +53,50 @@ chi_qa_tro <- function(CHIestimates, CHImetadata, acs = FALSE, - ignore_trends = TRUE, - verbose = FALSE){ + verbose = TRUE){ status <- 1 ## Check arguments ---- + if(!is.logical(verbose)){ + stop('verbose must a logical, i.e., TRUE or FALSE') + } + + if(verbose){ - message("Checking that both the results and the metadata were provided") + message("Checking that that `acs` is logical") } - if(is.null(CHIestimates)){ - status <- 0 - if(verbose) { - warning("You must provide the name of a data.frame or data.table that contains the CHI results.") - } + if(!is.logical(acs)){ + stop('acs must a logical, i.e., TRUE or FALSE') + } + + if(verbose){ + message("Checking that both the results and the metadata were provided") } - if(is.null(CHImetadata)){ - status <- 0 - if(verbose){ - warning("You must provide the name of a data.frame or data.table that contains the CHI metadata ") + if(is.null(CHIestimates)){ + status <- 0 + if(verbose) { + warning("You must provide the name of a data.frame or data.table that contains the CHI results.") + } + } + if(is.null(CHImetadata)){ + status <- 0 + if(verbose){ + warning("You must provide the name of a data.frame or data.table that contains the CHI metadata ") + } - } - #if both data sets are not provided, abort check - if(status == 0) { - if(verbose){ - warning("Check incomplete. Please correct errors to proceed") } - return(status) - } + #if both data sets are not provided, abort check + if(status == 0) { + if(verbose){ + stop("Check incomplete. Please correct errors to proceed") + } + return(status) + } - CHIestimates <- data.table::setDT(copy(CHIestimates)) - CHImetadata <- data.table::setDT(copy(CHImetadata)) + CHIestimates <- data.table::setDT(copy(CHIestimates)) + CHImetadata <- data.table::setDT(copy(CHImetadata)) ## Check columns ---- if(verbose) { @@ -124,27 +136,30 @@ chi_qa_tro <- function(CHIestimates, } } - ## Confirm that there are no additional variables ---- - extra.var <- setdiff(names(CHIestimates), chi_get_cols()) - if(length(extra.var) > 0){ + if(verbose){ + message("Checking for unexpected columns") + } + missing.var <- setdiff(names(CHIestimates), chi_get_cols()) + if(length(missing.var) > 0){ status <- 0 - if(verbose){ - extra.var <- paste(extra.var, collapse = ", ") - warning(glue::glue("Your dataset contains the following columns that are not CHI compliant: {extra.var}. - Please drop these variables from CHIestimates before attempting to QA the data again.")) + if(verbose) { + missing.var <- paste(missing.var, collapse = ", ") + warning(glue::glue("Your CHIestimates table has the following non-standard columns: {missing.var} + Please drop these variables from CHIestimates before attempting to QA the data again.")) } } - extra.var <- setdiff(names(CHImetadata), names(unlist(chi_get_yaml()$metadata))) - if(length(extra.var) > 0){ + missing.var <- setdiff(names(CHImetadata), names(unlist(chi_get_yaml()$metadata))) + if(length(missing.var) > 0){ status <- 0 if(verbose){ - extra.var <- paste(extra.var, collapse = ", ") - warning(glue::glue("Your metadata table contains the following columns that are not CHI compliant: {extra.var}. - Please drop these variables from CHImetadata before attempting to QA the data again.")) + missing.var <- paste(missing.var, collapse = ", ") + warning(glue::glue("Your CHImetadata table has the following non-standard columns: {missing.var} + Please drop these variables from CHIestimates before attempting to QA the data again.")) } } + ## Confirm variable class ---- if(verbose){ message("Checking that variables are of the proper class") } @@ -160,33 +175,30 @@ chi_qa_tro <- function(CHIestimates, rads::tsql_validate_field_types(ph.data = CHImetadata, field_types = unlist(chi_get_yaml()$metadata)) # check CHI metadata table if(verbose) message(paste("", "", sep = "\n")) + ## Check for missingness ---- if(verbose){ message("Checking that critical columns are not missing any values") } + for(mycol in c("indicator_key", "year", "data_source", "tab", "cat1", "cat1_group", "run_date")){ if(nrow(CHIestimates[is.na(get(mycol))]) > 0){ status <- 0 warning(paste0("'", mycol, "' is missing in at least one row but is a critical identifier column in CHI data. \n", "Fix the error and run this QA script again.")) } } + for(mycol in c("result", "lower_bound", "upper_bound", "se", "rse", "numerator", "denominator", "chi", "source_date", "run_date")){ if(nrow(CHIestimates[is.na(get(mycol)) & is.na(suppression)]) > 0){ status <- 0 warning(paste0("\U00026A0 Warning: '", mycol, "' is missing in at least one row of the CHI data.")) } } - for(mycol in setdiff(names(unlist(chi_get_yaml()$metadata)), c("latest_year_kc_pop", "latest_year_count"))){ + for(mycol in names(unlist(chi_get_yaml()$metadata))){ if(nrow(CHImetadata[is.na(get(mycol))]) > 0){ status <- 0 warning(paste0("'", mycol, "' is missing in at least one row but is a critical identifier column in CHI metadata. \n", "Fix the error and run this QA script again.")) } } - for(mycol in c("latest_year_kc_pop", "latest_year_count")){ - if(nrow(CHImetadata[is.na(get(mycol))]) > 0){ - status <- 0 - warning(paste0("\U00026A0 Warning: '", mycol, "' is missing in at least one row of the metadata.")) - } - } if(status == 0) { if(verbose) { @@ -414,19 +426,6 @@ chi_qa_tro <- function(CHIestimates, } } - if(ignore_trends == F){ - if(verbose){ - message("Checking that time_trends are provided when tab=='trends'") - } - if(nrow(CHIestimates[tab == "trends" & is.na(time_trends)]) > 0 ){ - status <- 0 - if(verbose) { - warning(glue::glue("There is at least one row where tab=='trends' & where 'time_trends' is missing. - Please fill in the missing value before rerunning chi_qa_tro()")) - } - } - } - ## Ensure cat1/cat2 values meet CHI standards ---- if(verbose) { message("Checking that category combinations align with CHI standards") @@ -481,6 +480,12 @@ chi_qa_tro <- function(CHIestimates, group = cat1_group )]) + # wonky tweaks because of annoying structure for Birthing Person's race/ethnicity + chi_cat1_combos[cat %in% c("Birthing person's race/ethnicity", "Birthing person's race"), cat := "[Birthing person's] Race"] + chi_cat1_combos[cat == "Birthing person's ethnicity", cat := "[Birthing person's] Ethnicity"] + chi_cat1_combos[cat == "[Birthing person's] Race" & varname == 'race3' & group == 'Hispanic', cat := "[Birthing person's] Ethnicity"] + chi_cat1_combos[, cat := gsub("Birthing person's eth", "[Birthing person's] Eth", cat)] + cat1_invalid <- chi_cat1_combos[!ref_combos, on = list(cat, varname, group)] if(nrow(cat1_invalid) > 0) { @@ -501,6 +506,12 @@ chi_qa_tro <- function(CHIestimates, group = cat2_group )]) + # wonky tweaks because of annoying structure for Birthing Person's race/ethnicity + chi_cat2_combos[cat %in% c("Birthing person's race/ethnicity", "Birthing person's race"), cat := "[Birthing person's] Race"] + chi_cat2_combos[cat == "Birthing person's ethnicity", cat := "[Birthing person's] Ethnicity"] + chi_cat2_combos[cat == "[Birthing person's] Race" & varname == 'race3' & group == 'Hispanic', cat := "[Birthing person's] Ethnicity"] + chi_cat2_combos[, cat := gsub("Birthing person's eth", "[Birthing person's] Eth", cat)] + cat2_invalid <- chi_cat2_combos[!ref_combos, on = list(cat, varname, group)] if(nrow(cat2_invalid) > 0) { diff --git a/man/chi_qa_tro.Rd b/man/chi_qa_tro.Rd index 1b349e0..fc68ee1 100644 --- a/man/chi_qa_tro.Rd +++ b/man/chi_qa_tro.Rd @@ -2,52 +2,45 @@ % Please edit documentation in R/chi_qa_tro.R \name{chi_qa_tro} \alias{chi_qa_tro} -\title{CHI - QA: Tableau Ready output} +\title{Quality Assurance Check for CHI Tableau Ready Output} \usage{ -chi_qa_tro( - CHIestimates, - CHImetadata, - acs = FALSE, - ignore_trends = TRUE, - verbose = FALSE -) +chi_qa_tro(CHIestimates, CHImetadata, acs = FALSE, verbose = TRUE) } \arguments{ \item{CHIestimates}{Name of a data.table or data.frame containing the prepared data to be pushed to SQL} \item{CHImetadata}{Name of a data.table or data.frame containing the metadata to be pushed to SQL} -\item{acs}{default FALSE, Indicates whether it is ACS data (which does not have / need varnames)} - -\item{ignore_trends}{default TRUE, Indicates whether the time_trends column should be ignored when checking for missing data.} +\item{acs}{default \code{FALSE}, Indicates whether it is ACS data (which does not have / need varnames)} -\item{verbose}{default TRUE, if false will only return status} +\item{verbose}{default \code{TRUE}, Controls whether to display detailed progress and diagnostic messages} } \value{ -1 or 0 for pass or fail status +Returns 1 if all checks pass, 0 if any deviations from standards are detected. +If the return value is 0, we suggest running the function with verbose=TRUE to review +specific details about potential issues that may need attention. } \description{ -Checks estimates created by \code{\link{chi_calc}} and metadata created by -\code{\link{chi_generate_metadata}} for compliance with CHI TRO standard. +Validates data structures and values in CHI estimates and metadata for compliance +with CHI Tableau Ready Output (TRO) standards. } \details{ -This function tests if the structure of the data matches CHI Tableau Ready Output -specifications. If set to verbose mode (default) will report diagnostic information in -The user interface as warnings (if failed) and messages (for progress and pass). In any -Case will return 1 or 0 for pass or fail respectively. +This function performs comprehensive validation of data structure and content against +CHI TRO specifications. When verbose mode is enabled (default), it provides detailed +diagnostic information through warnings (for deviations from standards) and messages +(for progress and successful checks). + The CHI Tableau Ready Output (TRO) standards can be reviewed here: \href{https://kc1.sharepoint.com/teams/DPH-CommunityHealthIndicators/CHIVizes/CHI-Standards-TableauReady\%20Output.xlsx}{ SharePoint > Community Health Indicators > CHI_vizes > CHI-Standards-TableauReady Output.xlsx} } \examples{ - \dontrun{ -# QA check estimates and metadata +# Basic QA check of estimates and metadata qa_status <- chi_qa_tro( CHIestimates = my_estimates, CHImetadata = my_metadata, acs = FALSE, - ignore_trends = TRUE, verbose = TRUE ) } diff --git a/tests/testthat/test-chi_qa_tro.R b/tests/testthat/test-chi_qa_tro.R index 68e74df..8b1daa0 100644 --- a/tests/testthat/test-chi_qa_tro.R +++ b/tests/testthat/test-chi_qa_tro.R @@ -2,15 +2,33 @@ test_that("chi_qa_tro validates data structure", { test_data <- setup_test_data() - result <- chi_calc( - ph.data = test_data$my.analytic, - ph.instructions = test_data$my.instructions, - rate = FALSE, - source_name = "test", - source_date = Sys.Date() + # Should return 0 and give warning about missing values + expect_warning( + result <- chi_qa_tro( + CHIestimates = test_data$my.estimate, + CHImetadata = test_data$my.metadata, + acs = F, + verbose = F + ), + "100% missing" ) - expect_error(chi_qa_tro(), 'argument "CHIestimates" is missing, with no default') - expect_type(suppressWarnings(chi_qa_tro(CHIestimates = result, CHImetadata = test_data$my.metadata, verbose = FALSE)), "double") - expect_error(suppressWarnings(chi_qa_tro(result, test_data$my.metadata, verbose = FALSE)), NA) + expect_equal(result, 0) + + # Should give warning about missing cat1 column, but also an error b/c of nested rads function + expect_warning( + expect_error( + chi_qa_tro( + CHIestimates = test_data$my.estimate[, cat1 := NULL], + CHImetadata = test_data$my.metadata, + acs = F, + verbose = T + ), + "Validation of TSQL data types necessitates exactly one TSQL datatype per column name" + ), + "You are missing the following critical columns\\(s\\) in CHIestimates: cat1" + ) + + # Test missing argument case + expect_error(chi_qa_tro(verbose = F), 'argument "CHIestimates" is missing, with no default') }) From a9d78c520e6160792fdabf2cf1af5acef720756b Mon Sep 17 00:00:00 2001 From: Danny Colombara Date: Fri, 21 Feb 2025 13:41:01 -0800 Subject: [PATCH 33/35] helpfile direction to underlying file updated - changed name of code for chi_calc and chi_compare_estimeates & which resulted in inconsequential changes to their helpfiles --- man/chi_calc.Rd | 2 +- man/chi_compare_estimates.Rd | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/man/chi_calc.Rd b/man/chi_calc.Rd index 4ecf01d..2a6e61f 100644 --- a/man/chi_calc.Rd +++ b/man/chi_calc.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/proto_chi_calc.R +% Please edit documentation in R/chi_calc.R \name{chi_calc} \alias{chi_calc} \title{Calculate CHI Estimates} diff --git a/man/chi_compare_estimates.Rd b/man/chi_compare_estimates.Rd index 14a087c..1eb4830 100644 --- a/man/chi_compare_estimates.Rd +++ b/man/chi_compare_estimates.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/chi_compare_est.R +% Please edit documentation in R/chi_compare_estimates.R \name{chi_compare_estimates} \alias{chi_compare_estimates} \title{Compare and validate CHI (Community Health Indicator) estimates between two datasets} From f01980aeb89a6b25c1517030476906254b0ce4ad Mon Sep 17 00:00:00 2001 From: Danny Colombara Date: Fri, 21 Feb 2025 13:42:50 -0800 Subject: [PATCH 34/35] removed quarto version specification from DESC - specifying a minimum version caused devtools::check() to fail --- DESCRIPTION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index 18f03bb..472d284 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -40,7 +40,7 @@ Suggests: progress, rmarkdown, markdown, - quarto (>= 1.4), + quarto, testthat (>= 3.0.0), srvyr, survey (>= 4.0) From 9cbbbd0831acd67936fcecbdffbdfdd3dd948d6d Mon Sep 17 00:00:00 2001 From: Danny Colombara Date: Fri, 21 Feb 2025 15:16:29 -0800 Subject: [PATCH 35/35] Update chi_calc to take `ci` argument - Joie changed default to 90% CI, and good to have option to change on demand from 90% --- R/chi_calc.R | 68 ++++++++++++++++--------- apde.chi.tools.Rproj | 1 + man/chi_calc.Rd | 32 ++++++------ quarto_docs/Calculating_Prevalences.qmd | 1 + tests/testthat/test-chi_calc.R | 1 + 5 files changed, 64 insertions(+), 39 deletions(-) diff --git a/R/chi_calc.R b/R/chi_calc.R index 4c1ca45..338d343 100644 --- a/R/chi_calc.R +++ b/R/chi_calc.R @@ -1,19 +1,19 @@ #' Calculate CHI Estimates #' #' @description -#' Generates CHI estimates from input data according to provided instructions -#' created by \code{\link{chi_generate_tro_shell}}. +#' Generates CHI estimates from input data according to provided instructions. #' Handles both proportions and rates, with options for suppression of small numbers. #' -#' @param ph.data Input data.frame or data.table containing analytic read data -#' @param ph.instructions data.frame or data.table containing calculation instructions -#' @param rate Logical; if \code{TRUE} calculates rates, if \code{FALSE} calculates proportions -#' @param rate_per Rate multiplier when \code{rate=TRUE} (e.g., 100000 for per 100,000) -#' @param small_num_suppress Logical; if \code{TRUE} suppresses small numbers -#' @param suppress_low Lower bound for suppression -#' @param suppress_high Upper bound for suppression -#' @param source_name Name of data source -#' @param source_date Date of data source +#' @param ph.data data.frame or data.table. Input data containing analytic read data. +#' @param ph.instructions data.frame or data.table. Calculation instructions for processing. +#' @param ci numeric. Confidence level between 0 and 1. Default: \code{0.90}. +#' @param rate logical. If TRUE calculates rates, if FALSE calculates proportions. Default: \code{FALSE}. +#' @param rate_per numeric. Rate multiplier when \code{rate = TRUE} (e.g., 100000 for per 100,000). Default: \code{NULL}. +#' @param small_num_suppress logical. If TRUE suppresses small numbers. Default: \code{TRUE}. +#' @param suppress_low numeric. Lower bound for suppression. Default: \code{0}. +#' @param suppress_high numeric. Upper bound for suppression. Default: \code{9}. +#' @param source_name character. Name of data source. Default: \code{NULL}. +#' @param source_date Date. Date ph.data was created. Default: \code{NULL}. #' #' @return A data.table containing CHI estimates with the following columns: #' \itemize{ @@ -35,7 +35,7 @@ #' \item{\code{caution}} '!' when RSE>=30% | N == 0 #' \item{\code{suppression}} '^' when suppressed #' \item{\code{numerator}} For line-level data, count of events; for surveys, people who responded yes or no for binary variable -#' \item{\code{numerator}} For line-level data, population; for surveys, sample size +#' \item{\code{denominator}} For line-level data, population; for surveys, sample size #' \item{\code{chi}} '1' indicates that rows is used for CHI #' \item{\code{source_date}} date analytic ready data was created #' \item{\code{run_date}} date of this analysis @@ -49,7 +49,7 @@ #' \code{\link{chi_generate_metadata}} for creating metadata from results #' #' @importFrom data.table setDT copy setnames := setorder set .SD data.table -#' @importFrom rads calc compare_estimate suppress chi_cols +#' @importFrom rads calc compare_estimate suppress chi_cols round2 #' @importFrom future.apply future_lapply #' @importFrom stats na.omit #' @import progressr @@ -58,34 +58,50 @@ #' chi_calc <- function(ph.data = NULL, ph.instructions = NULL, - rate = F, + ci = 0.90, + rate = FALSE, rate_per = NULL, - small_num_suppress = T, + small_num_suppress = TRUE, suppress_low = 0, suppress_high = 9, - source_name = 'blahblah', + source_name = NULL, source_date = NULL){ # Input validation ---- if (is.null(ph.data)) stop("\n\U1F6D1 ph.data must be provided") if (!is.data.frame(ph.data)) stop("\n\U1F6D1 ph.data must be a data.frame or data.table") + if (nrow(ph.data) == 0) stop("\n\U1F6D1 ph.data is empty") + if (!is.data.table(ph.data)) setDT(ph.data) if (is.null(ph.instructions)) stop("\n\U1F6D1 ph.instructions must be provided") if (!is.data.frame(ph.instructions)) stop("\n\U1F6D1 ph.instructions must be a data.frame or data.table") + if (nrow(ph.instructions) == 0) stop("\n\U1F6D1 ph.instructions is empty") + if (!is.data.table(ph.instructions)) setDT(ph.instructions) + # Validate ci parameter + if (!is.numeric(ci)) stop("\n\U1F6D1 ci must be numeric") + if (ci <= 0 || ci >= 1) stop("\n\U1F6D1 ci must be between 0 and 1") + + # Validate rate-related parameters if (!is.logical(rate)) stop("\n\U1F6D1 rate must be logical (TRUE/FALSE)") if (rate && is.null(rate_per)) stop("\n\U1F6D1 rate_per must be provided when rate=TRUE") + if (rate && !is.numeric(rate_per)) stop("\n\U1F6D1 rate_per must be numeric") + if (rate && rate_per <= 0) stop("\n\U1F6D1 rate_per must be positive") + # Validate suppression parameters if (!is.logical(small_num_suppress)) stop("\n\U1F6D1 small_num_suppress must be logical (TRUE/FALSE)") + if (!is.numeric(suppress_low)) stop("\n\U1F6D1 suppress_low must be numeric") + if (!is.numeric(suppress_high)) stop("\n\U1F6D1 suppress_high must be numeric") + if (suppress_low >= suppress_high) stop("\n\U1F6D1 suppress_low must be less than suppress_high") - # Convert to data.table if needed - if (!is.data.table(ph.data)) setDT(ph.data) - if (!is.data.table(ph.instructions)) setDT(ph.instructions) + # Validate source_name + if (is.null(source_name)) stop("\n\U1F6D1 source_name must be provided") + if (!is.character(source_name)) stop("\n\U1F6D1 source_name must be a character string") + if (nchar(trimws(source_name)) == 0) stop("\n\U1F6D1 source_name cannot be an empty string") + + # validate source_date + if (is.null(source_date)) stop("\n\U1F6D1 source_date must be provided") + if (!inherits(source_date, "Date")) stop("\n\U1F6D1 source_date must be a be a Date object") - # Error if ph.instructions has no data ---- - if(nrow(ph.instructions) == 0){ - stop("\n\U0001f47f the table ph.instructions does not have any rows.") - #tempCHIest <- data.table(setNames(data.frame(matrix(ncol = length(chi_cols()), nrow = 0), stringsAsFactors = FALSE), chi_cols())) - } # Create 'Overall' if needed for crosstabs ---- if(!'overall' %in% names(ph.data)){ @@ -147,11 +163,13 @@ chi_calc <- function(ph.data = NULL, tempest <- rads::calc(ph.data = ph.data[chi_year >= tempstart & chi_year <= tempend], what = current_row$indicator_key, by = tempbv, + ci = ci, metrics = c('mean', 'numerator', 'denominator', 'rse')) } else { tempest <- rads::calc(ph.data = ph.data[chi_year >= tempstart & chi_year <= tempend & chi_geo_kc == 'King County'], what = current_row$indicator_key, by = tempbv, + ci = ci, metrics = c('mean', 'numerator', 'denominator', 'rse')) } } @@ -160,12 +178,14 @@ chi_calc <- function(ph.data = NULL, tempest <- rads::calc(ph.data = ph.data[chi_year >= tempstart & chi_year <= tempend], what = current_row$indicator_key, by = tempbv, + ci = ci, metrics = c('rate', 'numerator', 'denominator', 'rse'), per = rate_per) } else { tempest <- rads::calc(ph.data = ph.data[chi_year >= tempstart & chi_year <= tempend & chi_geo_kc == 'King County'], what = current_row$indicator_key, by = tempbv, + ci = ci, metrics = c('rate', 'numerator', 'denominator', 'rse'), per = rate_per) } diff --git a/apde.chi.tools.Rproj b/apde.chi.tools.Rproj index 497f8bf..778b042 100644 --- a/apde.chi.tools.Rproj +++ b/apde.chi.tools.Rproj @@ -1,4 +1,5 @@ Version: 1.0 +ProjectId: 177bb990-4b8d-4877-bb44-c6d5b0a97b04 RestoreWorkspace: Default SaveWorkspace: Default diff --git a/man/chi_calc.Rd b/man/chi_calc.Rd index 2a6e61f..60c69e5 100644 --- a/man/chi_calc.Rd +++ b/man/chi_calc.Rd @@ -7,33 +7,36 @@ chi_calc( ph.data = NULL, ph.instructions = NULL, - rate = F, + ci = 0.9, + rate = FALSE, rate_per = NULL, - small_num_suppress = T, + small_num_suppress = TRUE, suppress_low = 0, suppress_high = 9, - source_name = "blahblah", + source_name = NULL, source_date = NULL ) } \arguments{ -\item{ph.data}{Input data.frame or data.table containing analytic read data} +\item{ph.data}{data.frame or data.table. Input data containing analytic read data.} -\item{ph.instructions}{data.frame or data.table containing calculation instructions} +\item{ph.instructions}{data.frame or data.table. Calculation instructions for processing.} -\item{rate}{Logical; if \code{TRUE} calculates rates, if \code{FALSE} calculates proportions} +\item{ci}{numeric. Confidence level between 0 and 1. Default: \code{0.90}.} -\item{rate_per}{Rate multiplier when \code{rate=TRUE} (e.g., 100000 for per 100,000)} +\item{rate}{logical. If TRUE calculates rates, if FALSE calculates proportions. Default: \code{FALSE}.} -\item{small_num_suppress}{Logical; if \code{TRUE} suppresses small numbers} +\item{rate_per}{numeric. Rate multiplier when \code{rate = TRUE} (e.g., 100000 for per 100,000). Default: \code{NULL}.} -\item{suppress_low}{Lower bound for suppression} +\item{small_num_suppress}{logical. If TRUE suppresses small numbers. Default: \code{TRUE}.} -\item{suppress_high}{Upper bound for suppression} +\item{suppress_low}{numeric. Lower bound for suppression. Default: \code{0}.} -\item{source_name}{Name of data source} +\item{suppress_high}{numeric. Upper bound for suppression. Default: \code{9}.} -\item{source_date}{Date of data source} +\item{source_name}{character. Name of data source. Default: \code{NULL}.} + +\item{source_date}{Date. Date ph.data was created. Default: \code{NULL}.} } \value{ A data.table containing CHI estimates with the following columns: @@ -56,15 +59,14 @@ A data.table containing CHI estimates with the following columns: \item{\code{caution}} '!' when RSE>=30% | N == 0 \item{\code{suppression}} '^' when suppressed \item{\code{numerator}} For line-level data, count of events; for surveys, people who responded yes or no for binary variable - \item{\code{numerator}} For line-level data, population; for surveys, sample size + \item{\code{denominator}} For line-level data, population; for surveys, sample size \item{\code{chi}} '1' indicates that rows is used for CHI \item{\code{source_date}} date analytic ready data was created \item{\code{run_date}} date of this analysis } } \description{ -Generates CHI estimates from input data according to provided instructions -created by \code{\link{chi_generate_tro_shell}}. +Generates CHI estimates from input data according to provided instructions. Handles both proportions and rates, with options for suppression of small numbers. } \seealso{ diff --git a/quarto_docs/Calculating_Prevalences.qmd b/quarto_docs/Calculating_Prevalences.qmd index 3e00bc4..6cf6856 100644 --- a/quarto_docs/Calculating_Prevalences.qmd +++ b/quarto_docs/Calculating_Prevalences.qmd @@ -200,6 +200,7 @@ options(future.globals.maxSize = future.GB * 1024^3) myestimates <- chi_calc(ph.data = birthsdt, ph.instructions = myinstructions, + ci = 0.90, rate = FALSE, small_num_suppress = TRUE, suppress_low = 0, diff --git a/tests/testthat/test-chi_calc.R b/tests/testthat/test-chi_calc.R index b79ae8e..36d320f 100644 --- a/tests/testthat/test-chi_calc.R +++ b/tests/testthat/test-chi_calc.R @@ -5,6 +5,7 @@ test_that("chi_calc performs basic calculations correctly", { result <- chi_calc( ph.data = test_data$my.analytic, ph.instructions = test_data$my.instructions, + ci = 0.90, rate = FALSE, source_name = "test", source_date = Sys.Date()