diff --git a/R/chi_drop_illogical_ages.R b/R/chi_drop_illogical_ages.R index 8452f1a..d919df3 100644 --- a/R/chi_drop_illogical_ages.R +++ b/R/chi_drop_illogical_ages.R @@ -79,14 +79,13 @@ chi_drop_illogical_ages <- function(ph.data, agevar = 'chi_age') { # Create a standardized version of the age group ph.data[, (temp_catgroup) := data.table::fcase( - get(catgroup) == '<1', '0-0', + get(catgroup) == '<1' & grepl(' age$|^age$', get(catnum), ignore.case = T), '0-0', - grepl("<", get(catgroup)), gsub("<", "0-", get(catgroup)), + grepl("<", get(catgroup)) &grepl(' age$|^age$', get(catnum), ignore.case = T), gsub("<", "0-", get(catgroup)), - grepl("\\+", get(catgroup)), gsub("\\+", "-120", get(catgroup)), + grepl("\\+", get(catgroup))& grepl(' age$|^age$', get(catnum), ignore.case = T), gsub("\\+", "-120", get(catgroup)), - grepl('-', get(catgroup)), get(catgroup) - )] + grepl('-', get(catgroup))& grepl(' age$|^age$', get(catnum), ignore.case = T), as.character(get(catgroup)))] # Extract min and max age ph.data[, "min_age" := as.numeric(gsub("-.*", "", get(temp_catgroup)))] diff --git a/R/chi_generate_tro_shell.R b/R/chi_generate_tro_shell.R index 05bc574..2e1480b 100644 --- a/R/chi_generate_tro_shell.R +++ b/R/chi_generate_tro_shell.R @@ -26,7 +26,6 @@ #' } #' #' @param ph.analysis_set name of data.table to parse -#' @param start.year the earliest year to be used for estimates #' @param end.year the latest year to be used for aggregate estimates #' @param year.span the number of years to be included in a single non-trend period #' @param trend.span the number of years to be included in a single trend period @@ -54,52 +53,50 @@ #' @export #' chi_generate_tro_shell <- function(ph.analysis_set, - start.year, - end.year, - year.span = NULL, - trend.span = NULL, - trend.periods = NULL){ + end.year, + year.span = NULL, + trend.span = NULL, + trend.periods = NULL){ # Input validation - if (missing(ph.analysis_set)) stop("\n\U1F6D1 ph.analysis_set must be provided") - if (!is.data.frame(ph.analysis_set)) stop("\n\U1F6D1 ph.analysis_set must be a data.frame or data.table") - - if (missing(start.year)) stop("\n\U1F6D1 start.year must be provided") - if (!is.numeric(start.year) || length(start.year) != 1) stop("\n\U1F6D1 start.year must be a single numeric value") + if (missing(ph.analysis_set)) stop("\n\U1F6D1 ph.analysis_set must be provided") + if (!is.data.frame(ph.analysis_set)) stop("\n\U1F6D1 ph.analysis_set must be a data.frame or data.table") + if (!("set" %in% names(ph.analysis_set)) | anyNA(ph.analysis_set$set)) { + stop("\n\u1F6D1 set number must be provided for all rows") + } + if (missing(end.year)) stop("\n\U1F6D1 end.year must be provided") + if (!is.numeric(end.year) || length(end.year) != 1) stop("\n\U1F6D1 end.year must be a single numeric value") - if (missing(end.year)) stop("\n\U1F6D1 end.year must be provided") - if (!is.numeric(end.year) || length(end.year) != 1) stop("\n\U1F6D1 end.year must be a single numeric value") + if (!is.null(year.span) && (!is.numeric(year.span) || length(year.span) != 1)) { + stop("\n\U1F6D1 year.span must be NULL or a single numeric value") + } - if (!is.null(year.span) && (!is.numeric(year.span) || length(year.span) != 1)) { - stop("\n\U1F6D1 year.span must be NULL or a single numeric value") - } + if (!is.null(trend.span) && (!is.numeric(trend.span) || length(trend.span) != 1)) { + stop("\n\U1F6D1 trend.span must be NULL or a single numeric value") + } - if (!is.null(trend.span) && (!is.numeric(trend.span) || length(trend.span) != 1)) { - stop("\n\U1F6D1 trend.span must be NULL or a single numeric value") - } + if (!is.null(trend.periods) && (!is.numeric(trend.periods) || length(trend.periods) != 1)) { + stop("\n\U1F6D1 trend.periods must be NULL or a single numeric value") + } - if (!is.null(trend.periods) && (!is.numeric(trend.periods) || length(trend.periods) != 1)) { - stop("\n\U1F6D1 trend.periods must be NULL or a single numeric value") - } - # Convert to data.table if needed - if (!is.data.table(ph.analysis_set)) setDT(ph.analysis_set) + # Convert to data.table if needed + if (!is.data.table(ph.analysis_set)) setDT(ph.analysis_set) - #parameterization checks + # parameterization checks if("x" %in% ph.analysis_set$trends & (is.null(trend.span) | is.null(trend.periods))) {stop("you have indicated that a trends analysis is to be conducted, but have not indicated both the span and number of periods for this analysis.")} - #ph.analysis_set checks - - - #advisory messages + # advisory messages if("x" %in% ph.analysis_set$trends) {message("Note: trends are applied backwards from end.year")} # Race / ethnicity is a chronic headache with CHI. Need to remove rows for race4 & Ethnicity because should be Race/ethnicity ph.analysis_set <- ph.analysis_set[!(cat1_varname == 'race4' & cat1 == 'Ethnicity')] # apply the template generating function + # generate vector of sets + sets <- unique(ph.analysis_set$set) template <- rbindlist( - lapply(X = seq(1, length(unique(ph.analysis_set$set))), + lapply(X = sets, FUN = chi_process_nontrends, ph.analysis_set = ph.analysis_set)) # split trends from other tabs because processed for multiple years @@ -117,9 +114,9 @@ chi_generate_tro_shell <- function(ph.analysis_set, # add years to template (trends) if(nrow(template.trends) > 0){ trend.years <- chi_process_trends(indicator_key = intersect(unique(template$indicator_key), unique(template.trends$indicator_key)), - trend.span = trend.span, - end.year = end.year, - trend.periods = trend.periods) + trend.span = trend.span, + end.year = end.year, + trend.periods = trend.periods) template.trends <- merge(template.trends, trend.years, by = 'indicator_key', all = T, allow.cartesian = T) # append trends template to main template diff --git a/R/chi_get_cols.R b/R/chi_get_cols.R new file mode 100644 index 0000000..4f40acd --- /dev/null +++ b/R/chi_get_cols.R @@ -0,0 +1,25 @@ +#' Get CHI variable column names +#' +#' @description +#' Returns a character vector of column names defined in the CHI YAML reference file. +#' This helper function provides easy access to the standardized CHI variable names. +#' +#' @param metadata returns metadata column names instead of primary data +#' +#' @return A character vector of column names for the chi data (Default) or metadata +#' @importFrom yaml read_yaml +#' @export +#' +#' @examples +#' cols <- chi_get_cols() +chi_get_cols <- function(metadata = FALSE) { + chi.yaml.filepath <- system.file("ref", "chi_qa.yaml", package = "apde.chi.tools") + if (chi.yaml.filepath == "") { + stop("Could not find reference file chi_qa.yaml") + } + chi.yaml <- yaml::read_yaml(chi.yaml.filepath) + if(metadata){ + return(names(chi.yaml$metadata)) + } + return(names(chi.yaml$vars)) +} diff --git a/R/chi_get_yaml.R b/R/chi_get_yaml.R index 7d7c813..9ad1ebf 100644 --- a/R/chi_get_yaml.R +++ b/R/chi_get_yaml.R @@ -1,23 +1,3 @@ -#' Get CHI variable column names -#' -#' Returns a character vector of column names defined in the CHI YAML reference file. -#' This helper function provides easy access to the standardized CHI variable names. -#' -#' @return A character vector of column names -#' @importFrom yaml read_yaml -#' @export -#' -#' @examples -#' cols <- chi_get_cols() -chi_get_cols <- function() { - chi.yaml.filepath <- system.file("ref", "chi_qa.yaml", package = "apde.chi.tools") - if (chi.yaml.filepath == "") { - stop("Could not find reference file chi_qa.yaml") - } - chi.yaml <- yaml::read_yaml(chi.yaml.filepath) - return(names(chi.yaml$vars)) -} - #' Get CHI YAML configuration #' #' Returns the complete CHI YAML configuration as a list. diff --git a/R/chi_qa_tro.R b/R/chi_qa_tro.R index 17a1db1..4bb63f9 100644 --- a/R/chi_qa_tro.R +++ b/R/chi_qa_tro.R @@ -148,7 +148,7 @@ chi_qa_tro <- function(CHIestimates, } } - missing.var <- setdiff(names(CHImetadata), names(unlist(chi_get_yaml()$metadata))) + missing.var <- setdiff(names(CHImetadata), chi_get_cols(metadata = TRUE)) if(length(missing.var) > 0){ status <- 0 if(verbose){ diff --git a/inst/ref/chi_qa.yaml_pre2025.yaml b/inst/ref/chi_qa.yaml_pre2025.yaml new file mode 100644 index 0000000..cb0f5a7 --- /dev/null +++ b/inst/ref/chi_qa.yaml_pre2025.yaml @@ -0,0 +1,62 @@ +schema: APDE_WIP +table: +years: + +vars: + data_source: varchar(255) + indicator_key: varchar(255) + tab: varchar(255) + year: varchar(255) + cat1: varchar(255) + cat1_group: nvarchar(2000) + cat1_varname: varchar(255) + cat2: varchar(255) + cat2_group: nvarchar(2000) + cat2_varname: varchar(255) + result: float + lower_bound: float + upper_bound: float + se: float + rse: float + comparison_with_kc: varchar(255) + time_trends: varchar(255) + significance: varchar(255) + caution: varchar(255) + suppression: varchar(255) + numerator: float + denominator: float + chi: tinyint + source_date: date + run_date: date + +metadata: + data_source: varchar(255) + indicator_key: varchar(255) + result_type: varchar(255) + valence: varchar(255) + latest_year: int + latest_year_result: float + latest_year_kc_pop: int + latest_year_count: int + map_type: varchar(255) + unit: varchar(255) + valid_years: varchar(255) + chi: tinyint + run_date: date + +toc: + data_source: varchar(255) + indicator_key: varchar(255) + topic_chi: varchar(255) + topic_bsk: varchar(255) + topic_bskhs: varchar(255) + title_toc: varchar(255) + description: varchar(1000) + url_backup: varchar(500) + latest_year_bk: varchar(255) + latest_rate_bk: varchar(255) + toc_bsk: float + toc_bskhs: float + toc_chi: float + toc_cc: float + diff --git a/man/chi_generate_tro_shell.Rd b/man/chi_generate_tro_shell.Rd index 50ab14d..74a6e17 100644 --- a/man/chi_generate_tro_shell.Rd +++ b/man/chi_generate_tro_shell.Rd @@ -6,7 +6,6 @@ \usage{ chi_generate_tro_shell( ph.analysis_set, - start.year, end.year, year.span = NULL, trend.span = NULL, @@ -16,8 +15,6 @@ chi_generate_tro_shell( \arguments{ \item{ph.analysis_set}{name of data.table to parse} -\item{start.year}{the earliest year to be used for estimates} - \item{end.year}{the latest year to be used for aggregate estimates} \item{year.span}{the number of years to be included in a single non-trend period} diff --git a/man/chi_get_cols.Rd b/man/chi_get_cols.Rd index 2047f92..8580d92 100644 --- a/man/chi_get_cols.Rd +++ b/man/chi_get_cols.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/chi_get_yaml.R +% Please edit documentation in R/chi_get_cols.R \name{chi_get_cols} \alias{chi_get_cols} \title{Get CHI variable column names} diff --git a/quarto_docs/Calculating_Prevalences.qmd b/quarto_docs/Calculating_Prevalences.qmd index db37ad1..1a72a63 100644 --- a/quarto_docs/Calculating_Prevalences.qmd +++ b/quarto_docs/Calculating_Prevalences.qmd @@ -132,7 +132,6 @@ To analyze our data consistently, we need to generate a structured set of calcul myinstructions <- chi_generate_tro_shell( ph.analysis_set = analysis_sets, - start.year = latest_year-4, # earliest year to be used for estimates end.year = latest_year, # latest year to be used for aggregate estimates year.span = 5, # number of years included in a single period trend.span = 3, # number of years included in a trend single period diff --git a/tests/testthat/test-chi_generate_tro_shell.R b/tests/testthat/test-chi_generate_tro_shell.R index 5bf4a43..18370b7 100644 --- a/tests/testthat/test-chi_generate_tro_shell.R +++ b/tests/testthat/test-chi_generate_tro_shell.R @@ -2,8 +2,8 @@ test_that("chi_generate_tro_shell validates inputs correctly", { test_data <- setup_test_data() expect_error(chi_generate_tro_shell(), "ph.analysis_set must be provided") - expect_error(chi_generate_tro_shell(data.frame(), start.year = "2023"), - "start.year must be a single numeric value") + expect_error(chi_generate_tro_shell(data.frame()), + "set number must be provided for all rows") }) test_that("ingest template format", { @@ -29,7 +29,6 @@ test_that("ingest template format", { trends, set_indicator_keys) DT <- chi_generate_tro_shell(ph.analysis_set = template, - start.year = 2021, end.year = 2022, year.span = 5, trend.span = 3,