From daebc3895811cc62b95fb2a630cc6a361979eca5 Mon Sep 17 00:00:00 2001 From: Buie Date: Tue, 1 Apr 2025 14:04:18 -0700 Subject: [PATCH 01/21] expand testing analytic set instructions --- tests/testthat/helper.R | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/tests/testthat/helper.R b/tests/testthat/helper.R index b9e2154..cf366fc 100644 --- a/tests/testthat/helper.R +++ b/tests/testthat/helper.R @@ -28,15 +28,15 @@ setup_test_data <- function() { # Sample analysis set ---- test_analysis_set <- data.table( - cat1 = c('Regions', 'Gender'), - cat1_varname = c('chi_geo_region', 'chi_sex'), + cat1 = rep(c('Regions', 'Gender', 'Race/ethnicity'),2), + cat1_varname = rep(c('chi_geo_region', 'chi_sex', 'race4'),2), `_kingcounty` = c('x'), - `_wastate` = NA_character_, - demgroups = NA_character_, - crosstabs = NA_character_, - trends = NA_character_, - set = 1, - set_indicator_keys = 'indicator1, indicator2' + `_wastate` = rep(c(rep(NA_character_,2),"x"),2), + demgroups = rep(c(rep(NA_character_,2),"x"),2), + crosstabs = rep(c(rep(NA_character_,2),"x"),2), + trends = rep(c(rep(NA_character_,2),"x"),2), + set = c(rep(1,3), rep(2,3)), + set_indicator_keys = c(rep(c('indicator1, indicator2'),3), rep("indicator3",3)) ) # Sample instructions ---- @@ -53,9 +53,9 @@ setup_test_data <- function() { # Sample estimates ---- test_estimates <- data.table( - indicator_key = c("indicatorX"), + indicator_key = c(rep("indicator1",2), rep("indicator2",2)), tab = c(rep('demgroups', 4), '_kingcounty'), - year = c('2023'), + year = c('2024'), cat1 = c('Region', 'Region', 'Region', 'Region', 'King County'), cat1_group = c("East", "North", "Seattle", "South", 'King County'), cat1_varname = c('chi_geo_region', 'chi_geo_region', 'chi_geo_region', 'chi_geo_region', 'chi_geo_kc'), From 18d86b6e8457097aa71c7de89563ca0ee097b74f Mon Sep 17 00:00:00 2001 From: Buie Date: Tue, 1 Apr 2025 14:04:32 -0700 Subject: [PATCH 02/21] remove unneeded tests --- tests/testthat/test-chi_process_trends.R | 5 ----- tests/testthat/test-chi_qa.R | 3 --- 2 files changed, 8 deletions(-) delete mode 100644 tests/testthat/test-chi_process_trends.R delete mode 100644 tests/testthat/test-chi_qa.R diff --git a/tests/testthat/test-chi_process_trends.R b/tests/testthat/test-chi_process_trends.R deleted file mode 100644 index f9ba495..0000000 --- a/tests/testthat/test-chi_process_trends.R +++ /dev/null @@ -1,5 +0,0 @@ -test_that("calculates trends", { - # chi_generate_trend_years does not exist - # DT <- chi_generate_trend_years(indicator_key = c("test1", "test2"),span = 3,begin.year = 2009,final.year = 2023) - expect_identical(1L, 1L) # a dummy test because devtools::check does not allow empty test_that statements -}) diff --git a/tests/testthat/test-chi_qa.R b/tests/testthat/test-chi_qa.R deleted file mode 100644 index 8849056..0000000 --- a/tests/testthat/test-chi_qa.R +++ /dev/null @@ -1,3 +0,0 @@ -test_that("multiplication works", { - expect_equal(2 * 2, 4) -}) From d1a383a7a1907a37ba5fb3a83123e5cc77204065 Mon Sep 17 00:00:00 2001 From: Buie Date: Tue, 1 Apr 2025 14:34:34 -0700 Subject: [PATCH 03/21] sepearte into 2 set version of hgelper --- tests/testthat/helper.R | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/tests/testthat/helper.R b/tests/testthat/helper.R index cf366fc..dda2845 100644 --- a/tests/testthat/helper.R +++ b/tests/testthat/helper.R @@ -28,6 +28,18 @@ setup_test_data <- function() { # Sample analysis set ---- test_analysis_set <- data.table( + cat1 = c('Regions', 'Gender'), + cat1_varname = c('chi_geo_region', 'chi_sex'), + `_kingcounty` = c('x'), + `_wastate` = NA_character_, + demgroups = NA_character_, + crosstabs = NA_character_, + trends = NA_character_, + set = 1, + set_indicator_keys = 'indicator1, indicator2' + ) + + test_analysis_set_twosets <- data.table( cat1 = rep(c('Regions', 'Gender', 'Race/ethnicity'),2), cat1_varname = rep(c('chi_geo_region', 'chi_sex', 'race4'),2), `_kingcounty` = c('x'), @@ -53,9 +65,9 @@ setup_test_data <- function() { # Sample estimates ---- test_estimates <- data.table( - indicator_key = c(rep("indicator1",2), rep("indicator2",2)), + indicator_key = c("indicatorX"), tab = c(rep('demgroups', 4), '_kingcounty'), - year = c('2024'), + year = c('2023'), cat1 = c('Region', 'Region', 'Region', 'Region', 'King County'), cat1_group = c("East", "North", "Seattle", "South", 'King County'), cat1_varname = c('chi_geo_region', 'chi_geo_region', 'chi_geo_region', 'chi_geo_region', 'chi_geo_kc'), @@ -122,6 +134,7 @@ setup_test_data <- function() { # Return ---- list(my.analytic = test_analytic, my.analysis_set = test_analysis_set, + my.analysis_set_twosets = test_analysis_set_twosets, my.estimate= test_estimates, my.estimate_old= test_estimates_old, my.metadata = test_metadata, From 38105f55f5a9fd5a1fc27bb348fe67cbda9d205c Mon Sep 17 00:00:00 2001 From: Buie Date: Tue, 1 Apr 2025 14:35:17 -0700 Subject: [PATCH 04/21] rename indicator to "indicator1" --- tests/testthat/helper.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/testthat/helper.R b/tests/testthat/helper.R index dda2845..e7553ee 100644 --- a/tests/testthat/helper.R +++ b/tests/testthat/helper.R @@ -65,7 +65,7 @@ setup_test_data <- function() { # Sample estimates ---- test_estimates <- data.table( - indicator_key = c("indicatorX"), + indicator_key = c("indicator1"), tab = c(rep('demgroups', 4), '_kingcounty'), year = c('2023'), cat1 = c('Region', 'Region', 'Region', 'Region', 'King County'), From ff0df91aa49c289703ef2f99749f5194f2b9c739 Mon Sep 17 00:00:00 2001 From: Buie Date: Wed, 2 Apr 2025 16:28:52 -0700 Subject: [PATCH 05/21] variable modeller able to produce code for modelling a provided factor and a likely categorical integer (integer with less than 25) does put "NA" instead of NA object, fix this next categorical strings and non categorical numerics should be straightforward after that --- tests/testthat/helper.R | 104 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 104 insertions(+) diff --git a/tests/testthat/helper.R b/tests/testthat/helper.R index e7553ee..839fe54 100644 --- a/tests/testthat/helper.R +++ b/tests/testthat/helper.R @@ -39,6 +39,83 @@ setup_test_data <- function() { set_indicator_keys = 'indicator1, indicator2' ) + variable_modeller <- function(oneVariable, numberOfObservations) { + #if no match, report unmatched type + instructions <- FALSE + + variableName <- sub(".*\\$.*?", "\\1", deparse(substitute(oneVariable))) + + #factor + if(instructions == FALSE & is.factor(oneVariable)) { + orderTF <- is.ordered(oneVariable) + detectedLevels <- levels(oneVariable) + prop.table(table(oneVariable, useNA = "ifany")) + instructions <- paste0(variableName," = factor(sample(c('",paste0(unlist(unique(oneVariable)),collapse = "', '"),"'), ", numberOfObservations,", replace = TRUE, prob = c(",paste0(prop.table(table(oneVariable, useNA = 'ifany')), collapse = ", "),")), levels = c('",paste0(detectedLevels, collapse = "', '"),"'), ordered = ", orderTF,")", collapse = "") + } + + #integer: categorical + if(instructions == FALSE & is.integer(oneVariable) & length(unique(oneVariable)) < 25) { + instructions <- paste0(variableName," = sample(c('",paste0(unlist(unique(oneVariable)),collapse = "', '"),"'), ", numberOfObservations,", replace = TRUE, prob = c(",paste0(prop.table(table(oneVariable, useNA = 'ifany')), collapse = ", "),"))", collapse = "") + } + + #if unmatched + if(instructions == FALSE) { + instructions <- paste0("# data type of ",deparse(substitute(oneVariable)) ," not modelled") + } + return(instructions) + } + + + + generate_test_data <- function(dataset = "generic", observations = 100, seed = 1000){ + ### generates a synthetic data set appropriate for testing functions relying on APDE data structures and where you do not want to use real data + ### receives description of data set to emulate, number of observations to include, and a seed. If dataset is "generic" will returned structure will have idealized chi values and generic indicators + ### returns a data.table of synthetic data + + # input validation + datasetOptions <- c("generic", "hys") + dataset <- tolower(dataset) + if(!(dataset %in% datasetOptions)) { + stop(paste0("dataset must be one of: '", paste(datasetOptions, collapse = "', '"),"'")) + } + + set.seed(seed) + if(dataset == "generic") { + test_data <- data.table( + id = 1:observations, + chi_geo_kc = sample(c(0,1), observations, replace = T), + chi_race_4 = factor(sample(c("Asian", "AIAN", "Black", "Hispanic", "NHPI", "White", "Other", "Multiple", NA), numberOfObservations, replace = T, prob = c(.19,.01,.07,.11,.01,.35,.07,.14,.02)), levels = c("Asian", "AIAN", "Black", "Hispanic", "NHPI", "White", "Other", "Multiple", NA)), + chi_sex = as.factor(sample(c("Male","Female"), observations, replace = T)), + + chi_geo_region = factor(sample(c("South", "North", "Seattle", "East"), observations, replace = T), levels = c("South","North","Seattle","East")), + indicator1 = as.factor(sample(c("never","sometimes", "always", NA), observations, replace = T)), + indicator2 = as.factor(sample(c(1,2,3,4, NA), observations, replace = T)), + indicator3 = as.factor(sample(c("<20","21-40","41-60","61<"), observations, replace = T)), + chi_year = 2023) + } else if(dataset == "hys") { + test_data <- data.table(abusive_adult = sample(c('NA', '0', '1'), 100, replace = TRUE, prob = c(0.273034917704968, 0.054280110752192, 0.67268497154284)), + chi_sex = factor(sample(c('Female', 'Male', 'NA'), 100, replace = TRUE, prob = c(0.491578218735579, 0.490924473157976, 0.0174973081064452)), levels = c('Female', 'Male'), ordered = FALSE) + + ) + + } else if(dataset == "hysold") { + test_data <- data.table( + id = 1:observations, + chi_geo_kc = sample(c(0,1), observations, replace = T), + chi_race_eth8 = factor(sample(c("Asian", "AIAN", "Black", "Hispanic", "NHPI", "White", "Other", "Multiple", NA), numberOfObservations, replace = T, prob = c(.19,.01,.07,.11,.01,.35,.07,.14,.02)), levels = c("Asian", "AIAN", "Black", "Hispanic", "NHPI", "White", "Other", "Multiple", NA)), + chi_sex = as.factor(sample(c("Male","Female"), observations, replace = T)), + chi_geo_region = as.factor(sample(c("South", "North", "Seattle", "East"), observations, replace = T)), + indicator1 = as.factor(sample(c("never","sometimes", "always", NA), observations, replace = T)), + indicator2 = as.factor(sample(c(1,2,3,4, NA), observations, replace = T)), + indicator3 = as.factor(sample(c("<20","21-40","41-60","61<"), observations, replace = T)), + chi_year = 202) + + } + return(test_data) + } + + + test_analysis_set_twosets <- data.table( cat1 = rep(c('Regions', 'Gender', 'Race/ethnicity'),2), cat1_varname = rep(c('chi_geo_region', 'chi_sex', 'race4'),2), @@ -89,6 +166,33 @@ setup_test_data <- function() { test_estimates[, lower_bound := result - 1.96 * se] test_estimates[, upper_bound := result + 1.96 * se] + test_estimates_twosets <- data.table( + indicator_key = c("indicator1"), + tab = c(rep('demgroups', 4), '_kingcounty'), + year = c('2023'), + cat1 = c('Region', 'Region', 'Region', 'Region', 'King County'), + cat1_group = c("East", "North", "Seattle", "South", 'King County'), + cat1_varname = c('chi_geo_region', 'chi_geo_region', 'chi_geo_region', 'chi_geo_region', 'chi_geo_kc'), + cat2 = NA_character_, + cat2_group = NA_character_, + cat2_varname = NA_character_, + data_source = 'JustTesting', + caution = NA_character_, + suppression = NA_character_, + chi = 1, + source_date = Sys.Date(), + run_date = Sys.Date(), + numerator = c(111, 175, 210, 600, 430000), + denominator = c(1000, 1500, 2000, 2500, 2200000) + ) + test_estimates_twosets[, result := numerator / denominator] + test_estimates_twosets[, se := sqrt((result * (1-result)) / denominator)] + test_estimates_twosets[, rse := 100 * se / result] + test_estimates_twosets[, lower_bound := result - 1.96 * se] + test_estimates_twosets[, upper_bound := result + 1.96 * se] + + + test_estimates_old <- data.table( indicator_key = c("indicatorX"), tab = c(rep('demgroups', 4), '_kingcounty'), From 02f0e3d9c862b918b3bd07557d62ace7ade02f69 Mon Sep 17 00:00:00 2001 From: Buie Date: Thu, 3 Apr 2025 13:47:27 -0700 Subject: [PATCH 06/21] fixed "NA" and added remaining data type catchers for data modeller function. --- tests/testthat/helper.R | 36 +++++++++++++++++++++++++++++++----- 1 file changed, 31 insertions(+), 5 deletions(-) diff --git a/tests/testthat/helper.R b/tests/testthat/helper.R index 839fe54..8ad2515 100644 --- a/tests/testthat/helper.R +++ b/tests/testthat/helper.R @@ -40,22 +40,47 @@ setup_test_data <- function() { ) variable_modeller <- function(oneVariable, numberOfObservations) { + + #note : ooooooocurrently setting 61 as categorical threshold because of HRAs. + #if no match, report unmatched type instructions <- FALSE variableName <- sub(".*\\$.*?", "\\1", deparse(substitute(oneVariable))) #factor - if(instructions == FALSE & is.factor(oneVariable)) { + if(instructions == FALSE & class(oneVariable) == "factor") { orderTF <- is.ordered(oneVariable) detectedLevels <- levels(oneVariable) prop.table(table(oneVariable, useNA = "ifany")) instructions <- paste0(variableName," = factor(sample(c('",paste0(unlist(unique(oneVariable)),collapse = "', '"),"'), ", numberOfObservations,", replace = TRUE, prob = c(",paste0(prop.table(table(oneVariable, useNA = 'ifany')), collapse = ", "),")), levels = c('",paste0(detectedLevels, collapse = "', '"),"'), ordered = ", orderTF,")", collapse = "") + instructions <- gsub("'NA'", "NA", instructions) + instructions <- paste0(instructions, " # as a factor") } #integer: categorical - if(instructions == FALSE & is.integer(oneVariable) & length(unique(oneVariable)) < 25) { + if(instructions == FALSE & class(oneVariable) == "numeric" & (length(unique(oneVariable)) <= 61 & length(oneVariable) > 61)) { + instructions <- paste0(variableName," = sample(c('",paste0(unlist(unique(oneVariable)),collapse = "', '"),"'), ", numberOfObservations,", replace = TRUE, prob = c(",paste0(prop.table(table(oneVariable, useNA = 'ifany')), collapse = ", "),"))", collapse = "") + instructions <- gsub("'NA'", "NA", instructions) + instructions <- paste0(instructions, " # as a categorical non factor") + } + + #character: categorical + if(instructions == FALSE & class(oneVariable) == "character" & (length(unique(oneVariable)) <= 61 & length(oneVariable) > 61)) { instructions <- paste0(variableName," = sample(c('",paste0(unlist(unique(oneVariable)),collapse = "', '"),"'), ", numberOfObservations,", replace = TRUE, prob = c(",paste0(prop.table(table(oneVariable, useNA = 'ifany')), collapse = ", "),"))", collapse = "") + instructions <- gsub("'NA'", "NA", instructions) + instructions <- paste0(instructions, " # as a categorical non factor") + } + + #continuous + if(instructions == FALSE & class(oneVariable) == "numeric" & (length(unique(oneVariable)) > 61 & length(oneVariable) > 61)) { + min(oneVariable) + max(oneVariable) + runif(numberOfObservations, min(oneVariable), max(oneVariable)) + + instructions <- paste0(variableName," = sample(c('",paste0(unlist(unique(oneVariable)),collapse = "', '"),"'), ", numberOfObservations,", replace = TRUE, prob = c(",paste0(prop.table(table(oneVariable, useNA = 'ifany')), collapse = ", "),"))", collapse = "") + instructions <- gsub("'NA'", "NA", instructions) + instructions <- paste0(instructions, " # as a categorical non factor") } #if unmatched @@ -65,7 +90,9 @@ setup_test_data <- function() { return(instructions) } + oneVariable <- testHYS$abusive_intimate_partner + lapply(testHYS ,variable_modeller, numberOfObservations = 100) generate_test_data <- function(dataset = "generic", observations = 100, seed = 1000){ ### generates a synthetic data set appropriate for testing functions relying on APDE data structures and where you do not want to use real data @@ -86,15 +113,14 @@ setup_test_data <- function() { chi_geo_kc = sample(c(0,1), observations, replace = T), chi_race_4 = factor(sample(c("Asian", "AIAN", "Black", "Hispanic", "NHPI", "White", "Other", "Multiple", NA), numberOfObservations, replace = T, prob = c(.19,.01,.07,.11,.01,.35,.07,.14,.02)), levels = c("Asian", "AIAN", "Black", "Hispanic", "NHPI", "White", "Other", "Multiple", NA)), chi_sex = as.factor(sample(c("Male","Female"), observations, replace = T)), - chi_geo_region = factor(sample(c("South", "North", "Seattle", "East"), observations, replace = T), levels = c("South","North","Seattle","East")), indicator1 = as.factor(sample(c("never","sometimes", "always", NA), observations, replace = T)), indicator2 = as.factor(sample(c(1,2,3,4, NA), observations, replace = T)), indicator3 = as.factor(sample(c("<20","21-40","41-60","61<"), observations, replace = T)), chi_year = 2023) } else if(dataset == "hys") { - test_data <- data.table(abusive_adult = sample(c('NA', '0', '1'), 100, replace = TRUE, prob = c(0.273034917704968, 0.054280110752192, 0.67268497154284)), - chi_sex = factor(sample(c('Female', 'Male', 'NA'), 100, replace = TRUE, prob = c(0.491578218735579, 0.490924473157976, 0.0174973081064452)), levels = c('Female', 'Male'), ordered = FALSE) + test_data <- data.table(abusive_adult = sample(c(NA, '0', '1'), 100, replace = TRUE, prob = c(0.273034917704968, 0.054280110752192, 0.67268497154284)), + chi_sex = factor(sample(c('Female', 'Male', NA), 100, replace = TRUE, prob = c(0.491578218735579, 0.490924473157976, 0.0174973081064452)), levels = c('Female', 'Male'), ordered = FALSE) ) From c51974226b8386a20c3418237e17a3130a6bd581 Mon Sep 17 00:00:00 2001 From: Buie Date: Thu, 3 Apr 2025 16:05:25 -0700 Subject: [PATCH 07/21] added optioin to not report failed data models mid process trying to execute output outomatically --- tests/testthat/helper.R | 61 +++++++++++++++++++++++++++-------------- 1 file changed, 41 insertions(+), 20 deletions(-) diff --git a/tests/testthat/helper.R b/tests/testthat/helper.R index 8ad2515..bcfb9d4 100644 --- a/tests/testthat/helper.R +++ b/tests/testthat/helper.R @@ -39,60 +39,81 @@ setup_test_data <- function() { set_indicator_keys = 'indicator1, indicator2' ) - variable_modeller <- function(oneVariable, numberOfObservations) { + + variable_modeller <- function(oneVariable, numberOfObservations, varName = NA, report_empty = TRUE) { + if(any(class(oneVariable) %in% "data.table")) { + if(ncol(oneVariable) == 1) { + message(class(oneVariable)) + oneVariable <- oneVariable[,1][[1]] + message(class(oneVariable)) + message("caught DT") + } else { + stop("more than 1 column passed. Only pass a vector or one column") + } + } #note : ooooooocurrently setting 61 as categorical threshold because of HRAs. #if no match, report unmatched type - instructions <- FALSE + instructions <- NA - variableName <- sub(".*\\$.*?", "\\1", deparse(substitute(oneVariable))) + if(is.na(varName)){ + variableName <- sub(".*\\$.*?", "\\1", deparse(substitute(oneVariable))) + } else { + variableName <- varName + } #factor - if(instructions == FALSE & class(oneVariable) == "factor") { + if(is.na(instructions) & class(oneVariable) == "factor") { orderTF <- is.ordered(oneVariable) detectedLevels <- levels(oneVariable) - prop.table(table(oneVariable, useNA = "ifany")) instructions <- paste0(variableName," = factor(sample(c('",paste0(unlist(unique(oneVariable)),collapse = "', '"),"'), ", numberOfObservations,", replace = TRUE, prob = c(",paste0(prop.table(table(oneVariable, useNA = 'ifany')), collapse = ", "),")), levels = c('",paste0(detectedLevels, collapse = "', '"),"'), ordered = ", orderTF,")", collapse = "") instructions <- gsub("'NA'", "NA", instructions) instructions <- paste0(instructions, " # as a factor") } #integer: categorical - if(instructions == FALSE & class(oneVariable) == "numeric" & (length(unique(oneVariable)) <= 61 & length(oneVariable) > 61)) { + if(is.na(instructions) & (class(oneVariable) == "numeric" | class(oneVariable) == "integer") & (length(unique(oneVariable)) <= 61 & length(oneVariable) > 61)) { instructions <- paste0(variableName," = sample(c('",paste0(unlist(unique(oneVariable)),collapse = "', '"),"'), ", numberOfObservations,", replace = TRUE, prob = c(",paste0(prop.table(table(oneVariable, useNA = 'ifany')), collapse = ", "),"))", collapse = "") instructions <- gsub("'NA'", "NA", instructions) instructions <- paste0(instructions, " # as a categorical non factor") } #character: categorical - if(instructions == FALSE & class(oneVariable) == "character" & (length(unique(oneVariable)) <= 61 & length(oneVariable) > 61)) { + if(is.na(instructions) & class(oneVariable) == "character" & (length(unique(oneVariable)) <= 61 & length(oneVariable) > 61)) { instructions <- paste0(variableName," = sample(c('",paste0(unlist(unique(oneVariable)),collapse = "', '"),"'), ", numberOfObservations,", replace = TRUE, prob = c(",paste0(prop.table(table(oneVariable, useNA = 'ifany')), collapse = ", "),"))", collapse = "") instructions <- gsub("'NA'", "NA", instructions) instructions <- paste0(instructions, " # as a categorical non factor") } #continuous - if(instructions == FALSE & class(oneVariable) == "numeric" & (length(unique(oneVariable)) > 61 & length(oneVariable) > 61)) { - min(oneVariable) - max(oneVariable) - runif(numberOfObservations, min(oneVariable), max(oneVariable)) - - instructions <- paste0(variableName," = sample(c('",paste0(unlist(unique(oneVariable)),collapse = "', '"),"'), ", numberOfObservations,", replace = TRUE, prob = c(",paste0(prop.table(table(oneVariable, useNA = 'ifany')), collapse = ", "),"))", collapse = "") - instructions <- gsub("'NA'", "NA", instructions) - instructions <- paste0(instructions, " # as a categorical non factor") + if(is.na(instructions) & class(oneVariable) == "numeric" & (length(unique(oneVariable)) > 61 & length(oneVariable) > 61)) { + #uniform distribution + instructions <- paste0(variableName, " = runif(", numberOfObservations,", ", min(oneVariable), ", ", max(oneVariable),")") + instructions <- paste0(instructions, " # continuous with uniform distribution") } #if unmatched - if(instructions == FALSE) { - instructions <- paste0("# data type of ",deparse(substitute(oneVariable)) ," not modelled") + if(is.na(instructions) & report_empty) { + instructions <- paste0("# data type of ",variableName ," not modelled") + } + + if(is.na(instructions)) { + + } else{ + return(instructions) } - return(instructions) } - oneVariable <- testHYS$abusive_intimate_partner + batch_variable_modeller <- function(x) { + variable_modeller(testHYS[,..x][[1]], 100, names(testHYS)[x], report_empty = FALSE) + } + + code <- lapply(seq_along(testHYS), batch_variable_modeller) + + cat(unlist(code), sep = ",\n\n") #copy this into your DT generating code - lapply(testHYS ,variable_modeller, numberOfObservations = 100) + test <- data.table( parse(text = cat(unlist(code), sep = ",\n\n"))) generate_test_data <- function(dataset = "generic", observations = 100, seed = 1000){ ### generates a synthetic data set appropriate for testing functions relying on APDE data structures and where you do not want to use real data From f1017ce6a3eb9542d8b29f0677441cdcc971de76 Mon Sep 17 00:00:00 2001 From: Buie Date: Fri, 4 Apr 2025 03:01:36 -0700 Subject: [PATCH 08/21] have working pipeline to create generic dataframes from actual DT --- tests/testthat/helper.R | 34 ++++++++++++++++++++++++---------- 1 file changed, 24 insertions(+), 10 deletions(-) diff --git a/tests/testthat/helper.R b/tests/testthat/helper.R index bcfb9d4..b42d59e 100644 --- a/tests/testthat/helper.R +++ b/tests/testthat/helper.R @@ -41,7 +41,7 @@ setup_test_data <- function() { - variable_modeller <- function(oneVariable, numberOfObservations, varName = NA, report_empty = TRUE) { + variable_modeller <- function(oneVariable, numberOfObservations, varName = NA, report_empty = TRUE, comments = TRUE) { if(any(class(oneVariable) %in% "data.table")) { if(ncol(oneVariable) == 1) { message(class(oneVariable)) @@ -69,33 +69,44 @@ setup_test_data <- function() { detectedLevels <- levels(oneVariable) instructions <- paste0(variableName," = factor(sample(c('",paste0(unlist(unique(oneVariable)),collapse = "', '"),"'), ", numberOfObservations,", replace = TRUE, prob = c(",paste0(prop.table(table(oneVariable, useNA = 'ifany')), collapse = ", "),")), levels = c('",paste0(detectedLevels, collapse = "', '"),"'), ordered = ", orderTF,")", collapse = "") instructions <- gsub("'NA'", "NA", instructions) - instructions <- paste0(instructions, " # as a factor") + if(comments){ + instructions <- paste0(instructions, " # as a factor") + } } #integer: categorical if(is.na(instructions) & (class(oneVariable) == "numeric" | class(oneVariable) == "integer") & (length(unique(oneVariable)) <= 61 & length(oneVariable) > 61)) { instructions <- paste0(variableName," = sample(c('",paste0(unlist(unique(oneVariable)),collapse = "', '"),"'), ", numberOfObservations,", replace = TRUE, prob = c(",paste0(prop.table(table(oneVariable, useNA = 'ifany')), collapse = ", "),"))", collapse = "") instructions <- gsub("'NA'", "NA", instructions) - instructions <- paste0(instructions, " # as a categorical non factor") + if(comments){ + instructions <- paste0(instructions, " # as a categorical non factor") + } } #character: categorical if(is.na(instructions) & class(oneVariable) == "character" & (length(unique(oneVariable)) <= 61 & length(oneVariable) > 61)) { instructions <- paste0(variableName," = sample(c('",paste0(unlist(unique(oneVariable)),collapse = "', '"),"'), ", numberOfObservations,", replace = TRUE, prob = c(",paste0(prop.table(table(oneVariable, useNA = 'ifany')), collapse = ", "),"))", collapse = "") instructions <- gsub("'NA'", "NA", instructions) - instructions <- paste0(instructions, " # as a categorical non factor") + if(comments){ + + instructions <- paste0(instructions, " # as a categorical non factor") + } } #continuous if(is.na(instructions) & class(oneVariable) == "numeric" & (length(unique(oneVariable)) > 61 & length(oneVariable) > 61)) { #uniform distribution - instructions <- paste0(variableName, " = runif(", numberOfObservations,", ", min(oneVariable), ", ", max(oneVariable),")") - instructions <- paste0(instructions, " # continuous with uniform distribution") + instructions <- paste0(variableName, " = runif(", numberOfObservations,", ", min(oneVariable, na.rm = TRUE), ", ", max(oneVariable, na.rm = TRUE),")") + if(comments){ + instructions <- paste0(instructions, " # continuous with uniform distribution") + } } #if unmatched if(is.na(instructions) & report_empty) { - instructions <- paste0("# data type of ",variableName ," not modelled") + if(comments){ + instructions <- paste0("# data type of ",variableName ," not modelled") + } } if(is.na(instructions)) { @@ -106,14 +117,16 @@ setup_test_data <- function() { } batch_variable_modeller <- function(x) { - variable_modeller(testHYS[,..x][[1]], 100, names(testHYS)[x], report_empty = FALSE) + variable_modeller(testHYS[,..x][[1]], 100, names(testHYS)[x], report_empty = FALSE, comments = FALSE) } code <- lapply(seq_along(testHYS), batch_variable_modeller) - cat(unlist(code), sep = ",\n\n") #copy this into your DT generating code + variablesToAdd <- paste(unlist(code), collapse =", ") #copy this into your DT generating code + - test <- data.table( parse(text = cat(unlist(code), sep = ",\n\n"))) + + eval( parse(text =paste0(" test <- data.table(", variablesToAdd, ")",collapse = ""))) generate_test_data <- function(dataset = "generic", observations = 100, seed = 1000){ ### generates a synthetic data set appropriate for testing functions relying on APDE data structures and where you do not want to use real data @@ -157,6 +170,7 @@ setup_test_data <- function() { indicator3 = as.factor(sample(c("<20","21-40","41-60","61<"), observations, replace = T)), chi_year = 202) + } else if(dataset == "hys") { } return(test_data) } From d050bc40f89b3891dd196f827dcacf5da8016b09 Mon Sep 17 00:00:00 2001 From: Buie Date: Mon, 7 Apr 2025 16:49:29 -0700 Subject: [PATCH 09/21] completed prototype of data generator --- tests/testthat/helper.R | 154 +++++++++++++++++++++++----------------- 1 file changed, 90 insertions(+), 64 deletions(-) diff --git a/tests/testthat/helper.R b/tests/testthat/helper.R index b42d59e..e0212f0 100644 --- a/tests/testthat/helper.R +++ b/tests/testthat/helper.R @@ -41,92 +41,118 @@ setup_test_data <- function() { - variable_modeller <- function(oneVariable, numberOfObservations, varName = NA, report_empty = TRUE, comments = TRUE) { - if(any(class(oneVariable) %in% "data.table")) { - if(ncol(oneVariable) == 1) { - message(class(oneVariable)) - oneVariable <- oneVariable[,1][[1]] - message(class(oneVariable)) - message("caught DT") - } else { - stop("more than 1 column passed. Only pass a vector or one column") - } - } - #note : ooooooocurrently setting 61 as categorical threshold because of HRAs. - #if no match, report unmatched type - instructions <- NA + number_of_observations <- 100 + comments <- TRUE + return_code <- FALSE - if(is.na(varName)){ - variableName <- sub(".*\\$.*?", "\\1", deparse(substitute(oneVariable))) - } else { - variableName <- varName + generate.test.data <- function(ph.data, number_of_observations, years, return_code = TRUE, comments = TRUE) { + ### given a data table of public health data, can return code or a DT of identical structure and similar, but non-correlated, values for each variable. + ### warning: r has a character limit of 4094 for executing a line of code in the console. If code is requested, and the resulting code is longer, you must break this into smaller chunks (and rbind the results) or source the code as a script (I need to test if this will work) + ### warning: currently will create multiple years, but reads the received data set as if it were one year, and models multiple years by repeating the model process with shifted seed + ### warning: number.of.observations is of the final dataset. If the requested number does not divide evenly across the number of years, the result will be rounded up and the user should remove observations if necessary + ### warning: the data returned is modelled on the data given but correlations between variables are not. This effectively anonymizes results as long as the underlying populations are diverse or large enough. A small enough population may provide sufficient certainty of the results. (build an example, like seperate runs of this with a mono-race table bound together, versus a multi race table) + + if(!return_code & comments) { + message("user has requested data, comments set to FALSE") + comments <- FALSE } - #factor - if(is.na(instructions) & class(oneVariable) == "factor") { - orderTF <- is.ordered(oneVariable) - detectedLevels <- levels(oneVariable) - instructions <- paste0(variableName," = factor(sample(c('",paste0(unlist(unique(oneVariable)),collapse = "', '"),"'), ", numberOfObservations,", replace = TRUE, prob = c(",paste0(prop.table(table(oneVariable, useNA = 'ifany')), collapse = ", "),")), levels = c('",paste0(detectedLevels, collapse = "', '"),"'), ordered = ", orderTF,")", collapse = "") - instructions <- gsub("'NA'", "NA", instructions) - if(comments){ - instructions <- paste0(instructions, " # as a factor") + variable_modeller <- function(oneVariable, number_of_observations, varName = NA, comments = TRUE) { + if(any(class(oneVariable) %in% "data.table")) { + if(ncol(oneVariable) == 1) { + message(class(oneVariable)) + oneVariable <- oneVariable[,1][[1]] + message(class(oneVariable)) + message("caught DT") + } else { + stop("more than 1 column passed. Only pass a vector or one column") + } } - } + #note : ooooooocurrently setting 61 as categorical threshold because of HRAs. + + #if no match, report unmatched type + instructions <- NA - #integer: categorical - if(is.na(instructions) & (class(oneVariable) == "numeric" | class(oneVariable) == "integer") & (length(unique(oneVariable)) <= 61 & length(oneVariable) > 61)) { - instructions <- paste0(variableName," = sample(c('",paste0(unlist(unique(oneVariable)),collapse = "', '"),"'), ", numberOfObservations,", replace = TRUE, prob = c(",paste0(prop.table(table(oneVariable, useNA = 'ifany')), collapse = ", "),"))", collapse = "") - instructions <- gsub("'NA'", "NA", instructions) - if(comments){ - instructions <- paste0(instructions, " # as a categorical non factor") + if(is.na(varName)){ + variableName <- sub(".*\\$.*?", "\\1", deparse(substitute(oneVariable))) + } else { + variableName <- varName } - } - #character: categorical - if(is.na(instructions) & class(oneVariable) == "character" & (length(unique(oneVariable)) <= 61 & length(oneVariable) > 61)) { - instructions <- paste0(variableName," = sample(c('",paste0(unlist(unique(oneVariable)),collapse = "', '"),"'), ", numberOfObservations,", replace = TRUE, prob = c(",paste0(prop.table(table(oneVariable, useNA = 'ifany')), collapse = ", "),"))", collapse = "") - instructions <- gsub("'NA'", "NA", instructions) - if(comments){ + #factor + if(is.na(instructions) & class(oneVariable) == "factor") { + orderTF <- is.ordered(oneVariable) + detectedLevels <- levels(oneVariable) + instructions <- paste0(variableName," = factor(sample(c('",paste0(unlist(unique(oneVariable)),collapse = "', '"),"'), ", number_of_observations,", replace = TRUE, prob = c(",paste0(prop.table(table(oneVariable, useNA = 'ifany')), collapse = ", "),")), levels = c('",paste0(detectedLevels, collapse = "', '"),"'), ordered = ", orderTF,")", collapse = "") + instructions <- gsub("'NA'", "NA", instructions) + if(comments){ + instructions <- paste0(instructions, " # as a factor") + } + } - instructions <- paste0(instructions, " # as a categorical non factor") + #integer: categorical + if(is.na(instructions) & (class(oneVariable) == "numeric" | class(oneVariable) == "integer") & (length(unique(oneVariable)) <= 61 & length(oneVariable) > 61)) { + instructions <- paste0(variableName," = sample(c('",paste0(unlist(unique(oneVariable)),collapse = "', '"),"'), ", number_of_observations,", replace = TRUE, prob = c(",paste0(prop.table(table(oneVariable, useNA = 'ifany')), collapse = ", "),"))", collapse = "") + instructions <- gsub("'NA'", "NA", instructions) + if(comments){ + instructions <- paste0(instructions, " # as a categorical non factor") + } } - } - #continuous - if(is.na(instructions) & class(oneVariable) == "numeric" & (length(unique(oneVariable)) > 61 & length(oneVariable) > 61)) { - #uniform distribution - instructions <- paste0(variableName, " = runif(", numberOfObservations,", ", min(oneVariable, na.rm = TRUE), ", ", max(oneVariable, na.rm = TRUE),")") - if(comments){ - instructions <- paste0(instructions, " # continuous with uniform distribution") + #character: categorical + if(is.na(instructions) & class(oneVariable) == "character" & (length(unique(oneVariable)) <= 61 & length(oneVariable) > 61)) { + instructions <- paste0(variableName," = sample(c('",paste0(unlist(unique(oneVariable)),collapse = "', '"),"'), ", number_of_observations,", replace = TRUE, prob = c(",paste0(prop.table(table(oneVariable, useNA = 'ifany')), collapse = ", "),"))", collapse = "") + instructions <- gsub("'NA'", "NA", instructions) + if(comments){ + + instructions <- paste0(instructions, " # as a categorical non factor") + } + } + + #continuous + if(is.na(instructions) & class(oneVariable) == "numeric" & (length(unique(oneVariable)) > 61 & length(oneVariable) > 61)) { + #uniform distribution + instructions <- paste0(variableName, " = runif(", number_of_observations,", ", min(oneVariable, na.rm = TRUE), ", ", max(oneVariable, na.rm = TRUE),")") + if(comments){ + instructions <- paste0(instructions, " # continuous with uniform distribution") + } } - } - #if unmatched - if(is.na(instructions) & report_empty) { - if(comments){ + #if unmatched + if(is.na(instructions) & comments) { instructions <- paste0("# data type of ",variableName ," not modelled") } - } - if(is.na(instructions)) { + if(is.na(instructions)) { - } else{ - return(instructions) + } else{ + return(instructions) + } } - } - batch_variable_modeller <- function(x) { - variable_modeller(testHYS[,..x][[1]], 100, names(testHYS)[x], report_empty = FALSE, comments = FALSE) - } + batch_variable_modeller <- function(x) { + variable_modeller(ph.data[,..x][[1]], number_of_observations, names(ph.data)[x], comments = comments) + } + + codeList <- lapply(seq_along(ph.data), batch_variable_modeller) - code <- lapply(seq_along(testHYS), batch_variable_modeller) + codeText <- paste(unlist(codeList), collapse =", \n" ) #copy this into your DT generating code - variablesToAdd <- paste(unlist(code), collapse =", ") #copy this into your DT generating code + if(return_code) { + cat(codeText) + } else { + + + eval( parse(text = paste0("DT <- data.table(", codeText, ")",collapse = ""))) + + return(DT) + } + + } - eval( parse(text =paste0(" test <- data.table(", variablesToAdd, ")",collapse = ""))) generate_test_data <- function(dataset = "generic", observations = 100, seed = 1000){ ### generates a synthetic data set appropriate for testing functions relying on APDE data structures and where you do not want to use real data @@ -145,7 +171,7 @@ setup_test_data <- function() { test_data <- data.table( id = 1:observations, chi_geo_kc = sample(c(0,1), observations, replace = T), - chi_race_4 = factor(sample(c("Asian", "AIAN", "Black", "Hispanic", "NHPI", "White", "Other", "Multiple", NA), numberOfObservations, replace = T, prob = c(.19,.01,.07,.11,.01,.35,.07,.14,.02)), levels = c("Asian", "AIAN", "Black", "Hispanic", "NHPI", "White", "Other", "Multiple", NA)), + chi_race_4 = factor(sample(c("Asian", "AIAN", "Black", "Hispanic", "NHPI", "White", "Other", "Multiple", NA), number_of_observations, replace = T, prob = c(.19,.01,.07,.11,.01,.35,.07,.14,.02)), levels = c("Asian", "AIAN", "Black", "Hispanic", "NHPI", "White", "Other", "Multiple", NA)), chi_sex = as.factor(sample(c("Male","Female"), observations, replace = T)), chi_geo_region = factor(sample(c("South", "North", "Seattle", "East"), observations, replace = T), levels = c("South","North","Seattle","East")), indicator1 = as.factor(sample(c("never","sometimes", "always", NA), observations, replace = T)), @@ -162,7 +188,7 @@ setup_test_data <- function() { test_data <- data.table( id = 1:observations, chi_geo_kc = sample(c(0,1), observations, replace = T), - chi_race_eth8 = factor(sample(c("Asian", "AIAN", "Black", "Hispanic", "NHPI", "White", "Other", "Multiple", NA), numberOfObservations, replace = T, prob = c(.19,.01,.07,.11,.01,.35,.07,.14,.02)), levels = c("Asian", "AIAN", "Black", "Hispanic", "NHPI", "White", "Other", "Multiple", NA)), + chi_race_eth8 = factor(sample(c("Asian", "AIAN", "Black", "Hispanic", "NHPI", "White", "Other", "Multiple", NA), number_of_observations, replace = T, prob = c(.19,.01,.07,.11,.01,.35,.07,.14,.02)), levels = c("Asian", "AIAN", "Black", "Hispanic", "NHPI", "White", "Other", "Multiple", NA)), chi_sex = as.factor(sample(c("Male","Female"), observations, replace = T)), chi_geo_region = as.factor(sample(c("South", "North", "Seattle", "East"), observations, replace = T)), indicator1 = as.factor(sample(c("never","sometimes", "always", NA), observations, replace = T)), From 84c3371bd329113191f10703a06a2c944743717d Mon Sep 17 00:00:00 2001 From: Buie Date: Mon, 7 Apr 2025 17:19:46 -0700 Subject: [PATCH 10/21] bug fix and format handling for data generated. rename to data modeller --- tests/testthat/helper.R | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/tests/testthat/helper.R b/tests/testthat/helper.R index e0212f0..a74de18 100644 --- a/tests/testthat/helper.R +++ b/tests/testthat/helper.R @@ -46,7 +46,7 @@ setup_test_data <- function() { comments <- TRUE return_code <- FALSE - generate.test.data <- function(ph.data, number_of_observations, years, return_code = TRUE, comments = TRUE) { + data_modeller <- function(ph.data, number_of_observations, years, return_code = TRUE, comments = TRUE) { ### given a data table of public health data, can return code or a DT of identical structure and similar, but non-correlated, values for each variable. ### warning: r has a character limit of 4094 for executing a line of code in the console. If code is requested, and the resulting code is longer, you must break this into smaller chunks (and rbind the results) or source the code as a script (I need to test if this will work) ### warning: currently will create multiple years, but reads the received data set as if it were one year, and models multiple years by repeating the model process with shifted seed @@ -120,8 +120,12 @@ setup_test_data <- function() { } #if unmatched - if(is.na(instructions) & comments) { - instructions <- paste0("# data type of ",variableName ," not modelled") + if(is.na(instructions)) { + + instructions <- paste0("`",variableName,"`", " = NA") + if(comments){ + instructions <- paste0(instructions, " # data type not modelled") + } } if(is.na(instructions)) { @@ -137,11 +141,19 @@ setup_test_data <- function() { codeList <- lapply(seq_along(ph.data), batch_variable_modeller) - codeText <- paste(unlist(codeList), collapse =", \n" ) #copy this into your DT generating code + if(comments) { + + codeListFormatted <- c(gsub(" #", ", #", codeList[1:(length(codeList)-1)]), codeList[length(codeList)]) + codeText <- paste(unlist(codeListFormatted), collapse =" \n" ) #copy this into your DT generating code + + } else { + codeText <- paste(unlist(codeList), collapse =", " ) #copy this into your DT generating code + } if(return_code) { cat(codeText) + return(codeList) } else { @@ -153,6 +165,8 @@ setup_test_data <- function() { } +test <- data_modeller(testHYS, number_of_observations = 1000, years = 2020, return_code = FALSE, comments = TRUE) +testT <- data_modeller(testHYS, number_of_observations = 1000, years = 2020, return_code = TRUE, comments = TRUE) generate_test_data <- function(dataset = "generic", observations = 100, seed = 1000){ ### generates a synthetic data set appropriate for testing functions relying on APDE data structures and where you do not want to use real data From 9cde19dc83e03b0bac3491b293dbdfb4f13a48c3 Mon Sep 17 00:00:00 2001 From: Buie Date: Tue, 8 Apr 2025 12:07:44 -0700 Subject: [PATCH 11/21] updated comments for current state --- tests/testthat/helper.R | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/testthat/helper.R b/tests/testthat/helper.R index a74de18..a71b928 100644 --- a/tests/testthat/helper.R +++ b/tests/testthat/helper.R @@ -48,10 +48,10 @@ setup_test_data <- function() { data_modeller <- function(ph.data, number_of_observations, years, return_code = TRUE, comments = TRUE) { ### given a data table of public health data, can return code or a DT of identical structure and similar, but non-correlated, values for each variable. - ### warning: r has a character limit of 4094 for executing a line of code in the console. If code is requested, and the resulting code is longer, you must break this into smaller chunks (and rbind the results) or source the code as a script (I need to test if this will work) - ### warning: currently will create multiple years, but reads the received data set as if it were one year, and models multiple years by repeating the model process with shifted seed - ### warning: number.of.observations is of the final dataset. If the requested number does not divide evenly across the number of years, the result will be rounded up and the user should remove observations if necessary - ### warning: the data returned is modelled on the data given but correlations between variables are not. This effectively anonymizes results as long as the underlying populations are diverse or large enough. A small enough population may provide sufficient certainty of the results. (build an example, like seperate runs of this with a mono-race table bound together, versus a multi race table) + ### warning: r has a run-instruction character limit of 4094. If code is requested, and the resulting instruction is longer, you must break this into seperate instructions, such making several smaller DTs and binding them together. (test if sourcing as a script is an exception to the limit) + ### warning: (not implemented) currently will create multiple years, but reads the received data set as if it were one year, and models multiple years by repeating the model process with shifted seed + ### warning: (not implemented) number.of.observations is of the final dataset. If the requested number does not divide evenly across the number of years, the result will be rounded up and the user should remove observations if necessary + ### warning: the data returned is modelled on the data given but correlations between variables are not. This effectively anonymizes results as long as the underlying populations are diverse or large enough. A small enough population may provide sufficient certainty of the results. (note to self for vignette: build an example showing seperate runs. One of a DT built from multiple mono-race reuslts bound together, versus building results from a table with observations from multiple races. Show how the results in the former more closely resembles results by race from actual data.) if(!return_code & comments) { message("user has requested data, comments set to FALSE") From 40695eba436fcd5718241bcbde10b5991a15e0cc Mon Sep 17 00:00:00 2001 From: Buie Date: Wed, 9 Apr 2025 12:20:06 -0700 Subject: [PATCH 12/21] cleaning some testing code adding two set generic and generic analysis set --- tests/testthat/helper.R | 116 +++++++++++++++++++++++++--------------- 1 file changed, 74 insertions(+), 42 deletions(-) diff --git a/tests/testthat/helper.R b/tests/testthat/helper.R index a71b928..c2e2d59 100644 --- a/tests/testthat/helper.R +++ b/tests/testthat/helper.R @@ -40,14 +40,9 @@ setup_test_data <- function() { ) - - - number_of_observations <- 100 - comments <- TRUE - return_code <- FALSE - - data_modeller <- function(ph.data, number_of_observations, years, return_code = TRUE, comments = TRUE) { - ### given a data table of public health data, can return code or a DT of identical structure and similar, but non-correlated, values for each variable. + data_modeller <- function(ph.data, number_of_observations, return_code = TRUE, comments = TRUE) { + ### receives a data table of public health data, number of observations and user decision if they want code (or a DT) and, if code, if it should be commented + ### returns code or a DT of identical structure and similar, but non-correlated, values for each variable provided that can be modeled. If comments are enabled, will return comment for non modeled variables. ### warning: r has a run-instruction character limit of 4094. If code is requested, and the resulting instruction is longer, you must break this into seperate instructions, such making several smaller DTs and binding them together. (test if sourcing as a script is an exception to the limit) ### warning: (not implemented) currently will create multiple years, but reads the received data set as if it were one year, and models multiple years by repeating the model process with shifted seed ### warning: (not implemented) number.of.observations is of the final dataset. If the requested number does not divide evenly across the number of years, the result will be rounded up and the user should remove observations if necessary @@ -144,10 +139,10 @@ setup_test_data <- function() { if(comments) { codeListFormatted <- c(gsub(" #", ", #", codeList[1:(length(codeList)-1)]), codeList[length(codeList)]) - codeText <- paste(unlist(codeListFormatted), collapse =" \n" ) #copy this into your DT generating code + codeText <- paste(unlist(codeListFormatted), collapse =" \n" ) } else { - codeText <- paste(unlist(codeList), collapse =", " ) #copy this into your DT generating code + codeText <- paste(unlist(codeList), collapse =", " ) } if(return_code) { @@ -165,13 +160,11 @@ setup_test_data <- function() { } -test <- data_modeller(testHYS, number_of_observations = 1000, years = 2020, return_code = FALSE, comments = TRUE) -testT <- data_modeller(testHYS, number_of_observations = 1000, years = 2020, return_code = TRUE, comments = TRUE) - generate_test_data <- function(dataset = "generic", observations = 100, seed = 1000){ + generate_test_data <- function(dataset = "generic", observations = 100, seed = 1000, years = 2023){ ### generates a synthetic data set appropriate for testing functions relying on APDE data structures and where you do not want to use real data - ### receives description of data set to emulate, number of observations to include, and a seed. If dataset is "generic" will returned structure will have idealized chi values and generic indicators - ### returns a data.table of synthetic data + ### receives description of data set to emulate, number of observations to include, a seed and number of years. + ### returns a data.table of synthetic data. If dataset is "generic" the returned structure will have idealized chi values and generic indicators # input validation datasetOptions <- c("generic", "hys") @@ -180,44 +173,52 @@ testT <- data_modeller(testHYS, number_of_observations = 1000, years = 2020, ret stop(paste0("dataset must be one of: '", paste(datasetOptions, collapse = "', '"),"'")) } - set.seed(seed) + year_iterator <- function(observations, seed, years) { + + } + if(dataset == "generic") { - test_data <- data.table( - id = 1:observations, - chi_geo_kc = sample(c(0,1), observations, replace = T), - chi_race_4 = factor(sample(c("Asian", "AIAN", "Black", "Hispanic", "NHPI", "White", "Other", "Multiple", NA), number_of_observations, replace = T, prob = c(.19,.01,.07,.11,.01,.35,.07,.14,.02)), levels = c("Asian", "AIAN", "Black", "Hispanic", "NHPI", "White", "Other", "Multiple", NA)), - chi_sex = as.factor(sample(c("Male","Female"), observations, replace = T)), - chi_geo_region = factor(sample(c("South", "North", "Seattle", "East"), observations, replace = T), levels = c("South","North","Seattle","East")), - indicator1 = as.factor(sample(c("never","sometimes", "always", NA), observations, replace = T)), - indicator2 = as.factor(sample(c(1,2,3,4, NA), observations, replace = T)), - indicator3 = as.factor(sample(c("<20","21-40","41-60","61<"), observations, replace = T)), - chi_year = 2023) + + for(year in years) { + seed <- seed*year + DTIteration <- data.table( + id = 1:observations, + chi_geo_kc = sample(c(0,1), observations, replace = T), + chi_race_4 = factor(sample(c("Asian", "AIAN", "Black", "Hispanic", "NHPI", "White", "Other", "Multiple", NA), number_of_observations, replace = T, prob = c(.19,.01,.07,.11,.01,.35,.07,.14,.02)), levels = c("Asian", "AIAN", "Black", "Hispanic", "NHPI", "White", "Other", "Multiple", NA)), + chi_sex = as.factor(sample(c("Male","Female"), observations, replace = T)), + chi_geo_region = factor(sample(c("South", "North", "Seattle", "East"), observations, replace = T), levels = c("South","North","Seattle","East")), + indicator1 = as.factor(sample(c("never","sometimes", "always", NA), observations, replace = T)), + indicator2 = as.factor(sample(c(1,2,3,4, NA), observations, replace = T)), + indicator3 = as.factor(sample(c("<20","21-40","41-60","61<"), observations, replace = T)), + chi_year = year) + if(exists("returnDT")) { + returnDT <- rbind(returnDT, DTIteration) + } else { + returnDT <- DTIteration + } + } } else if(dataset == "hys") { - test_data <- data.table(abusive_adult = sample(c(NA, '0', '1'), 100, replace = TRUE, prob = c(0.273034917704968, 0.054280110752192, 0.67268497154284)), - chi_sex = factor(sample(c('Female', 'Male', NA), 100, replace = TRUE, prob = c(0.491578218735579, 0.490924473157976, 0.0174973081064452)), levels = c('Female', 'Male'), ordered = FALSE) + returnDT <- data.table() - ) - } else if(dataset == "hysold") { - test_data <- data.table( - id = 1:observations, - chi_geo_kc = sample(c(0,1), observations, replace = T), - chi_race_eth8 = factor(sample(c("Asian", "AIAN", "Black", "Hispanic", "NHPI", "White", "Other", "Multiple", NA), number_of_observations, replace = T, prob = c(.19,.01,.07,.11,.01,.35,.07,.14,.02)), levels = c("Asian", "AIAN", "Black", "Hispanic", "NHPI", "White", "Other", "Multiple", NA)), - chi_sex = as.factor(sample(c("Male","Female"), observations, replace = T)), - chi_geo_region = as.factor(sample(c("South", "North", "Seattle", "East"), observations, replace = T)), - indicator1 = as.factor(sample(c("never","sometimes", "always", NA), observations, replace = T)), - indicator2 = as.factor(sample(c(1,2,3,4, NA), observations, replace = T)), - indicator3 = as.factor(sample(c("<20","21-40","41-60","61<"), observations, replace = T)), - chi_year = 202) - } else if(dataset == "hys") { } - return(test_data) + return(returnDT) } + test_data_generic <- generate_test_data("generic", 100, 1000, c(2016:2023)) + # testHYS <- get_data_hys() + # testBRFSS <- as_table_brfss(get_data_brfss()) + # + # + # inputDT <- testHYS + # + # testDT <- data_modeller(inputDT, number_of_observations = 1000, years = 2020, return_code = FALSE, comments = TRUE) + # testCode <- data_modeller(inputDT, number_of_observations = 1000, years = 2020, return_code = TRUE, comments = TRUE) test_analysis_set_twosets <- data.table( + #this should work with the generic data set cat1 = rep(c('Regions', 'Gender', 'Race/ethnicity'),2), cat1_varname = rep(c('chi_geo_region', 'chi_sex', 'race4'),2), `_kingcounty` = c('x'), @@ -229,6 +230,36 @@ testT <- data_modeller(testHYS, number_of_observations = 1000, years = 2020, ret set_indicator_keys = c(rep(c('indicator1, indicator2'),3), rep("indicator3",3)) ) + test_analysis_set_twosets_estimates <- data.table( + for(indicator in c("indicator1","indicator2")) { + partialDT <- data.table( + tab = c(rep('demgroups', 4), '_kingcounty'), + year = c('2023'), + cat1 = c('Region', 'Region', 'Region', 'Region', 'King County'), + cat1_group = c("East", "North", "Seattle", "South", 'King County'), + cat1_varname = c('chi_geo_region', 'chi_geo_region', 'chi_geo_region', 'chi_geo_region', 'chi_geo_kc'), + cat2 = NA_character_, + cat2_group = NA_character_, + cat2_varname = NA_character_, + data_source = 'JustTesting', + caution = NA_character_, + suppression = NA_character_, + chi = 1, + source_date = Sys.Date(), + run_date = Sys.Date(), + numerator = c(111, 175, 210, 600, 430000), + denominator = c(1000, 1500, 2000, 2500, 2200000) + ) + } + + + ) + test_estimates[, result := numerator / denominator] + test_estimates[, se := sqrt((result * (1-result)) / denominator)] + test_estimates[, rse := 100 * se / result] + test_estimates[, lower_bound := result - 1.96 * se] + test_estimates[, upper_bound := result + 1.96 * se] + # Sample instructions ---- test_instructions <- data.table( indicator_key = c("indicator1", "indicator2", "indicator1", "indicator2"), @@ -340,6 +371,7 @@ testT <- data_modeller(testHYS, number_of_observations = 1000, years = 2020, ret list(my.analytic = test_analytic, my.analysis_set = test_analysis_set, my.analysis_set_twosets = test_analysis_set_twosets, + my.generic_data = test_data_generic, my.estimate= test_estimates, my.estimate_old= test_estimates_old, my.metadata = test_metadata, From eaaadae1d9d84ac29c15f53e01f24e706440ad4b Mon Sep 17 00:00:00 2001 From: Buie Date: Wed, 9 Apr 2025 17:02:54 -0700 Subject: [PATCH 13/21] fixed parsing error and improved and standardized reporting for data modeller --- tests/testthat/helper.R | 190 +++++++++++++++++++++++++++++++++++----- 1 file changed, 167 insertions(+), 23 deletions(-) diff --git a/tests/testthat/helper.R b/tests/testthat/helper.R index c2e2d59..7d03fb8 100644 --- a/tests/testthat/helper.R +++ b/tests/testthat/helper.R @@ -40,18 +40,29 @@ setup_test_data <- function() { ) + ################################ migrate this out ########################################################### + + data_modeller <- function(ph.data, number_of_observations, return_code = TRUE, comments = TRUE) { ### receives a data table of public health data, number of observations and user decision if they want code (or a DT) and, if code, if it should be commented + ### "number_of_observations" may be a number or a string ### returns code or a DT of identical structure and similar, but non-correlated, values for each variable provided that can be modeled. If comments are enabled, will return comment for non modeled variables. ### warning: r has a run-instruction character limit of 4094. If code is requested, and the resulting instruction is longer, you must break this into seperate instructions, such making several smaller DTs and binding them together. (test if sourcing as a script is an exception to the limit) ### warning: (not implemented) currently will create multiple years, but reads the received data set as if it were one year, and models multiple years by repeating the model process with shifted seed ### warning: (not implemented) number.of.observations is of the final dataset. If the requested number does not divide evenly across the number of years, the result will be rounded up and the user should remove observations if necessary ### warning: the data returned is modelled on the data given but correlations between variables are not. This effectively anonymizes results as long as the underlying populations are diverse or large enough. A small enough population may provide sufficient certainty of the results. (note to self for vignette: build an example showing seperate runs. One of a DT built from multiple mono-race reuslts bound together, versus building results from a table with observations from multiple races. Show how the results in the former more closely resembles results by race from actual data.) - if(!return_code & comments) { - message("user has requested data, comments set to FALSE") - comments <- FALSE + if(inherits(number_of_observations, "character") & return_code == FALSE) { + number_of_observations <- as.integer(number_of_observations) + if(is.na(number_of_observations)) { + stop("user has requested data, but 'number_of_observations' could not be coerced to an integer. 'number_of_observations' must be an integer to perform calculations.") + } } + #if(!return_code & comments) { + # message("user has requested data, 'comments' set to FALSE.") + # comments <- FALSE + #} + variable_modeller <- function(oneVariable, number_of_observations, varName = NA, comments = TRUE) { if(any(class(oneVariable) %in% "data.table")) { @@ -119,7 +130,7 @@ setup_test_data <- function() { instructions <- paste0("`",variableName,"`", " = NA") if(comments){ - instructions <- paste0(instructions, " # data type not modelled") + instructions <- paste0(instructions, " # data type not modeled") } } @@ -138,28 +149,52 @@ setup_test_data <- function() { if(comments) { - codeListFormatted <- c(gsub(" #", ", #", codeList[1:(length(codeList)-1)]), codeList[length(codeList)]) - codeText <- paste(unlist(codeListFormatted), collapse =" \n" ) + codeListParsed <- c(list("DT <- data.table("),gsub(" #", ", #", codeList[1:(length(codeList)-1)]), gsub(" #",") #",codeList[length(codeList)])) + + codeText <- paste(unlist(codeListParsed), collapse =" \n" ) } else { - codeText <- paste(unlist(codeList), collapse =", " ) + codeListParsed <- c(list("DT <- data.table("),paste0(codeList[1:(length(codeList)-1)], ","), paste0(codeList[length(codeList)], ")")) + + codeText <- paste(unlist(codeListParsed), collapse =" \n" ) } if(return_code) { + #codeText <- paste(unlist(codeList), collapse =" \n" ) cat(codeText) return(codeList) } else { - - eval( parse(text = paste0("DT <- data.table(", codeText, ")",collapse = ""))) - + cat(codeText) + eval( parse(text = paste0(codeText))) + # eval( parse(text = paste0("DT <- data.table(", codeText,collapse = ""))) return(DT) } } + ph.data <- get_data_birth(cols = c('race4', 'chi_age', 'hra20_name', 'sex', 'birth_weight_grams')) + + todo <- data_modeller(ph.data = ph.data, number_of_observations = 10000, return_code = F, comments = TRUE) + + tada <- data_modeller(ph.data = ph.data, number_of_observations = 10000, return_code = T, comments = TRUE) + + codeListParsed <- c(list("DT <- data.table("),gsub(" #", ", #", tada[1:(length(tada)-1)]), gsub(" #",") #",tada[length(tada)])) + + codeText <- paste(unlist(codeListParsed), collapse =" \n" ) + + tada <- eval( parse(text = paste0(codeText))) + + + + str(ph.data) + str(todo) + str(tada) + + + ################################ end migrate this out ########################################################### generate_test_data <- function(dataset = "generic", observations = 100, seed = 1000, years = 2023){ ### generates a synthetic data set appropriate for testing functions relying on APDE data structures and where you do not want to use real data @@ -167,7 +202,7 @@ setup_test_data <- function() { ### returns a data.table of synthetic data. If dataset is "generic" the returned structure will have idealized chi values and generic indicators # input validation - datasetOptions <- c("generic", "hys") + datasetOptions <- c("generic", "brfss", "death") dataset <- tolower(dataset) if(!(dataset %in% datasetOptions)) { stop(paste0("dataset must be one of: '", paste(datasetOptions, collapse = "', '"),"'")) @@ -178,13 +213,12 @@ setup_test_data <- function() { } if(dataset == "generic") { - for(year in years) { seed <- seed*year DTIteration <- data.table( id = 1:observations, chi_geo_kc = sample(c(0,1), observations, replace = T), - chi_race_4 = factor(sample(c("Asian", "AIAN", "Black", "Hispanic", "NHPI", "White", "Other", "Multiple", NA), number_of_observations, replace = T, prob = c(.19,.01,.07,.11,.01,.35,.07,.14,.02)), levels = c("Asian", "AIAN", "Black", "Hispanic", "NHPI", "White", "Other", "Multiple", NA)), + chi_race_4 = factor(sample(c("Asian", "AIAN", "Black", "Hispanic", "NHPI", "White", "Other", "Multiple", NA), observations, replace = T, prob = c(.19,.01,.07,.11,.01,.35,.07,.14,.02)), levels = c("Asian", "AIAN", "Black", "Hispanic", "NHPI", "White", "Other", "Multiple", NA)), chi_sex = as.factor(sample(c("Male","Female"), observations, replace = T)), chi_geo_region = factor(sample(c("South", "North", "Seattle", "East"), observations, replace = T), levels = c("South","North","Seattle","East")), indicator1 = as.factor(sample(c("never","sometimes", "always", NA), observations, replace = T)), @@ -197,25 +231,133 @@ setup_test_data <- function() { returnDT <- DTIteration } } - } else if(dataset == "hys") { - returnDT <- data.table() + } else if(dataset == "death") { + for(year in years) { + seed <- seed*year + DTIteration <- data.table( + `state_file_number` = NA, # data type not modelled, + `underlying_cod_code` = NA, # data type not modelled, + age6 = sample(c('65-74', '75+', '25-44', '45-64', '<18', '18-24'), observations, replace = TRUE, prob = c(0.0107097718352957, 0.0114414953768376, 0.0644581919776492, 0.184660413756403, 0.184194771502694, 0.544535355551121)), # as a categorical non factor, + bigcities = sample(c(NA, 'Seattle city', 'Auburn city', 'Kent city', 'Federal Way city', 'Bellevue city', 'Renton city', 'Kirkland city', 'Redmond city'), observations, replace = TRUE, prob = c(0.0491585179272268, 0.0566753143085213, 0.0538814607862702, 0.0679837690414422, 0.0367192177210138, 0.0190248120800905, 0.052351493381228, 0.249850329275594, 0.414355085478614)), # as a categorical non factor, + `hra20_name` = NA, # data type not modelled, + chi_sex = factor(sample(c('Female', 'Male', NA), observations, replace = TRUE, prob = c(0.468036985299009, 0.531763453735116, 0.000199560965875075)), levels = c('Female', 'Male'), ordered = FALSE), # as a factor, + chi_geo_kc = sample(c('King County'), observations, replace = TRUE, prob = c(1)), # as a categorical non factor, + pov200grp = sample(c(NA, 'Very high poverty areas', 'High poverty areas', 'Medium poverty areas', 'Low poverty areas'), observations, replace = TRUE, prob = c(0.287234750216191, 0.174283243530899, 0.24998336991951, 0.285771303133107, 0.00272733320029269)), # as a categorical non factor, + race3 = factor(sample(c('White', 'Black', NA, 'Asian', 'Multiple', 'AIAN', 'NHPI'), observations, replace = TRUE, prob = c(0.0782278986230293, 0.763054613184328, 0.0178939666067984, 0.111355018958292, 0.0109758531231291, 0.0108428124792124, 0.0076498370252112)), levels = c('Black', 'White', 'Multiple', 'Asian', 'AIAN', 'NHPI'), ordered = FALSE), # as a factor, + race4 = factor(sample(c('White', 'Hispanic', 'Black', NA, 'Asian', 'Multiple', 'AIAN', 'NHPI'), observations, replace = TRUE, prob = c(0.0771635734716956, 0.727200159648773, 0.0169626820993814, 0.110889376704583, 0.0108428124792124, 0.0391139493115147, 0.0103106499035455, 0.00751679638129449)), levels = c('Black', 'White', 'Multiple', 'Asian', 'NHPI', 'Hispanic', 'AIAN'), ordered = FALSE), # as a factor, + chi_geo_region = sample(c(NA, 'Seattle', 'South', 'East', 'North'), observations, replace = TRUE, prob = c(0.204549990021952, 0.077230093793654, 0.287367790860108, 0.428124792123994, 0.00272733320029269)), # as a categorical non factor, + wastate = sample(c('Washington State'), observations, replace = TRUE, prob = c(1)), # as a categorical non factor, + `chi_age` = NA, # data type not modelled, + chi_year = sample(c('2021'), observations, replace = TRUE, prob = c(1)), # as a categorical non factor, + race3_hispanic = sample(c(NA, 'Hispanic'), observations, replace = TRUE, prob = c(0.0392469899554314, 0.960753010044569)) # as a categorical non factor + ) + if(exists("returnDT")) { + returnDT <- rbind(returnDT, DTIteration) + } else { + returnDT <- DTIteration + } + } + } else if(dataset == "brfss") { + for(year in years) { + seed <- seed*year + DTIteration <- data.table( + chi_year = sample(c('2023'), observations, replace = TRUE, prob = c(1)), # as a categorical non factor, + age = runif(observations, 18, 99), # continuous with uniform distribution, + age5_v2 = factor(sample(c('25-44', '45-64', '18-24', '75+', '65-74'), observations, replace = TRUE, prob = c(0.0619025944469731, 0.366257017144591, 0.320285237445001, 0.149294492489759, 0.102260658473676)), levels = c('18-24', '25-44', '45-64', '65-74', '75+'), ordered = FALSE), # as a factor, + chi_sex = factor(sample(c('Female', 'Male'), observations, replace = TRUE, prob = c(0.504779244424215, 0.495220755575785)), levels = c('Male', 'Female'), ordered = FALSE), # as a factor, + race3 = factor(sample(c('White', NA, 'Asian', 'Black', 'Multiple', 'NHPI', 'AIAN'), observations, replace = TRUE, prob = c(0.715217721134881, 0.0567440449097254, 0.0115308754362009, 0.135639508420574, 0.00971021089364285, 0.0380822333485055, 0.0330754058564709)), levels = c('White', 'Black', 'AIAN', 'Asian', 'NHPI', 'Multiple', NA), ordered = FALSE), # as a factor, + race4 = factor(sample(c('White', 'Hispanic', 'Asian', 'Multiple', 'Black', 'AIAN', NA, 'NHPI'), observations, replace = TRUE, prob = c(0.00652404794416629, 0.0511303292368381, 0.133667121832802, 0.00804126839629798, 0.0923987255348202, 0.662570171445911, 0.0364132908511607, 0.00925504475800334)), levels = c('AIAN', 'Black', 'Asian', 'NHPI', 'Hispanic', 'White', 'Multiple', NA), ordered = FALSE), # as a factor, + hispanic = sample(c('0', '1', NA), observations, replace = TRUE, prob = c(0.896829009255045, 0.0923987255348202, 0.010772265210135)), # as a categorical non factor, + income6b = factor(sample(c('$50-74,999', NA, '$100,000+', '$20-34,999', '$75-99,999', '<$20,000', '$35-49,999'), observations, replace = TRUE, prob = c(0.0399028978910636, 0.0696404187528448, 0.062661204673039, 0.101046882111971, 0.101805492338037, 0.441662873615536, 0.183280230617509)), levels = c('<$20,000', '$20-34,999', '$35-49,999', '$50-74,999', '$75-99,999', '$100,000+'), ordered = FALSE), # as a factor, + sexorien = factor(sample(c('Something else', 'Straight', 'Lesbian/Gay', 'Bisexual'), observations, replace = TRUE, prob = c(0.887725686542255, 0.0342891822181763, 0.0523441055985435, 0.0256410256410256)), levels = c('Straight', 'Lesbian/Gay', 'Bisexual', 'Something else'), ordered = FALSE), # as a factor, + trnsgndr = sample(c('0', '1'), observations, replace = TRUE, prob = c(0.990593233196783, 0.00940676680321651)), # as a categorical non factor, + veteran3 = sample(c('0', '1', NA), observations, replace = TRUE, prob = c(0.922166590805644, 0.072523137611895, 0.00531027158246093)), # as a categorical non factor, + asthnow = sample(c('0', '1', NA), observations, replace = TRUE, prob = c(0.896222121074192, 0.0951297223486573, 0.00864815657715066)), # as a categorical non factor, + bphigh = sample(c('0', '1', NA), observations, replace = TRUE, prob = c(0.699135184342285, 0.296768320436959, 0.00409649522075558)), # as a categorical non factor, + cholchk5 = sample(c('1', '0', NA), observations, replace = TRUE, prob = c(0.832802306175087, 0.0989227734789865, 0.0682749203459263)), # as a categorical non factor, + x_crcrec = sample(c(NA), observations, replace = TRUE, prob = c(1)), # as a categorical non factor, + x_crcrec2 = sample(c(NA), observations, replace = TRUE, prob = c(1)), # as a categorical non factor, + cvdheart = sample(c('0', NA, '1'), observations, replace = TRUE, prob = c(0.948869670763162, 0.0432407828857533, 0.00788954635108481)), # as a categorical non factor, + cvdstrk3 = sample(c('0', '1', NA), observations, replace = TRUE, prob = c(0.977393415263238, 0.0203307540585647, 0.00227583067819754)), # as a categorical non factor, + denvst1 = sample(c(NA), observations, replace = TRUE, prob = c(1)), # as a categorical non factor, + diab2 = sample(c('0', '1', NA), observations, replace = TRUE, prob = c(0.924594143529055, 0.0731300257927477, 0.00227583067819754)), # as a categorical non factor, + exerany = sample(c('0', '1', NA), observations, replace = TRUE, prob = c(0.870581095433166, 0.128053406159915, 0.00136549840691853)), # as a categorical non factor, + disab2 = sample(c('0', NA, '1'), observations, replace = TRUE, prob = c(0.777120315581854, 0.199969655590957, 0.0229100288271886)), # as a categorical non factor, + ecignow1 = sample(c('3', NA, '2', '1'), observations, replace = TRUE, prob = c(0.0166894249734486, 0.0256410256410256, 0.920194204217873, 0.0374753451676529)), # as a categorical non factor, + firearm4 = sample(c(NA), observations, replace = TRUE, prob = c(1)), # as a categorical non factor, + flushot7 = sample(c('0', NA, '1'), observations, replace = TRUE, prob = c(0.567440449097254, 0.384615384615385, 0.0479441662873616)), # as a categorical non factor, + fnotlast = sample(c('0', NA, '1'), observations, replace = TRUE, prob = c(0.846002124108633, 0.0726748596571082, 0.0813230162342588)), # as a categorical non factor, + sdhfood1 = sample(c('5', NA, '1', '3', '2', '4'), observations, replace = TRUE, prob = c(0.00819299044151115, 0.00955848884842968, 0.0549233803671673, 0.0588681535427098, 0.787133970565923, 0.0813230162342588)), # as a categorical non factor, + genhlth2 = sample(c('0', '1', NA), observations, replace = TRUE, prob = c(0.881201638598088, 0.116977696859354, 0.00182066454255803)), # as a categorical non factor, + mam2yrs = sample(c(NA), observations, replace = TRUE, prob = c(1)), # as a categorical non factor, + medcost1 = sample(c('0', '1', NA), observations, replace = TRUE, prob = c(0.914580488544986, 0.0819299044151115, 0.0034896070399029)), # as a categorical non factor, + x_pastaer = sample(c('0', '1', NA), observations, replace = TRUE, prob = c(0.560309512972235, 0.298892429069944, 0.140798057957821)), # as a categorical non factor, + fmd = sample(c('0', '1', NA), observations, replace = TRUE, prob = c(0.855257168866636, 0.130480958883326, 0.0142618722500379)), # as a categorical non factor, + mjnow = sample(c('0', NA, '1'), observations, replace = TRUE, prob = c(0.767410104688211, 0.144135942952511, 0.0884539523592778)), # as a categorical non factor, + obese = sample(c('0', NA, '1'), observations, replace = TRUE, prob = c(0.69458352298589, 0.207555757851616, 0.0978607191624943)), # as a categorical non factor, + x_bmi5cat = factor(sample(c('Overweight', NA, 'Obese', 'Normal', 'Underweight'), observations, replace = TRUE, prob = c(0.018813533606433, 0.343043544226976, 0.332726445152481, 0.207555757851616, 0.0978607191624943)), levels = c('Underweight', 'Normal', 'Overweight', 'Obese'), ordered = FALSE), # as a factor, + x_veglt1a = sample(c(NA), observations, replace = TRUE, prob = c(1)), # as a categorical non factor, + crvscrnx = sample(c(NA), observations, replace = TRUE, prob = c(1)), # as a categorical non factor, + persdoc3 = sample(c('1', '0', NA), observations, replace = TRUE, prob = c(0.861629494765589, 0.126687907752997, 0.011682597481414)), # as a categorical non factor, + x_pneumo3 = sample(c(NA, '0', '1'), observations, replace = TRUE, prob = c(0.18236989834623, 0.0499165528751328, 0.767713548778638)), # as a categorical non factor, + smoker1 = sample(c('0', NA, '1'), observations, replace = TRUE, prob = c(0.907449552419967, 0.0553785465028069, 0.0371719010772265)), # as a categorical non factor, + finalwt1 = runif(observations, 9.1, 3698.3), # continuous with uniform distribution, + x_ststr = sample(c('2023532011', '2023532012', '2023532082', '2023532021', '2023532022', '2023532031', '2023532032', '2023532131', '2023532121', '2023532161', '2023532151', '2023531141', '2023532061', '2023531142', '2023532091', '2023532112', '2023532081', '2023532071', '2023532042', '2023532122', '2023532051', '2023532072', '2023532062', '2023532101', '2023532102', '2023531271', '2023531231', '2023532052', '2023531241', '2023532111', '2023532092', '2023532041', '2023532141', '2023532132', '2023531161', '2023531301', '2023531211', '2023531242', '2023532142', '2023531202', '2023532019'), observations, replace = TRUE, prob = c(0.0581095433166439, 0.00364132908511607, 0.000151722045213169, 0.000151722045213169, 0.000151722045213169, 0.000151722045213169, 0.000151722045213169, 0.000151722045213169, 0.00121377636170536, 0.000151722045213169, 0.587922925201032, 0.210286754665453, 0.000151722045213169, 0.0136549840691853, 0.00804126839629798, 0.0127446517979062, 0.0101653770292824, 0.00106205431649219, 0.000606888180852678, 0.00136549840691853, 0.000910332271279017, 0.00242755272341071, 0.0019723865877712, 0.00151722045213169, 0.000910332271279017, 0.00182066454255803, 0.00652404794416629, 0.000606888180852678, 0.000758610226065847, 0.00166894249734486, 0.000606888180852678, 0.000455166135639508, 0.000758610226065847, 0.000606888180852678, 0.0019723865877712, 0.00242755272341071, 0.000606888180852678, 0.000606888180852678, 0.000303444090426339, 0.0453648915187377, 0.0171445911090881)), # as a categorical non factor, + hra20_id_1 = runif(observations, 1, 61), # continuous with uniform distribution, + hra20_id_2 = runif(observations, 1, 61), # continuous with uniform distribution, + hra20_id_3 = runif(observations, 1, 61), # continuous with uniform distribution, + hra20_id_4 = runif(observations, 1, 61), # continuous with uniform distribution, + hra20_id_5 = runif(observations, 1, 61), # continuous with uniform distribution, + hra20_id_6 = runif(observations, 1, 61), # continuous with uniform distribution, + hra20_id_7 = runif(observations, 1, 61), # continuous with uniform distribution, + hra20_id_8 = runif(observations, 1, 61), # continuous with uniform distribution, + hra20_id_9 = runif(observations, 1, 61), # continuous with uniform distribution, + hra20_id_10 = runif(observations, 1, 61), # continuous with uniform distribution, + default_wt = runif(observations, 9.1, 3698.3), # continuous with uniform distribution, + `_id` = NA, # data type not modelled, + chi_geo_region = sample(c(NA, 'South', 'East', 'North', 'Seattle'), observations, replace = TRUE, prob = c(0.230010620543165, 0.0650887573964497, 0.330602336519496, 0.284175390684266, 0.0901228948566227)) # as a categorical non factor + ) + if(exists("returnDT")) { + returnDT <- rbind(returnDT, DTIteration) + } else { + returnDT <- DTIteration + } + } + } else if(dataset == "skeleton") { + for(year in years) { + seed <- seed*year + DTIteration <- data.table( + #paste data modelling code here + ) + if(exists("returnDT")) { + returnDT <- rbind(returnDT, DTIteration) + } else { + returnDT <- DTIteration + } + } } return(returnDT) } test_data_generic <- generate_test_data("generic", 100, 1000, c(2016:2023)) - # testHYS <- get_data_hys() - # testBRFSS <- as_table_brfss(get_data_brfss()) - # - # - # inputDT <- testHYS - # - # testDT <- data_modeller(inputDT, number_of_observations = 1000, years = 2020, return_code = FALSE, comments = TRUE) - # testCode <- data_modeller(inputDT, number_of_observations = 1000, years = 2020, return_code = TRUE, comments = TRUE) + test_data_brfss <- generate_test_data("brfss", 100, 1000, c(2016:2023)) + test_data_death <- generate_test_data("death", 100, 1000, c(2016:2023)) + + d1 <- get_data_brfss(cols = c("chi_year", "age", "age5_v2", "chi_sex", "race3", "race4", "hispanic", + "income6b", "sexorien", "trnsgndr", "veteran3", "chi_geo_region", + "asthnow", "bphigh", "cholchk5", "x_crcrec", "x_crcrec2", "cvdheart", "cvdstrk3", "denvst1", "diab2", + "exerany", "disab2", "ecignow1", "firearm4", "flushot7", "fnotlast", "sdhfood1", "genhlth2", + "mam2yrs", "medcost1", "x_pastaer", "fmd", "mjnow", "obese", "x_bmi5cat", "x_veglt1a", + "crvscrnx", "persdoc3", "x_pneumo3", "smoker1"), + year = 2023) + d1 <- rads::as_table_brfss(d1) + +inputDT <- d1 + testCode <- data_modeller(inputDT, number_of_observations = "observations", return_code = TRUE, comments = T) test_analysis_set_twosets <- data.table( #this should work with the generic data set @@ -372,6 +514,8 @@ setup_test_data <- function() { my.analysis_set = test_analysis_set, my.analysis_set_twosets = test_analysis_set_twosets, my.generic_data = test_data_generic, + my.brfss_data = test_data_brfss, + my.death_data = test_data_death, my.estimate= test_estimates, my.estimate_old= test_estimates_old, my.metadata = test_metadata, From 008c7150496235789ad66e76a76ac56c3fe6392b Mon Sep 17 00:00:00 2001 From: Buie Date: Thu, 10 Apr 2025 12:07:02 -0700 Subject: [PATCH 14/21] fixed typing error, implemented test solution for continuous numerics, but probably not working correctly in non integer cases --- tests/testthat/helper.R | 52 +++++++++++++++++++++++++++++++---------- 1 file changed, 40 insertions(+), 12 deletions(-) diff --git a/tests/testthat/helper.R b/tests/testthat/helper.R index 7d03fb8..ec4d78c 100644 --- a/tests/testthat/helper.R +++ b/tests/testthat/helper.R @@ -55,7 +55,10 @@ setup_test_data <- function() { if(inherits(number_of_observations, "character") & return_code == FALSE) { number_of_observations <- as.integer(number_of_observations) if(is.na(number_of_observations)) { - stop("user has requested data, but 'number_of_observations' could not be coerced to an integer. 'number_of_observations' must be an integer to perform calculations.") + stop("user has requested data, but 'number_of_observations' could not be coerced to an integer. 'number_of_observations' must be an integer.") + } + if(!(number_of_observations > 0)) { + stop("number_of_observations must be an integer greater than 0") } } #if(!return_code & comments) { @@ -75,9 +78,7 @@ setup_test_data <- function() { stop("more than 1 column passed. Only pass a vector or one column") } } - #note : ooooooocurrently setting 61 as categorical threshold because of HRAs. - #if no match, report unmatched type instructions <- NA if(is.na(varName)){ @@ -86,8 +87,10 @@ setup_test_data <- function() { variableName <- varName } + oneVariableClass <- class(oneVariable) + #factor - if(is.na(instructions) & class(oneVariable) == "factor") { + if(is.na(instructions) & inherits(oneVariable, "factor")) { orderTF <- is.ordered(oneVariable) detectedLevels <- levels(oneVariable) instructions <- paste0(variableName," = factor(sample(c('",paste0(unlist(unique(oneVariable)),collapse = "', '"),"'), ", number_of_observations,", replace = TRUE, prob = c(",paste0(prop.table(table(oneVariable, useNA = 'ifany')), collapse = ", "),")), levels = c('",paste0(detectedLevels, collapse = "', '"),"'), ordered = ", orderTF,")", collapse = "") @@ -98,8 +101,8 @@ setup_test_data <- function() { } #integer: categorical - if(is.na(instructions) & (class(oneVariable) == "numeric" | class(oneVariable) == "integer") & (length(unique(oneVariable)) <= 61 & length(oneVariable) > 61)) { - instructions <- paste0(variableName," = sample(c('",paste0(unlist(unique(oneVariable)),collapse = "', '"),"'), ", number_of_observations,", replace = TRUE, prob = c(",paste0(prop.table(table(oneVariable, useNA = 'ifany')), collapse = ", "),"))", collapse = "") + if(is.na(instructions) & inherits(oneVariable, "integer") & (length(unique(oneVariable)) <= 61 & length(oneVariable) > 61)) { + instructions <- paste0(variableName," = as.integer(sample(c('",paste0(unlist(unique(oneVariable)),collapse = "', '"),"'), ", number_of_observations,", replace = TRUE, prob = c(",paste0(prop.table(table(oneVariable, useNA = 'ifany')), collapse = ", "),")))", collapse = "") instructions <- gsub("'NA'", "NA", instructions) if(comments){ instructions <- paste0(instructions, " # as a categorical non factor") @@ -107,7 +110,7 @@ setup_test_data <- function() { } #character: categorical - if(is.na(instructions) & class(oneVariable) == "character" & (length(unique(oneVariable)) <= 61 & length(oneVariable) > 61)) { + if(is.na(instructions) & inherits(oneVariable, "character") & (length(unique(oneVariable)) <= 61 & length(oneVariable) > 61)) { instructions <- paste0(variableName," = sample(c('",paste0(unlist(unique(oneVariable)),collapse = "', '"),"'), ", number_of_observations,", replace = TRUE, prob = c(",paste0(prop.table(table(oneVariable, useNA = 'ifany')), collapse = ", "),"))", collapse = "") instructions <- gsub("'NA'", "NA", instructions) if(comments){ @@ -116,12 +119,35 @@ setup_test_data <- function() { } } - #continuous - if(is.na(instructions) & class(oneVariable) == "numeric" & (length(unique(oneVariable)) > 61 & length(oneVariable) > 61)) { + #continuous integer + if(is.na(instructions) & inherits(oneVariable, "integer") & (length(unique(oneVariable)) > 61 & length(oneVariable) > 61)) { + #uniform distribution + instructions <- paste0(variableName, " = as.integer(runif(", number_of_observations,", ", min(oneVariable, na.rm = TRUE), ", ", max(oneVariable, na.rm = TRUE),"))") + if(comments){ + instructions <- paste0(instructions, " # continuous integer with uniform distribution") + } + } + + #continuous double + if(is.na(instructions) & inherits(oneVariable, "double") & (length(unique(oneVariable)) > 61 & length(oneVariable) > 61)) { + count_decimal_places <- function(x) { + if (!is.numeric(x)) return(NA) + sapply(x, function(num) { + if (is.na(num)) return(NA) + str_num <- as.character(num) + if (grepl("\\.", str_num)) { + return(nchar(strsplit(str_num, "\\.")[[1]][2])) + } else { + return(0) + } + }) + } + oneVariable[,RH := count_decimal_places(oneVariable[[1]])] + numberOfDecimals <- max(oneVariable$RH, na.rm = T) #uniform distribution - instructions <- paste0(variableName, " = runif(", number_of_observations,", ", min(oneVariable, na.rm = TRUE), ", ", max(oneVariable, na.rm = TRUE),")") + instructions <- paste0(variableName, " = as.double(round(runif(", number_of_observations,", ", min(oneVariable, na.rm = TRUE), ", ", max(oneVariable, na.rm = TRUE),"),", numberOfDecimals , "))") if(comments){ - instructions <- paste0(instructions, " # continuous with uniform distribution") + instructions <- paste0(instructions, " # continuous double with uniform distribution") } } @@ -174,9 +200,11 @@ setup_test_data <- function() { } } +rads::list_dataset_columns("birth") ph.data <- get_data_birth(cols = c('race4', 'chi_age', 'hra20_name', 'sex', 'birth_weight_grams')) - + ph.data <- get_data_birth(cols = c('race4', 'chi_age', 'hra20_name', 'sex', 'birth_weight_grams', "time_of_birth", "mother_birthplace_country" )) + #ph.data <- get_data_birth() todo <- data_modeller(ph.data = ph.data, number_of_observations = 10000, return_code = F, comments = TRUE) tada <- data_modeller(ph.data = ph.data, number_of_observations = 10000, return_code = T, comments = TRUE) From 4b1feb4fc624c0431402d37274dc419ed6ba2aef Mon Sep 17 00:00:00 2001 From: Buie Date: Thu, 10 Apr 2025 12:09:18 -0700 Subject: [PATCH 15/21] clean test code --- tests/testthat/helper.R | 53 ++++++++++++++++------------------------- 1 file changed, 20 insertions(+), 33 deletions(-) diff --git a/tests/testthat/helper.R b/tests/testthat/helper.R index ec4d78c..85db8fe 100644 --- a/tests/testthat/helper.R +++ b/tests/testthat/helper.R @@ -200,27 +200,27 @@ setup_test_data <- function() { } } -rads::list_dataset_columns("birth") - - ph.data <- get_data_birth(cols = c('race4', 'chi_age', 'hra20_name', 'sex', 'birth_weight_grams')) - ph.data <- get_data_birth(cols = c('race4', 'chi_age', 'hra20_name', 'sex', 'birth_weight_grams', "time_of_birth", "mother_birthplace_country" )) - #ph.data <- get_data_birth() - todo <- data_modeller(ph.data = ph.data, number_of_observations = 10000, return_code = F, comments = TRUE) - - tada <- data_modeller(ph.data = ph.data, number_of_observations = 10000, return_code = T, comments = TRUE) - - codeListParsed <- c(list("DT <- data.table("),gsub(" #", ", #", tada[1:(length(tada)-1)]), gsub(" #",") #",tada[length(tada)])) - - codeText <- paste(unlist(codeListParsed), collapse =" \n" ) - - tada <- eval( parse(text = paste0(codeText))) - - - - str(ph.data) - str(todo) - str(tada) +# +# ph.data <- get_data_birth(cols = c('race4', 'chi_age', 'hra20_name', 'sex', 'birth_weight_grams')) +# ph.data <- get_data_birth(cols = c('race4', 'chi_age', 'hra20_name', 'sex', 'birth_weight_grams', "time_of_birth", "mother_birthplace_country" )) +# #ph.data <- get_data_birth() +# todo <- data_modeller(ph.data = ph.data, number_of_observations = 10000, return_code = F, comments = TRUE) +# +# tada <- data_modeller(ph.data = ph.data, number_of_observations = 10000, return_code = T, comments = TRUE) +# +# codeListParsed <- c(list("DT <- data.table("),gsub(" #", ", #", tada[1:(length(tada)-1)]), gsub(" #",") #",tada[length(tada)])) +# +# codeText <- paste(unlist(codeListParsed), collapse =" \n" ) +# +# tada <- eval( parse(text = paste0(codeText))) +# +# +# +# str(ph.data) +# str(todo) +# str(tada) +# ################################ end migrate this out ########################################################### @@ -374,19 +374,6 @@ rads::list_dataset_columns("birth") test_data_brfss <- generate_test_data("brfss", 100, 1000, c(2016:2023)) test_data_death <- generate_test_data("death", 100, 1000, c(2016:2023)) - d1 <- get_data_brfss(cols = c("chi_year", "age", "age5_v2", "chi_sex", "race3", "race4", "hispanic", - "income6b", "sexorien", "trnsgndr", "veteran3", "chi_geo_region", - "asthnow", "bphigh", "cholchk5", "x_crcrec", "x_crcrec2", "cvdheart", "cvdstrk3", "denvst1", "diab2", - "exerany", "disab2", "ecignow1", "firearm4", "flushot7", "fnotlast", "sdhfood1", "genhlth2", - "mam2yrs", "medcost1", "x_pastaer", "fmd", "mjnow", "obese", "x_bmi5cat", "x_veglt1a", - "crvscrnx", "persdoc3", "x_pneumo3", "smoker1"), - year = 2023) - d1 <- rads::as_table_brfss(d1) - -inputDT <- d1 - - testCode <- data_modeller(inputDT, number_of_observations = "observations", return_code = TRUE, comments = T) - test_analysis_set_twosets <- data.table( #this should work with the generic data set cat1 = rep(c('Regions', 'Gender', 'Race/ethnicity'),2), From 781a21eeb7e41ac927646e329b45cae4c6275ca0 Mon Sep 17 00:00:00 2001 From: Buie Date: Thu, 10 Apr 2025 17:31:04 -0700 Subject: [PATCH 16/21] stable with generic analysis set and data, and modelled death and brfss data --- tests/testthat/helper.R | 98 ++++++++----------- .../testthat/test-chi_generate_analysis_set.R | 10 ++ 2 files changed, 49 insertions(+), 59 deletions(-) diff --git a/tests/testthat/helper.R b/tests/testthat/helper.R index 85db8fe..c381281 100644 --- a/tests/testthat/helper.R +++ b/tests/testthat/helper.R @@ -246,7 +246,7 @@ setup_test_data <- function() { DTIteration <- data.table( id = 1:observations, chi_geo_kc = sample(c(0,1), observations, replace = T), - chi_race_4 = factor(sample(c("Asian", "AIAN", "Black", "Hispanic", "NHPI", "White", "Other", "Multiple", NA), observations, replace = T, prob = c(.19,.01,.07,.11,.01,.35,.07,.14,.02)), levels = c("Asian", "AIAN", "Black", "Hispanic", "NHPI", "White", "Other", "Multiple", NA)), + chi_race_7 = factor(sample(c("Asian", "AIAN", "Black", "Hispanic", "NHPI", "White", "Other", "Multiple", NA), observations, replace = T, prob = c(.19,.01,.07,.11,.01,.35,.07,.14,.02)), levels = c("Asian", "AIAN", "Black", "Hispanic", "NHPI", "White", "Other", "Multiple", NA)), chi_sex = as.factor(sample(c("Male","Female"), observations, replace = T)), chi_geo_region = factor(sample(c("South", "North", "Seattle", "East"), observations, replace = T), levels = c("South","North","Seattle","East")), indicator1 = as.factor(sample(c("never","sometimes", "always", NA), observations, replace = T)), @@ -377,45 +377,51 @@ setup_test_data <- function() { test_analysis_set_twosets <- data.table( #this should work with the generic data set cat1 = rep(c('Regions', 'Gender', 'Race/ethnicity'),2), - cat1_varname = rep(c('chi_geo_region', 'chi_sex', 'race4'),2), + cat1_varname = rep(c('chi_geo_region', 'chi_sex', 'chi_race_7'),2), `_kingcounty` = c('x'), - `_wastate` = rep(c(rep(NA_character_,2),"x"),2), - demgroups = rep(c(rep(NA_character_,2),"x"),2), - crosstabs = rep(c(rep(NA_character_,2),"x"),2), - trends = rep(c(rep(NA_character_,2),"x"),2), + `_wastate` = NA_character_, + demgroups = c(rep(NA_character_,3),rep("x", 3)), + crosstabs = c(rep(NA_character_,3),rep("x", 3)), + trends = c(rep(NA_character_,3),rep("x", 3)), set = c(rep(1,3), rep(2,3)), set_indicator_keys = c(rep(c('indicator1, indicator2'),3), rep("indicator3",3)) ) - test_analysis_set_twosets_estimates <- data.table( - for(indicator in c("indicator1","indicator2")) { - partialDT <- data.table( - tab = c(rep('demgroups', 4), '_kingcounty'), - year = c('2023'), - cat1 = c('Region', 'Region', 'Region', 'Region', 'King County'), - cat1_group = c("East", "North", "Seattle", "South", 'King County'), - cat1_varname = c('chi_geo_region', 'chi_geo_region', 'chi_geo_region', 'chi_geo_region', 'chi_geo_kc'), - cat2 = NA_character_, - cat2_group = NA_character_, - cat2_varname = NA_character_, - data_source = 'JustTesting', - caution = NA_character_, - suppression = NA_character_, - chi = 1, - source_date = Sys.Date(), - run_date = Sys.Date(), - numerator = c(111, 175, 210, 600, 430000), - denominator = c(1000, 1500, 2000, 2500, 2200000) - ) - } + # create twoset analysis set + #remove("test_twoset_estimates") + for(indicator in c("indicator1","indicator2")) { + partialDT <- data.table( + indicator = indicator, + tab = c(rep('demgroups', 4), '_kingcounty'), + year = c('2023'), + cat1 = c('Region', 'Region', 'Region', 'Region', 'King County'), + cat1_group = c("East", "North", "Seattle", "South", 'King County'), + cat1_varname = c('chi_geo_region', 'chi_geo_region', 'chi_geo_region', 'chi_geo_region', 'chi_geo_kc'), + cat2 = NA_character_, + cat2_group = NA_character_, + cat2_varname = NA_character_, + data_source = 'JustTesting', + caution = NA_character_, + suppression = NA_character_, + chi = 1, + source_date = Sys.Date(), + run_date = Sys.Date(), + numerator = c(111, 175, 210, 600, 430000), + denominator = c(1000, 1500, 2000, 2500, 2200000) + ) + if(exists("test_twoset_estimates")) { + test_twoset_estimates <- rbind(test_twoset_estimates, partialDT) + } else { + test_twoset_estimates <- partialDT + } + } + test_twoset_estimates[, result := numerator / denominator] + test_twoset_estimates[, se := sqrt((result * (1-result)) / denominator)] + test_twoset_estimates[, rse := 100 * se / result] + test_twoset_estimates[, lower_bound := result - 1.96 * se] + test_twoset_estimates[, upper_bound := result + 1.96 * se] - ) - test_estimates[, result := numerator / denominator] - test_estimates[, se := sqrt((result * (1-result)) / denominator)] - test_estimates[, rse := 100 * se / result] - test_estimates[, lower_bound := result - 1.96 * se] - test_estimates[, upper_bound := result + 1.96 * se] # Sample instructions ---- test_instructions <- data.table( @@ -455,32 +461,6 @@ setup_test_data <- function() { test_estimates[, lower_bound := result - 1.96 * se] test_estimates[, upper_bound := result + 1.96 * se] - test_estimates_twosets <- data.table( - indicator_key = c("indicator1"), - tab = c(rep('demgroups', 4), '_kingcounty'), - year = c('2023'), - cat1 = c('Region', 'Region', 'Region', 'Region', 'King County'), - cat1_group = c("East", "North", "Seattle", "South", 'King County'), - cat1_varname = c('chi_geo_region', 'chi_geo_region', 'chi_geo_region', 'chi_geo_region', 'chi_geo_kc'), - cat2 = NA_character_, - cat2_group = NA_character_, - cat2_varname = NA_character_, - data_source = 'JustTesting', - caution = NA_character_, - suppression = NA_character_, - chi = 1, - source_date = Sys.Date(), - run_date = Sys.Date(), - numerator = c(111, 175, 210, 600, 430000), - denominator = c(1000, 1500, 2000, 2500, 2200000) - ) - test_estimates_twosets[, result := numerator / denominator] - test_estimates_twosets[, se := sqrt((result * (1-result)) / denominator)] - test_estimates_twosets[, rse := 100 * se / result] - test_estimates_twosets[, lower_bound := result - 1.96 * se] - test_estimates_twosets[, upper_bound := result + 1.96 * se] - - test_estimates_old <- data.table( indicator_key = c("indicatorX"), diff --git a/tests/testthat/test-chi_generate_analysis_set.R b/tests/testthat/test-chi_generate_analysis_set.R index c5a0e95..b4bf3b1 100644 --- a/tests/testthat/test-chi_generate_analysis_set.R +++ b/tests/testthat/test-chi_generate_analysis_set.R @@ -4,3 +4,13 @@ test_that("chi_generate_analysis_set validates inputs", { expect_error(chi_generate_analysis_set(data_source = 123), "data_source must be a single character string") expect_error(chi_generate_analysis_set(CHIestimates = 123), "CHIestimates must be a data.table or data.frame") }) + +test_that("chi_generate_analysis_set generates expected output", { + TestData <- setup_test_data() + TestData$my.generic_data + TestData$my.analysis_set_twosets + test_analysis_instructions_results <- apde.chi.tools::chi_generate_tro_shell(TestData$my.analysis_set_twosets, end.year = 2023, year.span = 3, trend.span = 3, trend.periods = 5) + test_chi_calc_output <- apde.chi.tools::chi_calc(ph.data = TestData$my.generic_data, ph.instructions = test_analysis_instructions_results, source_name = "test",source_date = as.Date("2025-05-10"), ci = .95) + + +}) From 8a67eed34e7dd18ba00464faad4d7cb723b3a641 Mon Sep 17 00:00:00 2001 From: Buie Date: Fri, 11 Apr 2025 15:24:27 -0700 Subject: [PATCH 17/21] added test for CHI generate estimates --- tests/testthat/test-chi_generate_analysis_set.R | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/tests/testthat/test-chi_generate_analysis_set.R b/tests/testthat/test-chi_generate_analysis_set.R index b4bf3b1..0730f69 100644 --- a/tests/testthat/test-chi_generate_analysis_set.R +++ b/tests/testthat/test-chi_generate_analysis_set.R @@ -5,12 +5,10 @@ test_that("chi_generate_analysis_set validates inputs", { expect_error(chi_generate_analysis_set(CHIestimates = 123), "CHIestimates must be a data.table or data.frame") }) -test_that("chi_generate_analysis_set generates expected output", { +test_that("chi_generate_analysis_set generates expected analysis set from CHI estimates", { TestData <- setup_test_data() - TestData$my.generic_data - TestData$my.analysis_set_twosets - test_analysis_instructions_results <- apde.chi.tools::chi_generate_tro_shell(TestData$my.analysis_set_twosets, end.year = 2023, year.span = 3, trend.span = 3, trend.periods = 5) - test_chi_calc_output <- apde.chi.tools::chi_calc(ph.data = TestData$my.generic_data, ph.instructions = test_analysis_instructions_results, source_name = "test",source_date = as.Date("2025-05-10"), ci = .95) - - + DT_test_analysis_instructions_results <- apde.chi.tools::chi_generate_tro_shell(TestData$my.analysis_set_twosets, end.year = 2023, year.span = 3, trend.span = 3, trend.periods = 5) + DT_test_chi_calc_output <- apde.chi.tools::chi_calc(ph.data = TestData$my.generic_data, ph.instructions = DT_test_analysis_instructions_results, source_name = "test",source_date = as.Date("2025-05-10"), ci = .95) + DT_recreated_analysis_set <- chi_generate_analysis_set(CHIestimates = DT_test_chi_calc_output) + expect_equal(all.equal(TestData$my.analysis_set_twosets[order(cat1,set)] , DT_recreated_analysis_set[order(cat1,set)]),TRUE) }) From 575c50bde3c80ae454b0513a5ca272e8b0ad340a Mon Sep 17 00:00:00 2001 From: Buie Date: Fri, 11 Apr 2025 15:25:05 -0700 Subject: [PATCH 18/21] fixed chi_geo_kc levels for generic data generated by setup_test_data() --- tests/testthat/helper.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/testthat/helper.R b/tests/testthat/helper.R index c381281..fa0b1dd 100644 --- a/tests/testthat/helper.R +++ b/tests/testthat/helper.R @@ -245,7 +245,7 @@ setup_test_data <- function() { seed <- seed*year DTIteration <- data.table( id = 1:observations, - chi_geo_kc = sample(c(0,1), observations, replace = T), + chi_geo_kc = sample(c('King County',NA_character_), observations, replace = T), chi_race_7 = factor(sample(c("Asian", "AIAN", "Black", "Hispanic", "NHPI", "White", "Other", "Multiple", NA), observations, replace = T, prob = c(.19,.01,.07,.11,.01,.35,.07,.14,.02)), levels = c("Asian", "AIAN", "Black", "Hispanic", "NHPI", "White", "Other", "Multiple", NA)), chi_sex = as.factor(sample(c("Male","Female"), observations, replace = T)), chi_geo_region = factor(sample(c("South", "North", "Seattle", "East"), observations, replace = T), levels = c("South","North","Seattle","East")), From fadba24bb6beefbac311174417b655be65f9ef79 Mon Sep 17 00:00:00 2001 From: Buie Date: Fri, 11 Apr 2025 17:32:11 -0700 Subject: [PATCH 19/21] encapsulate connection test --- tests/testthat/helper.R | 70 +++++++++++++++++++++++++++- tests/testthat/test-chi_sql_update.R | 26 ++++++----- 2 files changed, 83 insertions(+), 13 deletions(-) diff --git a/tests/testthat/helper.R b/tests/testthat/helper.R index fa0b1dd..697b3ed 100644 --- a/tests/testthat/helper.R +++ b/tests/testthat/helper.R @@ -437,7 +437,7 @@ setup_test_data <- function() { # Sample estimates ---- test_estimates <- data.table( - indicator_key = c("indicator1"), + indicator_key = c("indicatorX"), tab = c(rep('demgroups', 4), '_kingcounty'), year = c('2023'), cat1 = c('Region', 'Region', 'Region', 'Region', 'King County'), @@ -504,6 +504,71 @@ setup_test_data <- function() { run_date = Sys.Date() ) + validate_hhsaw_connection <- function(hhsaw_key = 'hhsaw'){ + # Key should be a character string that can be used to generate a database connection + # Also have to allow for the option of interactive authentication + # TODO: Allow hhsaw_key to be a database connection itself + is.db = function(x){ + r = try(dbIsValid(hhsaw_key)) + if(inherits(r, 'try-error')){ + r = FALSE + } + r + } + status <- 0 + closeserver = TRUE + if(is.character(hhsaw_key)){ + server <- grepl('server', tolower(Sys.info()['release'])) + trykey <- try(keyring::key_get(hhsaw_key, keyring::key_list(hhsaw_key)[['username']]), silent = T) + if (inherits(trykey, "try-error")) warning(paste0("Your hhsaw keyring is not properly configured or you are not connected to the VPN. \n", + "Please check your VPN connection and or set your keyring and run the function again. \n", + paste0("e.g., keyring::key_set('hhsaw', username = 'ALastname@kingcounty.gov') \n"), + "When prompted, be sure to enter the same password that you use to log into to your laptop. \n", + "If you already have an hhsaw key on your keyring with a different name, you can specify it with the 'mykey = ...' or 'hhsaw_key = ...' argument \n")) + rm(trykey) + + if(server == FALSE){ + con <- try(con <- DBI::dbConnect(odbc::odbc(), + driver = getOption('rads.odbc_version'), + server = 'kcitazrhpasqlprp16.azds.kingcounty.gov', + database = 'hhs_analytics_workspace', + uid = keyring::key_list(hhsaw_key)[["username"]], + pwd = keyring::key_get(hhsaw_key, keyring::key_list(hhsaw_key)[["username"]]), + Encrypt = 'yes', + TrustServerCertificate = 'yes', + Authentication = 'ActiveDirectoryPassword'), silent = T) + if (inherits(con, "try-error")) warning(paste("Either your computer is not connected to KC systems (e.g. VPN is not connected), your hhsaw key is not properly configured, and/or your key value is outdated.", + "To (re)set your hhsaw key use keyring::key_set('", hhsaw_key, "', username = 'ALastname@kingcounty.gov')"), + "When prompted, be sure to enter the same password that you use to log into to your laptop.") + }else{ + message(paste0('Please enter the password you use for your laptop into the pop-up window. \n', + 'Note that the pop-up may be behind your Rstudio session. \n', + 'You will need to use your two factor authentication app to confirm your KC identity.')) + con <- DBI::dbConnect(odbc::odbc(), + driver = getOption('rads.odbc_version'), + server = "kcitazrhpasqlprp16.azds.kingcounty.gov", + database = "hhs_analytics_workspace", + uid = keyring::key_list(hhsaw_key)[["username"]], + Encrypt = "yes", + TrustServerCertificate = "yes", + Authentication = "ActiveDirectoryInteractive") + status <- 1 + } + + # on.exit(DBI::dbDisconnect(con)) + + }else if(is.db(hhsaw_key)){ + closeserver = FALSE + con = hhsaw_key + status <- 1 + }else{ + warning('`hhsaw_key` is not a reference to database connection or keyring') + } + + return(status) + + } + # Return ---- list(my.analytic = test_analytic, my.analysis_set = test_analysis_set, @@ -514,5 +579,6 @@ setup_test_data <- function() { my.estimate= test_estimates, my.estimate_old= test_estimates_old, my.metadata = test_metadata, - my.instructions = test_instructions) + my.instructions = test_instructions, + my.hhsaw_status_test = validate_hhsaw_connection) } diff --git a/tests/testthat/test-chi_sql_update.R b/tests/testthat/test-chi_sql_update.R index 3962127..a4db66b 100644 --- a/tests/testthat/test-chi_sql_update.R +++ b/tests/testthat/test-chi_sql_update.R @@ -2,17 +2,21 @@ test_that("chi_update_sql validates inputs", { test_data <- setup_test_data() - expect_warning( - chi_update_sql( - CHIestimates = test_data$my.estimate, - CHImetadata = test_data$my.metadata, - table_name = 'JustTesting', - server = 'development', - replace_table = FALSE - ), - "Validation may be flawed for the following variables because they are 100% missing" - ) - + con_status <- test_data$my.hhsaw_status_test() + if(con_status == 1) { + expect_warning( + chi_update_sql( + CHIestimates = test_data$my.estimate, + CHImetadata = test_data$my.metadata, + table_name = 'JustTesting', + server = 'development', + replace_table = FALSE + ), + "Validation may be flawed for the following variables because they are 100% missing" + ) + } else { + message("no connection to hhsaw available. skipping test of tsql validation") + } expect_error(chi_update_sql(), "The results table to push to SQL \\(CHIestimates\\) is missing") expect_error(suppressWarnings(chi_update_sql(CHIestimates = test_data$my.estimate)), From ff20a99101f2bcd5a79d26efaf9198ad65e02457 Mon Sep 17 00:00:00 2001 From: Buie Date: Fri, 11 Apr 2025 17:46:37 -0700 Subject: [PATCH 20/21] suppressed warnings where unimportant --- tests/testthat/test-chi_generate_analysis_set.R | 4 ++-- tests/testthat/test-chi_sql_update.R | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/testthat/test-chi_generate_analysis_set.R b/tests/testthat/test-chi_generate_analysis_set.R index 0730f69..6b41036 100644 --- a/tests/testthat/test-chi_generate_analysis_set.R +++ b/tests/testthat/test-chi_generate_analysis_set.R @@ -8,7 +8,7 @@ test_that("chi_generate_analysis_set validates inputs", { test_that("chi_generate_analysis_set generates expected analysis set from CHI estimates", { TestData <- setup_test_data() DT_test_analysis_instructions_results <- apde.chi.tools::chi_generate_tro_shell(TestData$my.analysis_set_twosets, end.year = 2023, year.span = 3, trend.span = 3, trend.periods = 5) - DT_test_chi_calc_output <- apde.chi.tools::chi_calc(ph.data = TestData$my.generic_data, ph.instructions = DT_test_analysis_instructions_results, source_name = "test",source_date = as.Date("2025-05-10"), ci = .95) - DT_recreated_analysis_set <- chi_generate_analysis_set(CHIestimates = DT_test_chi_calc_output) + DT_test_chi_calc_output <- suppressWarnings(apde.chi.tools::chi_calc(ph.data = TestData$my.generic_data, ph.instructions = DT_test_analysis_instructions_results, source_name = "test",source_date = as.Date("2025-05-10"), ci = .95)) + DT_recreated_analysis_set <- suppressWarnings(chi_generate_analysis_set(CHIestimates = DT_test_chi_calc_output)) expect_equal(all.equal(TestData$my.analysis_set_twosets[order(cat1,set)] , DT_recreated_analysis_set[order(cat1,set)]),TRUE) }) diff --git a/tests/testthat/test-chi_sql_update.R b/tests/testthat/test-chi_sql_update.R index a4db66b..5d0ab50 100644 --- a/tests/testthat/test-chi_sql_update.R +++ b/tests/testthat/test-chi_sql_update.R @@ -15,7 +15,7 @@ test_that("chi_update_sql validates inputs", { "Validation may be flawed for the following variables because they are 100% missing" ) } else { - message("no connection to hhsaw available. skipping test of tsql validation") + message("connection test skipped") } expect_error(chi_update_sql(), "The results table to push to SQL \\(CHIestimates\\) is missing") From 7121e94d8ab8f434c00014f244ec95daa90ccefd Mon Sep 17 00:00:00 2001 From: Buie Date: Sat, 12 Apr 2025 10:01:01 -0700 Subject: [PATCH 21/21] unresolved: error when trtying to test chi_generate_metadata. Something wrong with how data tables (names not properly exporteD) attempted providing specific dates to data exported from helper.R chenged data source name in chi_generate_analysis_set set to "generic_test" to match test set used. --- tests/testthat/helper.R | 61 +++++++++++++++---- .../testthat/test-chi_generate_analysis_set.R | 2 +- tests/testthat/test_chi_generate_metadata.R | 6 ++ 3 files changed, 57 insertions(+), 12 deletions(-) diff --git a/tests/testthat/helper.R b/tests/testthat/helper.R index 697b3ed..dce7e89 100644 --- a/tests/testthat/helper.R +++ b/tests/testthat/helper.R @@ -370,9 +370,9 @@ setup_test_data <- function() { return(returnDT) } - test_data_generic <- generate_test_data("generic", 100, 1000, c(2016:2023)) - test_data_brfss <- generate_test_data("brfss", 100, 1000, c(2016:2023)) - test_data_death <- generate_test_data("death", 100, 1000, c(2016:2023)) + test_data_generic <- generate_test_data("generic", 10000, 1000, c(2016:2023)) + test_data_brfss <- generate_test_data("brfss", 10000, 1000, c(2016:2023)) + test_data_death <- generate_test_data("death", 10000, 1000, c(2016:2023)) test_analysis_set_twosets <- data.table( #this should work with the generic data set @@ -388,6 +388,7 @@ setup_test_data <- function() { ) # create twoset analysis set + #not currently exported, may not be needed #remove("test_twoset_estimates") for(indicator in c("indicator1","indicator2")) { partialDT <- data.table( @@ -404,17 +405,38 @@ setup_test_data <- function() { caution = NA_character_, suppression = NA_character_, chi = 1, - source_date = Sys.Date(), - run_date = Sys.Date(), + source_date = as.Date("2025-01-01"), + run_date = as.Date("2025-01-01"), numerator = c(111, 175, 210, 600, 430000), denominator = c(1000, 1500, 2000, 2500, 2200000) ) if(exists("test_twoset_estimates")) { test_twoset_estimates <- rbind(test_twoset_estimates, partialDT) - } else { - test_twoset_estimates <- partialDT - } + } else { + test_twoset_estimates <- partialDT + } } + partialDT <- data.table( + indicator = "indicator3", + tab = c(rep('demgroups', 4), '_kingcounty'), + year = c('2023'), + cat1 = c('Region', 'Region', 'Region', 'Region', 'King County'), + cat1_group = c("East", "North", "Seattle", "South", 'King County'), + cat1_varname = c('chi_geo_region', 'chi_geo_region', 'chi_geo_region', 'chi_geo_region', 'chi_geo_kc'), + cat2 = NA_character_, + cat2_group = NA_character_, + cat2_varname = NA_character_, + data_source = 'JustTesting', + caution = NA_character_, + suppression = NA_character_, + chi = 1, + source_date = as.Date("2025-01-01"), + run_date = as.Date("2025-01-01"), + numerator = c(111, 175, 210, 600, 430000), + denominator = c(1000, 1500, 2000, 2500, 2200000) + ) + test_twoset_estimates <- rbind(test_twoset_estimates, partialDT) + test_twoset_estimates[, result := numerator / denominator] test_twoset_estimates[, se := sqrt((result * (1-result)) / denominator)] test_twoset_estimates[, rse := 100 * se / result] @@ -422,6 +444,23 @@ setup_test_data <- function() { test_twoset_estimates[, upper_bound := result + 1.96 * se] + #twoset metadata should work with with the "generic" dataset + test_twoset_metadata <- data.table( + indicator_key = c("indicator1", "indicator2","indicator3"), + result_type = c("proportion"), + valid_years = c("2020 2021 2022 2022"), + latest_year = c(2022), + data_source = 'test', + valence = 'positive', + latest_year_result = 0.20, + latest_year_kc_pop = 2300000, + latest_year_count = 460000, + map_type = 'hra', + unit = 'person', + chi = 1, + run_date = as.Date("2025-01-01") + ) + # Sample instructions ---- test_instructions <- data.table( @@ -450,8 +489,8 @@ setup_test_data <- function() { caution = NA_character_, suppression = NA_character_, chi = 1, - source_date = Sys.Date(), - run_date = Sys.Date(), + source_date = as.Date("2025-01-01"), + run_date = as.Date("2025-01-01"), numerator = c(111, 175, 210, 600, 430000), denominator = c(1000, 1500, 2000, 2500, 2200000) ) @@ -501,7 +540,7 @@ setup_test_data <- function() { map_type = 'hra', unit = 'person', chi = 1, - run_date = Sys.Date() + run_date = as.Date("2024-01-01") ) validate_hhsaw_connection <- function(hhsaw_key = 'hhsaw'){ diff --git a/tests/testthat/test-chi_generate_analysis_set.R b/tests/testthat/test-chi_generate_analysis_set.R index 6b41036..279bb31 100644 --- a/tests/testthat/test-chi_generate_analysis_set.R +++ b/tests/testthat/test-chi_generate_analysis_set.R @@ -8,7 +8,7 @@ test_that("chi_generate_analysis_set validates inputs", { test_that("chi_generate_analysis_set generates expected analysis set from CHI estimates", { TestData <- setup_test_data() DT_test_analysis_instructions_results <- apde.chi.tools::chi_generate_tro_shell(TestData$my.analysis_set_twosets, end.year = 2023, year.span = 3, trend.span = 3, trend.periods = 5) - DT_test_chi_calc_output <- suppressWarnings(apde.chi.tools::chi_calc(ph.data = TestData$my.generic_data, ph.instructions = DT_test_analysis_instructions_results, source_name = "test",source_date = as.Date("2025-05-10"), ci = .95)) + DT_test_chi_calc_output <- suppressWarnings(apde.chi.tools::chi_calc(ph.data = TestData$my.generic_data, ph.instructions = DT_test_analysis_instructions_results, source_name = "generic_test", source_date = as.Date("2025-04-10"), ci = .80)) DT_recreated_analysis_set <- suppressWarnings(chi_generate_analysis_set(CHIestimates = DT_test_chi_calc_output)) expect_equal(all.equal(TestData$my.analysis_set_twosets[order(cat1,set)] , DT_recreated_analysis_set[order(cat1,set)]),TRUE) }) diff --git a/tests/testthat/test_chi_generate_metadata.R b/tests/testthat/test_chi_generate_metadata.R index 41663c1..9f050ce 100644 --- a/tests/testthat/test_chi_generate_metadata.R +++ b/tests/testthat/test_chi_generate_metadata.R @@ -4,4 +4,10 @@ test_that("chi_generate_metadata handles valid inputs", { expect_error(chi_generate_metadata(), "meta.old must be provided") expect_error(chi_generate_metadata(meta.old = test_data$my.metadata), "est.current must be provided") + + # why does this test fail? the DTs are not properly constructed, perhaps I need to update packages? will try later + #DTtest <- test_data$my.estimate + #DTtest[tab,] # throws error + #chi_generate_metadata(meta.old = test_data$my.metadata, est.current = test_data$my.estimate) + })