diff --git a/.gitignore b/.gitignore index d0dd721..acbb8ee 100644 --- a/.gitignore +++ b/.gitignore @@ -5,6 +5,7 @@ .Rproj.user .vscode/ *.bkp +*.log check/ docs/ FastRet.Rproj diff --git a/DESCRIPTION b/DESCRIPTION index 36f5df4..a0aa549 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: FastRet Title: Retention Time Prediction in Liquid Chromatography -Version: 1.3.0 +Version: 1.3.5 Authors@R: c( person(given = "Tobias", family = "Schmidt", role = c("aut", "cre", "cph"), email = "tobias.schmidt331@gmail.com", comment = c(ORCID = "0000-0001-9681-9253")), person(given = "Christian", family = "Amesoeder", role = c("aut", "cph"), email = "christian-amesoeder@web.de", comment = c(ORCID = "0000-0002-1668-8351")), diff --git a/NEWS.md b/NEWS.md index 69f70ef..59f4412 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,4 +1,61 @@ +# FastRet 1.3.5 + +Bugfix: + +1. `predict.frm()` now imputes missing/non-finite values not only for base model + predictors, but also for adjustment model predictors. This prevents `NA` + predictions if the adjustment model depends on predictors that are + missing/non-finite in the new data. + +Internal Improvements: + +1. Consolidated the imputation logic in `predict.frm()` into a shared internal + helper. + +# FastRet 1.3.4 + +Internal Improvements: + +1. `start_gui()` and `start_gui_in_devmode()` now rely on + `with(future::plan(...), local = TRUE)` so that temporary future plans are + restored automatically without manual bookkeeping. + +# FastRet 1.3.3 + +API Improvements: + +1. `selective_measuring()` accepts `"max"` and `"inf"` as additional values + for its `rt_coef` parameter: + - `"max"` is an alias for the existing `"max_ridge_coef"` + - `"inf"` sets all chemical descriptor features to zero before clustering so that + RT alone drives the distance metric (i.e., it is "infinitely" more important + than the chemical descriptors). + +# FastRet 1.3.2 + +Bugfix: + +1. 
`adjust_frm()` automatically switched to "lm" adjustment if `predictors` + contained only one predictor, regardless of whether `add_cds` was TRUE or + FALSE. This has now been fixed, so that "lasso", "ridge" and "gbtree" + adjustment is now possible if either `add_cds` is TRUE, `add_cds` is + `NULL` or `predictors` contains more than one predictor. + +# FastRet 1.3.1 + +API Improvements: + +1. Added arguments `match_rts` and `match_keys` to `adjust_frm()`: + - If `match_rts=TRUE` (default), RTs are obtained by matching rows in + `new_data` to rows in `frm$df` based on `match_keys`. + - If `match_rts=FALSE`, RTs are obtained by applying the base model to + `new_data`. + - `match_keys` can be any combination of "INCHIKEY", "SMILES" and "NAME". If + left at default NULL, SMILES+INCHIKEY is used if both columns are present + in the adjusted and the original training data. Otherwise, SMILES+NAME is + used. + # FastRet 1.3.0 API Improvements: diff --git a/R/app.R b/R/app.R index 5a2ab22..87cab5c 100644 --- a/R/app.R +++ b/R/app.R @@ -49,8 +49,7 @@ start_gui <- function(port = 8080, # Use [start_gui_in_devmode()] for development catf("Checking CDK version") check_cdk_version() - oldplan <- future::plan("multisession", workers = nw) - on.exit(future::plan(oldplan), add = TRUE) + with(future::plan("multisession", workers = nw), local = TRUE) catf("Starting FastRet GUI") app <- fastret_app(port, host, reload, nsw) shiny::runApp(app) @@ -140,8 +139,7 @@ start_gui_in_devmode <- function(strategy = "sequential", )) catf("Initializing cluster") - oldplan <- future::plan(strategy) - on.exit(future::plan(oldplan), add = TRUE, after = FALSE) + with(future::plan(strategy), local = TRUE) catf("Starting FastRet GUI in development mode") pkg_root <- dirname(system.file("DESCRIPTION", package = "FastRet")) diff --git a/R/sm.R b/R/sm.R index 22ee9be..231d403 100644 --- a/R/sm.R +++ b/R/sm.R @@ -13,7 +13,7 @@ #' re-measurement. The selection process includes: #' #' 1. 
Generating chemical descriptors from the SMILES strings of the molecules. -#' These are the features used by [train_frm()] and [adjust_frm()]. +#' These are the features used by [train_frm()] and [adjust_frm()]. #' 2. Standardizing chemical descriptors to have zero mean and unit variance. #' 3. Training a Ridge Regression model with the standardized chemical #' descriptors as features and the retention times as the target variable. @@ -40,15 +40,17 @@ #' #' @param rt_coef #' Which coefficient to use for scaling RT before clustering. Options are: -#' - max_ridge_coef: scale with the maximum absolute coefficient obtained in -#' ridge regression. I.e., RT will have approximately the same weight as the -#' most important chemical descriptor. -#' - 1: do not scale RT any further, i.e., use standardized RT. The effect of +#' - `0`: exclude RT from the clustering. +#' - 'max' / 'max_ridge_coef': scale with the maximum absolute coefficient +#' obtained in ridge regression. I.e., RT will have approximately the same +#' weight as the most important chemical descriptor. +#' - `1`: do not scale RT any further, i.e., use standardized RT. The effect of #' leaving RT unscaled is kind of unpredictable, as the ridge coefficients #' depend on the dataset. If the maximum absolute coefficient is much smaller #' than 1, RT will dominate the clustering. If it is much larger than 1, RT #' will have little influence on the clustering. -#' - 0: exclude RT from the clustering. +#' - 'inf': set all chemical descriptor values to zero, i.e., RT is "infinitely" +#' more important than any chemical descriptor. 
#' #' @return #' A list containing the following elements: @@ -77,6 +79,15 @@ selective_measuring <- function(raw_data, rt_coef = "max_ridge_coef" ) { + stopifnot( + is.data.frame(raw_data), + all(c("NAME", "SMILES", "RT") %in% colnames(raw_data)), + is.numeric(k_cluster), length(k_cluster) == 1, !is.na(k_cluster), k_cluster >= 2, + is.logical(verbose) || is.numeric(verbose), + is.null(seed) || is.numeric(seed), + is.numeric(rt_coef) || is.character(rt_coef), length(rt_coef) == 1, !is.na(rt_coef) + ) + rt_coef <- try_as_numeric(rt_coef, fallback = rt_coef) logf <- if (verbose >= 1) catf else null logf("Starting Selective Measuring") @@ -91,25 +102,31 @@ selective_measuring <- function(raw_data, df <- df[, nonmeta, drop = FALSE] dfz <- as.data.frame(scale(df)) # z-score standardized (mean 0, sd 1) - logf("Training Ridge Regression model") - Xz <- as.matrix(dfz[, colnames(dfz) != "RT", drop = FALSE]) - y <- as.numeric(dfz[, "RT"]) - model <- fit_glmnet(Xz, y, method = "ridge", seed = seed) + if (rt_coef %in% c(Inf, "inf")) { + logf("Setting CDs to zero because rt_coef is Inf") + coefs <- rep(0, ncol(dfz) - 1) + names(coefs) <- colnames(dfz)[colnames(dfz) != "RT"] + model <- NULL + dfzb <- data.frame(RT = dfz$RT) + } else { + logf("Training Ridge Regression model") + Xz <- as.matrix(dfz[, colnames(dfz) != "RT", drop = FALSE]) + y <- as.numeric(dfz[, "RT"]) + model <- fit_glmnet(Xz, y, method = "ridge", seed = seed) - logf("Scaling features by coefficients of Ridge Regression model") - coef_mat <- glmnet::coef.glmnet(model) # (m+1) x 1 matrix - coefs <- as.numeric(coef_mat)[-1] # remove intercept - coefs <- setNames(coefs, rownames(coef_mat)[-1]) - coefs <- coefs[colnames(Xz)] # ensure correct order - Xzb <- sweep(Xz, 2, coefs, `*`) # z-score standardized and beta-scaled + logf("Scaling features by coefficients of Ridge Regression model") + coef_mat <- glmnet::coef.glmnet(model) # (m+1) x 1 matrix + coefs <- as.numeric(coef_mat)[-1] # remove intercept + coefs <- 
setNames(coefs, rownames(coef_mat)[-1]) + coefs <- coefs[colnames(Xz)] # ensure correct order + Xzb <- sweep(Xz, 2, coefs, `*`) # z-score standardized and beta-scaled - logf("Scaling RT by %s before clustering", rt_coef) - rtc <- if (rt_coef == "max_ridge_coef") { - max(abs(coefs), na.rm = TRUE) - } else { - as.numeric(rt_coef) + logf("Scaling RT by %s before clustering", rt_coef) + rtc <- if (grepl("max", rt_coef)) max(abs(coefs), na.rm = TRUE) + else if (is.numeric(rt_coef)) as.numeric(rt_coef) + else stop("Invalid value for rt_coef: ", rt_coef) + dfzb <- data.frame(RT = dfz$RT * rtc, Xzb) } - dfzb <- data.frame(RT = dfz$RT * rtc, Xzb) logf("Applying PAM clustering") clobj <- cluster::pam(dfzb, k = as.numeric(k_cluster)) @@ -124,3 +141,9 @@ selective_measuring <- function(raw_data, ) ret } + +try_as_numeric <- function(x, fallback = x) { + if (length(x) != 1) stop("scalar expected") + y <- suppressWarnings(as.numeric(x)) + if (!is.na(y)) y else fallback +} \ No newline at end of file diff --git a/R/train.R b/R/train.R index 9e84b5b..75f78f3 100644 --- a/R/train.R +++ b/R/train.R @@ -88,7 +88,7 @@ train_frm <- function(df, method = "lasso", verbose = 1, nfolds = 5, nw = 1, stopifnot( is.data.frame(df), all(c("NAME", "RT", "SMILES") %in% colnames(df)), - method %in% c("lasso", "ridge", "gbtree", "gbtreeDefault", "gbtreeRP"), + is.character(method), length(method) == 1, !is.na(method), is.numeric(nfolds), nfolds >= 2, is.numeric(nw), nw >= 1, is.numeric(degree_polynomial), degree_polynomial >= 1, @@ -101,6 +101,7 @@ train_frm <- function(df, method = "lasso", verbose = 1, nfolds = 5, nw = 1, ) # Init variables + method <- normalize_train_method(method) if (is.numeric(seed)) set.seed(seed) if (method == "gbtree") method <- "gbtreeDefault" args <- named( @@ -209,12 +210,26 @@ train_frm <- function(df, method = "lasso", verbose = 1, nfolds = 5, nw = 1, #' A logical value indicating whether to add chemical descriptors as predictors #' to new data. 
Default is TRUE if `adj_type` is "lasso", "ridge" or "gbtree" #' and FALSE if `adj_type` is "lm". +#' @param match_rts +#' Logical indicating how to obtain the RT feature used during training of the +#' adjustment model. If TRUE (default), RT is obtained by matching rows in +#' `new_data` to rows in `frm$df` based on `match_keys`. If FALSE, RT is obtained +#' by applying the base model to `new_data`. +#' @param match_keys +#' Which columns to use for matching `new_data` to `frm$df` if `match_rts` is +#' TRUE. Can be any combination of "INCHIKEY", "SMILES" and "NAME". If left at +#' NULL, matching is done via `c("SMILES", "INCHIKEY")` if INCHIKEYs are +#' available, else via `c("SMILES", "NAME")`. See 'Details' for more information +#' on the matching procedure. #' #' @details -#' Matching is done via "SMILES"+"INCHIKEY" if both datasets have non-missing -#' INCHIKEYs for all rows; otherwise via "SMILES"+"NAME". If multiple rows in -#' `frm$df` match the same row in `new_data`, their RT values are averaged -#' first, and this average is used for training the adjustment model. +#' Matching is done via `c("SMILES", "INCHIKEY")` if both datasets have non-missing +#' INCHIKEYs for all rows; otherwise via `c("SMILES", "NAME")`. This can be +#' overridden by supplying `match_keys`. If `match_rts = FALSE`, no matching is +#' performed and retention times are generated by calling `predict(frm, new_data, +#' adjust = FALSE)`. If multiple rows in `frm$df` match the same row in +#' `new_data`, their RT values are averaged first, and this average is used for +#' training the adjustment model. #' #' Example: if `frm$df` equals data.frame OLD shown below and `new_data` equals #' data.frame NEW, then the resulting, paired data.frame will look like PAIRED. 
@@ -282,7 +297,7 @@ train_frm <- function(df, method = "lasso", verbose = 1, nfolds = 5, nw = 1, #' frm <- read_rp_lasso_model_rds() #' new_data <- read_rpadj_xlsx() #' frm_adj <- adjust_frm(frm, new_data, verbose = 0) -#' +#' frm_adj <- adjust_frm(frm, new_data, verbose = 0, adj_type = "lasso") adjust_frm <- function(frm, new_data, predictors = 1:6, @@ -291,7 +306,9 @@ adjust_frm <- function(frm, seed = NULL, do_cv = TRUE, adj_type = "lm", - add_cds = NULL) { + add_cds = NULL, + match_rts = TRUE, + match_keys = NULL) { # Stubs for interactive Development if (FALSE) { @@ -309,16 +326,20 @@ adjust_frm <- function(frm, is.logical(verbose) || is.numeric(verbose), is.null(seed) || is.numeric(seed), is.logical(do_cv), - is.character(adj_type), adj_type %in% c("lm", "lasso", "ridge", "gbtree"), - is.null(add_cds) || is.logical(add_cds) + is.character(adj_type), length(adj_type) == 1, !is.na(adj_type), + is.null(add_cds) || is.logical(add_cds), + is.logical(match_rts), length(match_rts) == 1, !is.na(match_rts), + is.null(match_keys) || (is.character(match_keys) && length(match_keys) >= 1 && + all(match_keys %in% c("INCHIKEY", "SMILES", "NAME"))) ) - if (length(predictors) == 1 && predictors != 7 && adj_type != "lm") { + adj_type <- normalize_adj_type(adj_type) + if (is.null(seed)) seed <- sample.int(.Machine$integer.max, 1) + if (is.null(add_cds)) add_cds <- if (adj_type == "lm") FALSE else TRUE + if (length(predictors) == 1 && isFALSE(add_cds) && adj_type != "lm") { fmt <- "Adjustment via '%s' requires 2+ predictors. Setting 'adj_type' to 'lm'." 
warning(sprintf(fmt, adj_type)) adj_type <- "lm" } - if (is.null(seed)) seed <- sample.int(.Machine$integer.max, 1) - if (is.null(add_cds)) add_cds <- if (adj_type == "lm") FALSE else TRUE # Configure logging logf <- if (verbose >= 1) catf else null @@ -335,8 +356,8 @@ adjust_frm <- function(frm, # Map new and old data logf("Mapping new and old data") - args <- named(predictors, nfolds, verbose, seed, do_cv, adj_type) - df <- merge_dfs(old = frm$df, new = new_data) + args <- named(predictors, nfolds, verbose, seed, do_cv, adj_type, match_rts, match_keys) + df <- build_adjustment_df(frm, new_data, match_rts, match_keys) df <- preprocess_data( df, degree_polynomial = 1, interaction_terms = FALSE, verbose = verbose, nw = 1, rm_near_zero_var = FALSE, rm_na = FALSE, add_cds = add_cds, @@ -373,6 +394,7 @@ adjust_frm <- function(frm, models <- mapply( adjust_frm, list(frm), train_dfs, list(predictors), nfolds, verbose, seeds, FALSE, adj_type, + MoreArgs = list(add_cds = add_cds, match_rts = match_rts, match_keys = match_keys), SIMPLIFY = FALSE, USE.NAMES = FALSE ) dbgf("Predicting RT_ADJ for hold-out data in each fold") @@ -486,9 +508,9 @@ predict.frm <- function(object = train_frm(), # Init locals logf <- if (verbose >= 1) catf else null - cd_pds <- get_predictors(object, base = TRUE, adjust = FALSE) - dgp <- get_dgp(cd_pds) - iat <- any(grepl(":", cd_pds)) + pds <- get_predictors(object, base = TRUE, adjust = FALSE) + dgp <- get_dgp(pds) + iat <- any(grepl(":", pds)) rm_nzv <- FALSE rm_na <- FALSE add_cds <- TRUE @@ -498,47 +520,40 @@ predict.frm <- function(object = train_frm(), errmsg <- "Model has not been adjusted yet. Please adjust the model first using `adjust_frm()`." stop(errmsg) } - if (!all(cd_pds %in% colnames(df))) { - logf("Chemical descriptors not found in newdata. Trying to calculate them from the provided SMILES.") + if (!all(pds %in% colnames(df))) { + logf("Predictors not found in newdata. 
Trying to calculate them from the provided SMILES.") df <- preprocess_data( df, dgp, iat, verbose, 1, rm_nzv, rm_na, add_cds, rm_ucs = TRUE, rt_terms = 1, mandatory = c("NAME", "SMILES") ) } - if (!all(cd_pds %in% colnames(df))) { - missing <- paste(setdiff(cd_pds, colnames(df)), collapse = ", ") - errmsg <- paste("The following cd_pds are missing in `df`: ", missing) + if (!all(pds %in% colnames(df))) { + missing <- paste(setdiff(pds, colnames(df)), collapse = ", ") + errmsg <- paste("The following predictors are missing in `df`: ", missing) stop(errmsg) } # Impute missing values - if (impute && any(is.na(df[, cd_pds]))) { - nap <- cd_pds[colSums(is.na(df[, cd_pds])) > 0] - napstr <- paste(nap, collapse = ", ") - logf("NA values found for following cd_pds: %s", napstr) - logf("Replacing NA values by column means of training data") + if (impute) { train_df <- object$df - if (!all(cd_pds %in% colnames(train_df))) { + if (!all(pds %in% colnames(train_df))) { # We don't store interaction terms and/or polynomial features in # the training data frame, so we need to preprocess it again to get - # these features. + # these features, if they are required for imputation. 
train_df <- preprocess_data( train_df, dgp, iat, verbose, nw = 1, rm_near_zero_var = FALSE, rm_na = FALSE, add_cds = TRUE, rm_ucs = TRUE, rt_terms = 1, mandatory = c("NAME", "SMILES") ) } - for (p in nap) { - col_mean <- mean(train_df[[p]], na.rm = TRUE) - df[[p]][is.na(df[[p]])] <- col_mean - } + df <- impute_missing_predictors(df, pds, train_df, logf) } # Predict retention times using base model adjust <- !is.null(object$adj) && (isTRUE(adjust) || is.null(adjust)) logf("Predicting retention times") - yhat <- as.numeric(predict(object$model, as.matrix(df[, cd_pds]))) + yhat <- as.numeric(predict(object$model, as.matrix(df[, pds]))) # Adjust predictions if requested if (adjust) { @@ -558,6 +573,11 @@ predict.frm <- function(object = train_frm(), ) } adj_df <- adj_df[, adj_pds, drop = FALSE] + + # Impute missing values for adjustment predictors (can happen for + # exotic compounds where CD calculation fails for a few features). + if (impute) adj_df <- impute_missing_predictors(adj_df, adj_pds, object$adj$df, logf) + X_adj <- if (inherits(object$adj$model, "lm")) adj_df else as.matrix(adj_df) x <- try(yhat <- as.numeric(predict(object$adj$model, X_adj))) } @@ -573,6 +593,37 @@ predict.frm <- function(object = train_frm(), as.numeric(yhat) } +impute_missing_predictors <- function(df, pds, train_df, logf = null) { + if (length(pds) == 0) return(df) + if (!all(pds %in% colnames(df))) { + missing <- paste(setdiff(pds, colnames(df)), collapse = ", ") + stop(sprintf("The following predictors are missing in `df`: %s", missing)) + } + if (!all(pds %in% colnames(train_df))) { + missing <- paste(setdiff(pds, colnames(train_df)), collapse = ", ") + fmt <- "Required predictors missing in training data: %s. This indicates a model/training-data mismatch." 
+ stop(sprintf(fmt, missing)) + } + + bad <- !is.finite(as.matrix(df[, pds, drop = FALSE])) + if (any(bad)) df[, pds][bad] <- NA + if (!anyNA(df[, pds, drop = FALSE])) return(df) + + nap <- pds[colSums(is.na(df[, pds, drop = FALSE])) > 0] + if (length(nap) == 0) return(df) + + logf("NA values found for following predictors: %s", paste(nap, collapse = ", ")) + logf("Replacing NA values by column means of training data") + + for (p in nap) { + col_mean <- mean(train_df[[p]], na.rm = TRUE) + if (!is.finite(col_mean) || is.na(col_mean)) col_mean <- 0 + df[[p]][is.na(df[[p]])] <- col_mean + } + + df +} + get_req_pkgs <- function(frm) { model_types <- get_model_type(frm) pkgs <- c( @@ -690,110 +741,185 @@ clip_predictions <- function(yhat, y) { yhat } -# Preprocessing ##### +# Helpers ##### + +normalize_train_method <- function(method) { + stopifnot(is.character(method), length(method) == 1, !is.na(method)) + igrepl <- function(pattern, x) grepl(pattern, x, ignore.case = TRUE) + if (igrepl("^(lasso)$", method)) return("lasso") + if (igrepl("^(ridge)$", method)) return("ridge") + if (igrepl("^(gbtree|gbtreeDefault|BRT)$", method)) return("gbtree") + if (igrepl("^(gbtreeRP|BRTRP)$", method)) return("gbtreeRP") + fmt <- "Invalid method '%s'. Allowed values: lasso, ridge, gbtree, gbtreeRP." + stop(sprintf(fmt, method)) +} + +normalize_adj_type <- function(adj_type) { + stopifnot(is.character(adj_type), length(adj_type) == 1, !is.na(adj_type)) + igrepl <- function(pattern, x) grepl(pattern, x, ignore.case = TRUE) + if (igrepl("^(lm)$", adj_type)) return("lm") + if (igrepl("^(lasso)$", adj_type)) return("lasso") + if (igrepl("^(ridge)$", adj_type)) return("ridge") + if (igrepl("^(gbtree|gbtreeDefault|BRT)$", adj_type)) return("gbtree") + fmt <- "Invalid adj_type '%s'. Allowed values: lm, lasso, ridge, gbtree." 
+ stop(sprintf(fmt, adj_type)) +} #' @noRd -#' @title Merge training and adjustment data.frames +#' @title Build adjustment data #' @description -#' Merges the original data frame used to train a FastRet model with a new -#' data frame provided by the user for adjusting the original model. +#' Creates the data.frame used to train an adjustment model either by matching +#' the new measurements against the original training data or by predicting the +#' missing RTs with the base model. #' -#' @param old The original data frame used to train `frm`. Must contain columns -#' "NAME", "SMILES", "RT" and optionally "INCHIKEY" as well the chemical -#' descriptors listed in `CDFeatures` -#' @param new The new data frame provided by the user. Must contain columns -#' "NAME", "SMILES", "RT" and optionally "INCHIKEY". +#' @param frm FastRet model that provides the original training data and base model. +#' @param new The new adjustment data frame. +#' @param match_rts Logical indicating whether RTs should be taken from +#' `frm$df` via matching. +#' @param match_keys Character vector with the columns used for matching. #' -#' @return A data frame with following columns: -#' - NAME: Molecule names from `new` -#' - SMILES: Molecule SMILES from `new` -#' - RT: Corresponding retention times taken from `old` -#' - RT_ADJ: Retention times from `new` +#' @return A data.frame containing NAME, SMILES, optional INCHIKEY, RT, and RT_ADJ. 
#' #' @examples -#' new <- data.frame( -#' NAME = c("A", "B", "B", "B"), -#' SMILES = c("C", "CC", "CC", "CC"), -#' RT = c(2.5, 5.5, 5.7, 5.6) -#' ) -#' old <- data.frame( -#' NAME = c("A", "B", "B", "C"), -#' SMILES = c("C", "CC", "CC", "CCC"), -#' RT = c(5.0, 8.0, 8.2, 9.0), -#' nAtom = c(1, 2, 2, 3) -#' ) -#' merged <- merge_dfs(old, new) -#' print(merged) -#' ## data.frame( -#' ## NAME = c("A", "B", "B", "B"), -#' ## SMILES = c("C", "CC", "CC", "CC"), -#' ## RT_ADJ = c(2.5, 5.5, 5.7, 5.6), -#' ## RT = c(5, 8.1, 8.1, 8.1), -#' ## nAtom = c(1, 2, 2, 2) -#' ## ) -merge_dfs <- function(old, new) { - use_inchi <- { - !is.null(old$INCHIKEY) && all(!is.na(old$INCHIKEY)) && - !is.null(new$INCHIKEY) && all(!is.na(new$INCHIKEY)) +#' frm <- read_rp_lasso_model_rds() +#' new_data <- read_rpadj_xlsx()[1:3, ] +#' build_adjustment_df(frm, new_data, match_rts = TRUE) +build_adjustment_df <- function(frm, new, match_rts = TRUE, match_keys = NULL) { + df <- data.frame(NAME = new$NAME, SMILES = new$SMILES, RT_ADJ = new$RT) + if ("INCHIKEY" %in% colnames(new)) { + df$INCHIKEY <- new$INCHIKEY } - if (use_inchi) { - new <- data.frame( - NAME = new$NAME, - INCHIKEY = new$INCHIKEY, - SMILES = new$SMILES, - RT_ADJ = new$RT, - KEY = paste0(new$SMILES, "_", new$INCHIKEY) - ) - old <- data.frame( - RT = old$RT, - KEY = paste0(old$SMILES, "_", old$INCHIKEY) - ) - } else { - new <- data.frame( - NAME = new$NAME, - SMILES = new$SMILES, - RT_ADJ = new$RT, - KEY = paste0(new$SMILES, "_", new$NAME) - ) - old <- data.frame( - RT = old$RT, - KEY = paste0(old$SMILES, "_", old$NAME) - ) + if (!match_rts) { + df$RT <- as.numeric(predict(frm, new, adjust = FALSE, verbose = 0)) + return(df) } - old <- old[old$KEY %in% unique(new$KEY), ] - - # At this point, we can still have multiple measurements per KEY in old. To - # deal with this, we average their RTs first, then map every new entry to - # the average. 
- old_RT_avgs <- tapply(old$RT, old$KEY, mean) - old <- old[!duplicated(old$KEY), ] - old$RT <- as.numeric(old_RT_avgs[old$KEY]) # as.numeric drops dimnames attr - - # Now finally merge new and old data to get the paired data.frame for model - # fitting. Make sure to restore the original order after merging, as - # `merge()` returns rows in an 'unspecified' order (even with sort=FALSE). - new$ID <- seq_len(nrow(new)) - df <- merge(new, old, by = "KEY", sort = FALSE, all = FALSE) - df <- df[order(df$ID), ] + keys <- resolve_match_keys(frm$df, new, match_keys) + df$KEY <- build_match_key(new, keys, "new data") + df$ID <- seq_len(nrow(df)) + key <- build_match_key(frm$df, keys, "original data") + old <- data.frame(RT = frm$df$RT, KEY = key) + old <- old[old$KEY %in% unique(df$KEY), , drop = FALSE] + if (!nrow(old)) { + key_str <- paste(keys, collapse = "+") + fmt <- "Could not map any new entries to original data based on %s." + stop(sprintf(fmt, key_str)) + } + rt_avg <- tapply(old$RT, old$KEY, mean) + old <- old[!duplicated(old$KEY), , drop = FALSE] + old$RT <- as.numeric(rt_avg[old$KEY]) + df <- merge(df, old, by = "KEY", all = FALSE, sort = FALSE) + df <- df[order(df$ID), , drop = FALSE] df$ID <- NULL df$KEY <- NULL - if (nrow(df) < nrow(new)) { - key <- if (use_inchi) "SMILES+INCHIKEY" else "SMILES+NAME" + key_str <- paste(keys, collapse = "+") + miss <- nrow(new) - nrow(df) fmt <- "Could not map %d new entries to original data based on %s." - stop(sprintf(fmt, nrow(new) - nrow(df), key)) - } else if (nrow(df) > nrow(new)) { - # This can't happen due to averaging above. We check anyway to be safe. - msg <- sprintf("Ambiguous mapping: %d new -> %d old", nrow(new), nrow(df)) - stop(msg) + stop(sprintf(fmt, miss, key_str)) } df } +#' @noRd +#' @title Resolve match keys +#' @description Determine which columns should be used to match `new` against the training data. +#' @param old Original training data stored in `frm$df`. +#' @param new New adjustment data frame. 
+#' @param match_keys Optional character vector supplied by the caller. +#' @return Character vector of column names used for matching. +#' @examples +#' old <- data.frame(SMILES = "C", NAME = "A", INCHIKEY = "AAA", RT = 1) +#' new <- data.frame(SMILES = "C", NAME = "A", INCHIKEY = "AAA", RT = 2) +#' resolve_match_keys(old, new, NULL) # c("SMILES", "INCHIKEY") +#' resolve_match_keys(old, new, c("SMILES", "NAME")) # c("SMILES", "NAME") +#' resolve_match_keys(old, new, c("SMILES", "FOO")) # error +resolve_match_keys <- function(old, new, match_keys) { + allowed <- c("INCHIKEY", "SMILES", "NAME") + keys <- if (is.null(match_keys)) { + has_inchi <- ("INCHIKEY" %in% colnames(old)) && + ("INCHIKEY" %in% colnames(new)) && + all(!is.na(old$INCHIKEY)) && + all(!is.na(new$INCHIKEY)) + if (has_inchi) c("SMILES", "INCHIKEY") else c("SMILES", "NAME") + } else { + invalid <- setdiff(match_keys, allowed) + if (length(invalid)) { + fmt <- "Invalid match_keys supplied: %s" + stop(sprintf(fmt, paste(invalid, collapse = ", "))) + } + match_keys + } + ensure_cols_exist(old, keys, "original data") + ensure_cols_exist(new, keys, "new data") + ensure_no_NAs(old, keys, "original data") + ensure_no_NAs(new, keys, "new data") + keys +} + +#' @noRd +#' @title Build composite match key +#' @description Concatenate selected columns to create deterministic match keys. +#' @param df Data frame containing the columns of interest. +#' @param cols Character vector with column names used in the key. +#' @param label Short label used in error messages. +#' @return Character vector with unique keys per row. 
+#' @examples +#' df <- data.frame(SMILES = c("C", "CC"), NAME = c("A", "B")) +#' build_match_key(df, c("SMILES", "NAME"), "demo") +#' ## +build_match_key <- function(df, cols, label) { + parts <- lapply(cols, function(col) df[[col]]) + parts <- lapply(parts, as.character) + key <- do.call(paste, c(parts, list(sep = "__"))) + if (any(is.na(key))) { + fmt <- "Missing values detected while creating %s keys." + stop(sprintf(fmt, label)) + } + key +} + +#' @noRd +#' @title Ensure required columns are present +#' @description Stop with an informative message when required matching columns are missing. +#' @param df Data frame to inspect. +#' @param cols Character vector with required columns. +#' @param label Text describing the data frame (e.g. "new data"). +#' @examples +#' ensure_cols_exist(data.frame(SMILES = "C"), "SMILES", "demo") +ensure_cols_exist <- function(df, cols, label) { + missing <- setdiff(cols, colnames(df)) + if (length(missing)) { + fmt <- "%s missing columns required for matching: %s" + stop(sprintf(fmt, label, paste(missing, collapse = ", "))) + } +} + +#' @noRd +#' @title Ensure matching columns have no missing values +#' @description Validate that the supplied columns do not contain `NA`s before matching. +#' @param df Data frame to inspect. +#' @param cols Character vector with required columns. +#' @param label Text describing the data frame (e.g. "original data"). +#' @examples +#' ensure_no_NAs(data.frame(SMILES = "C"), "SMILES", "demo") +ensure_no_NAs <- function(df, cols, label) { + has_na <- sapply(cols, function(col) any(is.na(df[[col]]))) + if (any(has_na)) { + cols_na <- cols[has_na] + fmt <- "Columns %s contain NA values in %s." + stop(sprintf(fmt, paste(cols_na, collapse = ", "), label)) + } +} + #' @noRd #' @description Get RMSE, Rsquared, MAE and %below1min for a specific dataset and model. 
#' @param data dataframe with retention time in the first column #' @param model object useable as input for [predict()] +#' @examples +#' df <- data.frame(RT = c(1, 2)) +#' df[[CDFeatures[1]]] <- c(0.3, 0.8) +#' lm_mod <- stats::lm(RT ~ ., data = df) +#' get_stats(df, lm_mod) get_stats <- function(df, model) { X <- as.matrix(df[, colnames(df) %in% CDFeatures]) y <- df$RT diff --git a/TODOS.md b/TODOS.md deleted file mode 100644 index a3d480a..0000000 --- a/TODOS.md +++ /dev/null @@ -1,176 +0,0 @@ -# Todos - -- [Open](#open) - - [Make sure tests run on Linux and MacOS](#make-sure-tests-run-on-linux-and-macos) - - [Update included Measurements](#update-included-measurements) - - [Add tests for the shiny GUI](#add-tests-for-the-shiny-gui) - - [Remove RP](#remove-rp) - - [Ensure test coverage](#ensure-test-coverage) - - [Ensure backwards compatibility](#ensure-backwards-compatibility) - - [Resubmit to CRAN](#resubmit-to-cran) -- [Done](#done) - - [Allow disabling of cross-validation](#allow-disabling-of-cross-validation) - - [Fix failing tests in Github CI](#fix-failing-tests-in-github-ci) - - [Allow Adjustment based on chemical descriptors](#allow-adjustment-based-on-chemical-descriptors) - - [Add gbtree and glmnet as adjustment models](#add-gbtree-and-glmnet-as-adjustment-models) - -## Open - -### Make sure tests run on Linux and MacOS - -Currently failing tests: - -```log -══ Failed tests ════════════════════════════════════════════════════════════════ -── Error ('test-fit_gbtree.R:47:5'): fit_gbtree works with xgboost 1.7 ───────── - -Error: Error: ! in callr subprocess. -Caused by error: -! Expected `packageVersion("xgboost", lib.loc = new_lib) == package_version("1.7.9.1")` to be TRUE. -Differences: -`actual`: FALSE -`expected`: TRUE - -Installed xgboost version: 1.7.11.1 - -[ FAIL 1 | WARN 0 | SKIP 1 | PASS 94 ] -Error: -! Test failures. 
-Execution halted -``` - - -### Update included Measurements - -- Remove `inst/extdata/Measurements_v8.xlsx` -- Add `data/datasets.rda`, containing datasets described in - [../freda/README.md](../freda/README.md), i.e., HILIC_1, HILIC_2, - HILIC_Retip_1, HILIC_Retip_2, RP_1, RP_2, RP_AXMM_1, RP_AXMM_2, RP_B_1, - RP_B_2, RP_Flat, RP_FR25_Flat, RP_Steep, RP_T25_Flat, RP_T25_FR25_Flat, - RP_T25_FR25_Steep, RP_Val, RP_Val_Flat, RP_Val_FR25_Flat, RP_Val_Steep, - RP_Val_T25_Flat, RP_Val_T25_FR25_Flat, RP_Val_T25_FR25_Steep. - -### Add tests for the shiny GUI - -Do some research about best-practices for testing shiny applications. -If no good options exists, define a minimal set of tests that ensure that the -shiny application works as expected. And then execute each test manually. - -### Remove RP - -Replace all mentions of the following objects with corresponding objects from -the new `datasets.rda` object. See issue 'Update included Measurements'. - -- `RP.rda` -- `RP.xlsx` -- `RP_adj.xlsx` -- `RP_lass_model.rds` -- `Measurements_v8.xlsx` - -Then mark the all old objects as deprecated in the documentation - -### Ensure test coverage - -Ensure we have tests for the following scenarios: - -1. Train a ridge model, predict with it, adjust with lm/RT, predict with adjusted model -2. Train a lasso model, predict with it, adjust with gbtree/RTTCD, predict with adjusted model -3. Train a gbtree model, predict with it, adjust with lasso/RTCD, predict with adjusted model - -### Ensure backwards compatibility - -Include an adjusted lasso and gbtree model that was trained and adjusted with -FastRet version 1.2.2 or earlier. Write testcase to ensure that all *.frm -functions like adjust_frm, predict.frm, get_predictors, coef.frm, plot.frm work -as expected with those. - -### Resubmit to CRAN - -Resubmit FastRet v1.3.0 to CRAN. - -## Done - -### Allow disabling of cross-validation - -In `adjust_frm` allow `docv = FALSE` (do-cross-validation). 
In `train_frm` allow -`docv = FALSE` (do-cross-validation). The corresponding `cv` element of the -returned object should be `NULL` in those cases. The docs and tests must be -adjusted accordingly. - -### Fix failing tests in Github CI - -The following tests: - -```txt -── Error ('test-train_frm-gbtree.R:5:5'): train_frm works if `method == "GBTree"` ── - -Error in `FUN(X[[i]], ...)`: subscript out of bounds -Backtrace: -1. └─FastRet::train_frm(...) at test-train_frm-gbtree.R:5:5 -2. └─base::lapply(tmp, "[[", 2) - -── Error ('test-fit_gbtree.R:8:5'): fit.gbtrees works as expected ────────────── -Error in `begin_iteration:end_iteration`: argument of length 0 -Backtrace: -1. └─FastRet:::fit_gbtree(df, verbose = 0) at test-fit_gbtree.R:8:5 -2. └─FastRet:::fit_gbtree_grid(...) -3. └─xgboost::xgb.train(...) - -── Error ('test-fit_gbtree.R:16:5'): fit.gbtrees works for data from reverse phase column ── -Error in `begin_iteration:end_iteration`: argument of length 0 -Backtrace: -1. └─FastRet:::fit_gbtree(df, verbose = 0) at test-fit_gbtree.R:16:5 -2. └─FastRet:::fit_gbtree_grid(...) -3. └─xgboost::xgb.train(...) -``` - -fail in: - -- https://www.r-project.org/nosvn/R.check/r-devel-linux-x86_64-fedora-clang/FastRet-00check.html -- https://www.r-project.org/nosvn/R.check/r-devel-linux-x86_64-debian-gcc/FastRet-00check.html -- https://www.r-project.org/nosvn/R.check/r-devel-linux-x86_64-fedora-gcc/FastRet-00check.html -- https://www.r-project.org/nosvn/R.check/r-release-linux-x86_64/FastRet-00check.html -- https://www.r-project.org/nosvn/R.check/r-oldrel-windows-x86_64/FastRet-00check.html - -I strongly assume, it has to do with the update of xgboost from version 1.7.11 -to 3.1.2.1 that was published on 2025-12-03. I.e., we need to check if we can -find a way that works with both xgboost versions (1.x.x) and (3.x.x). If we -cannot, we should introduce a version check and call different code paths -depending on the installed xgboost version. 
In this case, we should add tests in -.github/workflows for both xgboost versions to make sure that both code paths are -tested in the future. E.g. doing something linke this: - -- {os: macos-latest, r: 'release', xgboost: 'lastest'} -- {os: windows-latest, r: 'release', xgboost: 'lastest'} -- {os: ubuntu-latest, r: 'devel', xgboost: 'lastest', http-user-agent: 'release'} -- {os: ubuntu-latest, r: 'release', xgboost: 'lastest'} -- {os: ubuntu-latest, r: 'release', xgboost: '1.7.11'} -- {os: ubuntu-latest, r: 'oldrel-1', xgboost: 'lastest'} -- {os: ubuntu-latest, r: '4.1', xgboost: 'lastest'} - -Instead of - -- {os: macos-latest, r: 'release'} -- {os: windows-latest, r: 'release'} -- {os: ubuntu-latest, r: 'devel', http-user-agent: 'release'} -- {os: ubuntu-latest, r: 'release'} -- {os: ubuntu-latest, r: 'release'} -- {os: ubuntu-latest, r: 'oldrel-1'} -- {os: ubuntu-latest, r: '4.1'} - -in `.github/workflows/r-cmd-check.yaml`. - -### Allow Adjustment based on chemical descriptors - -In `adjust_frm` add argument `add_cds`. If `add_cds = TRUE` chemical descriptors -should be calculated using `preprocess_data()` (as done in `train_frm()`) but -always without interaction terms and without polynomial terms. The resulting CDs -should be included in the adjustment model. - -### Add gbtree and glmnet as adjustment models - -Currently, adjustment models are only fitted using `lm`. In addition, `glmnet` -and `gbtree` models (as returned by `fit_glmnet()` and `fit_gbtree()`) should be -supported as well. Introduce a new argument `adj_type` to `adjust_frm()` that -can take values `"lm"` (default), `"glmnet"`, and `"gbtree"` and based on this -value decide which adjustment model to fit. 
diff --git a/man/adjust_frm.Rd b/man/adjust_frm.Rd index 764b0a6..4b3fbc1 100644 --- a/man/adjust_frm.Rd +++ b/man/adjust_frm.Rd @@ -13,7 +13,9 @@ adjust_frm( seed = NULL, do_cv = TRUE, adj_type = "lm", - add_cds = NULL + add_cds = NULL, + match_rts = TRUE, + match_keys = NULL ) } \arguments{ @@ -45,6 +47,17 @@ the \code{cv} element in the returned adjustment object will be NULL.} \item{add_cds}{A logical value indicating whether to add chemical descriptors as predictors to new data. Default is TRUE if \code{adj_type} is "lasso", "ridge" or "gbtree" and FALSE if \code{adj_type} is "lm".} + +\item{match_rts}{Logical indicating how to obtain the RT feature used during training of the +adjustment model. If TRUE (default), RT is obtained by matching rows in +\code{new_data} to rows in \code{frm$df} based on \code{match_keys}. If FALSE, RT is obtained +by applying the base model to \code{new_data}.} + +\item{match_keys}{Which columns to use for matching \code{new_data} to \code{frm$df} if \code{match_rts} is +TRUE. Can be any combination of "INCHIKEY", "SMILES" and "NAME". If left at +NULL, matching is done via \code{c("SMILES", "INCHIKEY")} if INCHIKEYs are +available, else via \code{c("SMILES", "NAME")}. See 'Details' for more information +on the matching procedure.} } \value{ An object of class \code{frm}, as returned by \code{\link[=train_frm]{train_frm()}}, but with an @@ -80,10 +93,12 @@ the original column) and to attach this adjustment model to an existing FastRet model. } \details{ -Matching is done via "SMILES"+"INCHIKEY" if both datasets have non-missing -INCHIKEYs for all rows; otherwise via "SMILES"+"NAME". If multiple rows in -\code{frm$df} match the same row in \code{new_data}, their RT values are averaged -first, and this average is used for training the adjustment model. +Matching is done via \code{c("SMILES", "INCHIKEY")} if both datasets have non-missing +INCHIKEYs for all rows; otherwise via \code{c("SMILES", "NAME")}. 
This can be +overridden by supplying \code{match_keys}. If \code{match_rts = FALSE}, no matching is +performed and retention times are generated by calling \code{predict(frm, new_data, adjust = FALSE)}. If multiple rows in \code{frm$df} match the same row in +\code{new_data}, their RT values are averaged first, and this average is used for +training the adjustment model. Example: if \code{frm$df} equals data.frame OLD shown below and \code{new_data} equals data.frame NEW, then the resulting, paired data.frame will look like PAIRED. diff --git a/man/selective_measuring.Rd b/man/selective_measuring.Rd index aec9f19..a418d1b 100644 --- a/man/selective_measuring.Rd +++ b/man/selective_measuring.Rd @@ -25,15 +25,17 @@ function.} \item{rt_coef}{Which coefficient to use for scaling RT before clustering. Options are: \itemize{ -\item max_ridge_coef: scale with the maximum absolute coefficient obtained in -ridge regression. I.e., RT will have approximately the same weight as the -most important chemical descriptor. -\item 1: do not scale RT any further, i.e., use standardized RT. The effect of +\item \code{0}: exclude RT from the clustering. +\item 'max' / 'max_ridge_coef': scale with the maximum absolute coefficient +obtained in ridge regression. I.e., RT will have approximately the same +weight as the most important chemical descriptor. +\item \code{1}: do not scale RT any further, i.e., use standardized RT. The effect of leaving RT unscaled is kind of unpredictable, as the ridge coefficients depend on the dataset. If the maximum absolute coefficient is much smaller than 1, RT will dominate the clustering. If it is much larger than 1, RT will have little influence on the clustering. -\item 0: exclude RT from the clustering. +\item 'inf': set all chemical descriptor values to zero, i.e., RT is "infinitely" +more important than any chemical descriptor. }} } \value{ @@ -59,7 +61,7 @@ selects a sensible subset of molecules from the original dataset for re-measurement. 
The selection process includes:
\enumerate{
\item Generating chemical descriptors from the SMILES strings of the molecules.
-These are the features used by \code{\link[=train_frm]{train_frm()}} and \code{\link[=adjust_frm]{adjust_frm()}}.
+These are the features used by \code{\link[=train_frm]{train_frm()}} and \code{\link[=adjust_frm]{adjust_frm()}}.
\item Standardizing chemical descriptors to have zero mean and unit variance.
\item Training a Ridge Regression model with the standardized chemical
descriptors as features and the retention times as the target variable.
diff --git a/misc/datasets/Measurements_v10.xlsx b/misc/datasets/Measurements_v10.xlsx
deleted file mode 100644
index b6ef79f..0000000
Binary files a/misc/datasets/Measurements_v10.xlsx and /dev/null differ
diff --git a/tests/testthat/test-adjust_frm.R b/tests/testthat/test-adjust_frm.R
index 5e92685..8d65084 100644
--- a/tests/testthat/test-adjust_frm.R
+++ b/tests/testthat/test-adjust_frm.R
@@ -62,7 +62,7 @@ test_that("adjust_frm merges by INCHIKEY when available", {

   # Now change a few NAMES as well, so that even the SMILES+NAME fallback
   # cannot find matches for all entries
-  new$NAME[8:10] <- NA
+  new$NAME[8:10] <- paste0("missing_", seq_len(3))
   expect_error(
     afm3 <- adjust_frm(frm, new, nfolds = 2, verbose = 0, seed = 42, predictors = 1),
     "Could not map 3 new entries"
@@ -79,6 +79,64 @@

 })


+test_that("adjust_frm can skip RT matching and use predictions", {
+
+  frm <- readRDS(pkg_file("extdata/RP_lasso_model.rds"))
+  idx <- seq(2, 200, by = 20)
+  new <- frm$df[idx, c("NAME", "SMILES", "RT")]
+  set.seed(19)
+  new$RT <- new$RT + runif(nrow(new), min = -0.2, max = 0.2)
+
+  base_preds <- predict(frm, df = new, adjust = FALSE, verbose = 0)
+  afm <- adjust_frm(
+    frm = frm,
+    new_data = new,
+    nfolds = 2,
+    verbose = 0,
+    seed = 19,
+    predictors = 1:2,
+    adj_type = "lasso",
+    match_rts = FALSE
+  )
+
+  expect_equal(afm$adj$df$RT, base_preds)
+  
expect_equal(afm$adj$df$RT_ADJ, new$RT)
+  expect_false(afm$adj$args$match_rts)
+  expect_true(is.null(afm$adj$args$match_keys))
+  expect_equal(nrow(afm$adj$df), nrow(new))
+})
+
+
+test_that("adjust_frm respects custom match_keys", {
+
+  frm <- readRDS(pkg_file("extdata/RP_lasso_model.rds"))
+  frm$df$INCHIKEY <- NULL
+  idx <- seq(3, 120, by = 15)
+  new <- frm$df[idx, c("NAME", "SMILES", "RT")]
+  new$NAME <- paste0(new$NAME, "__new")
+
+  expect_error(
+    adjust_frm(frm, new, nfolds = 2, verbose = 0, seed = 5),
+    regexp = "Could not map"
+  )
+
+  afm <- adjust_frm(
+    frm, new,
+    nfolds = 2,
+    verbose = 0,
+    seed = 5,
+    predictors = 1:2,
+    adj_type = "lasso",
+    match_keys = "SMILES"
+  )
+
+  expect_equal(nrow(afm$adj$df), nrow(new))
+  expect_identical(afm$adj$args$match_keys, "SMILES")
+  expect_true(all(afm$adj$df$SMILES == new$SMILES))
+  expect_equal(afm$adj$df$RT_ADJ, new$RT)
+})
+
+
 test_that("adjust_frm works with do_cv = FALSE", {
   # Load pretrained model for speed-up
   frm <- readRDS(pkg_file("extdata/RP_lasso_model.rds"))
diff --git a/tests/testthat/test-predict_frm.R b/tests/testthat/test-predict_frm.R
index 5ff9c70..6e9a9cd 100644
--- a/tests/testthat/test-predict_frm.R
+++ b/tests/testthat/test-predict_frm.R
@@ -57,4 +57,26 @@ test_that("predict.frm imputes CDs if they are NA", {
   # Check Results
   expect_equal(round(yhat, 2), c(4.31, 4.54))
   expect_equal(round(yhat_no_imputing, 2), c(NaN, 4.54))
+
+  # Now test imputation for adjusted predictions.
+  #
+  # Adjustment dataframes never remove NAs or NZV-predictors during training.
+  # Therefore the predict function assumes them to be present during
+  # prediction as well. To mimic this behavior, store an appropriately
+  # preprocessed dataframe in frm$adj$df. 
+ frm$adj$model <- frm$model # Reuse same model, which depends on Kier3 + frm$adj$df <- preprocess_data( + frm$df, degree_polynomial = 1, interaction_terms = FALSE, verbose = 0, + nw = 1, rm_near_zero_var = FALSE, rm_na = FALSE, add_cds = TRUE, + rm_ucs = TRUE, rt_terms = 1 + ) + + # Without imputation, NA in adjustment predictors yields NA predictions. + yhat_no_impute <- predict.frm(frm, new_data, adjust = TRUE, impute = FALSE, clip = FALSE, verbose = 0) + expect_true(anyNA(yhat_no_impute)) + + # With imputation, predictions must be finite. + yhat <- predict.frm(frm, new_data, adjust = TRUE, impute = TRUE, clip = FALSE, verbose = 0) + expect_false(anyNA(yhat)) + expect_true(all(is.finite(yhat))) })