diff --git a/R/AAA.R b/R/AAA.R index 2692bd9..705b537 100644 --- a/R/AAA.R +++ b/R/AAA.R @@ -3,3 +3,6 @@ ### the variance of the negative gradient options(gamboostLSS_stab_ngrad = FALSE) } + +# get rid of NOTEs in R CMD check for "undefined global functions or variables" +globalVariables(c("ngradient", "y", "combined_risk")) diff --git a/R/cvrisk.nc_mboostLSS.R b/R/cvrisk.nc_mboostLSS.R index 02793e3..0743e34 100644 --- a/R/cvrisk.nc_mboostLSS.R +++ b/R/cvrisk.nc_mboostLSS.R @@ -99,7 +99,7 @@ plot.nc_cvriskLSS <- function(x, type = "lines", main = attr(x, "type"), ...) { if (type != "lines") - warning("Only ", sQuote('type = "lines"'), " supported for non-cyclical fitting") + warning("Only ", sQuote('type = "lines"'), " supported for noncyclical fitting") plot.cvriskLSS(x = x, type = "lines", xlab = xlab, ylab = ylab, ylim = ylim, main = main, ...) } diff --git a/R/helpers.R b/R/helpers.R index f8e7629..d7a593c 100644 --- a/R/helpers.R +++ b/R/helpers.R @@ -119,20 +119,20 @@ rescale_weights <- function(w) { ## helper function in a modified version based on mboost_2.2-3 ## print trace of boosting iterations -do_trace <- function(current, mstart, risk, - linebreak = options("width")$width / 2, mstop = 1000) { - current <- current - mstart +do_trace <- function(current, risk, mstart, + linebreak = round(options("width")$width / 2), mstop = 1000) { + + current <- current - mstart + if (current != mstop) { - if ((current - 1) %/% linebreak == (current - 1) / linebreak) { + if (current %% linebreak == 1) { mchr <- formatC(current + mstart, format = "d", width = nchar(mstop) + 1, big.mark = "'") cat(paste("[", mchr, "] ",sep = "")) - } else { - if ((current %/% linebreak != current / linebreak)) { - cat(".") - } else { - cat(" -- risk:", risk[current + mstart], "\n") - } + } + cat(".") + if (current %% linebreak == 0) { + cat(" -- risk:", risk[current + mstart], "\n") } } else { cat("\nFinal risk:", risk[current + mstart], "\n") diff --git a/R/mboostLSS.R b/R/mboostLSS.R 
index 51ca303..3e9fd19 100644 --- a/R/mboostLSS.R +++ b/R/mboostLSS.R @@ -4,14 +4,14 @@ ### (glm/gam/m/black)boostLSS functions mboostLSS <- function(formula, data = list(), families = GaussianLSS(), - control = boost_control(), weights = NULL, - method = c("cycling", "inner", "outer"), ...){ - + control = boost_control(), weights = NULL, + method = c("cyclic", "noncyclic"), ...){ + cl <- match.call() if(is.null(cl$families)) cl$families <- families method <- match.arg(method) - + fit <- mboostLSS_fit(formula = formula, data = data, families = families, control = control, weights = weights, ..., fun = mboost, funchar = "mboost", call = cl, @@ -20,47 +20,47 @@ mboostLSS <- function(formula, data = list(), families = GaussianLSS(), } glmboostLSS <- function(formula, data = list(), families = GaussianLSS(), - control = boost_control(), weights = NULL, - method = c("cycling", "inner", "outer"), ...){ - + control = boost_control(), weights = NULL, + method = c("cyclic", "noncyclic"), ...){ + cl <- match.call() if(is.null(cl$families)) cl$families <- families method <- match.arg(method) - + fit <- mboostLSS_fit(formula = formula, data = data, families = families, control = control, weights = weights, ..., fun = glmboost, funchar = "glmboost", call = cl, method = method) - + return(fit) } gamboostLSS <- function(formula, data = list(), families = GaussianLSS(), - control = boost_control(), weights = NULL, - method = c("cycling", "inner", "outer"), ...){ - + control = boost_control(), weights = NULL, + method = c("cyclic", "noncyclic"), ...){ + cl <- match.call() if(is.null(cl$families)) cl$families <- families method <- match.arg(method) - + fit <- mboostLSS_fit(formula = formula, data = data, families = families, control = control, weights = weights, ..., fun = gamboost, funchar = "gamboost", call = cl, method = method) - + return(fit) } blackboostLSS <- function(formula, data = list(), families = GaussianLSS(), - control = boost_control(), weights = NULL, - method = 
c("cycling", "inner", "outer"), ...){ + control = boost_control(), weights = NULL, + method = c("cyclic", "noncyclic"), ...){ cl <- match.call() if(is.null(cl$families)) cl$families <- families method <- match.arg(method) - + fit <- mboostLSS_fit(formula = formula, data = data, families = families, control = control, weights = weights, ..., fun = blackboost, funchar = "blackboost", call = cl, @@ -74,249 +74,15 @@ mboostLSS_fit <- function(formula, data = list(), families = GaussianLSS(), control = boost_control(), weights = NULL, fun = mboost, funchar = "mboost", call = NULL, method, ...){ - - if(method == "cycling") - iBoost_cycling <- function(niter) { - - start <- sapply(fit, mstop) - mvals <- vector("list", length(niter)) - for (j in 1:length(niter)){ - mvals[[j]] <- rep(start[j] + niter[j], max(niter)) - if (niter[j] > 0) - mvals[[j]][1:niter[j]] <- (start[j] + 1):(start[j] + niter[j]) - } - - ENV <- lapply(mods, function(j) environment(fit[[j]]$subset)) - - for (i in 1:max(niter)){ - for (j in mods){ - ## update value of nuisance parameters - ## use response(fitted()) as this is much quicker than fitted(, type = response) - for (k in mods[-j]) - assign(names(fit)[k], families[[k]]@response(fitted(fit[[k]])), - environment(get("ngradient", environment(fit[[j]]$subset)))) - ## update value of u, i.e. compute ngradient with new nuisance parameters - - ENV[[j]][["u"]] <- ENV[[j]][["ngradient"]](ENV[[j]][["y"]], ENV[[j]][["fit"]], - ENV[[j]][["weights"]]) - # same as: - # evalq(u <- ngradient(y, fit, weights), environment(fit[[j]]$subset)) - - ## update j-th component to "m-th" boosting step - fit[[j]][mvals[[j]][i]] - } - if (trace){ - firstRun <- firstRun - ## which is the current risk? 
rev() needed to get the last - ## list element with maximum length - whichRisk <- names(which.max(rev(lapply(lapply(fit, function(x) x$risk()), length)))) - do_trace(current = max(sapply(mvals, function(x) x[i])), - mstart = ifelse(firstRun, 0, max(start)), - mstop = ifelse(firstRun, max(niter) + 1, max(niter)), - risk = fit[[whichRisk]]$risk()) - } - } - return(TRUE) - } - if(method == "outer") - iBoost_outer <- function(niter){ - - #this is the case for boosting from the beginning - if(is.null(attr(fit, "combined_risk")) | niter == 0){ - combined_risk <- vapply(fit, risk, numeric(1)) - } - - best <- which(names(fit) == tail(names(combined_risk), 1)) - - - - ENV <- lapply(mods, function(j) environment(fit[[j]]$subset)) - - for (i in seq_len(niter)){ - - ## update value of nuisance parameters - ## use response(fitted()) as this is much quicker than fitted(, type = response) - for( k in mods[-best]){ - assign(names(fit)[best], families[[best]]@response(fitted(fit[[best]])), - environment(get("ngradient", environment(fit[[k]]$subset)))) - - #evalq(u <- ngradient(y, fit, weights), ENV[[k]]) - ENV[[k]][["u"]] <- ENV[[k]][["ngradient"]](ENV[[k]][["y"]], - ENV[[k]][["fit"]], - ENV[[k]][["weights"]]) - } - - if (funchar == "glmboost") { - lik_risks <- list() - coefs <- list() - - for(i in mods){ - coefs[[i]] <- evalq(environment(get("fit1"))[["est"]](u)/ - environment(get("fit1"))[["sxtx"]], envir = ENV[[i]]) - - all_fitted <- sweep(environment(ENV[[i]][["fit1"]])[["X"]], 2, - coefs[[i]] ,`*`) - - lik_risks[[i]] <- sapply(1:ncol(all_fitted), function(j) - ENV[[i]][["triskfct"]](ENV[[i]][["y"]], - ENV[[i]][["fit"]] + nu[i] * all_fitted[,j])) - } - - #do a mboost step per hand - best <- which.min(vapply(lik_risks, min, - FUN.VALUE = numeric(1), ...)) - - ENV[[best]]$all_fitted <- all_fitted - ENV[[best]]$lik_risks <- lik_risks - ENV[[best]]$best <- best - ENV[[best]]$coefs <- coefs - - evalq({ - xs <- which.min(lik_risks[[best]]) - basses <- list(model = c(coef = 
coefs[[best]][xs], - xselect = xs, - p = length(coefs[[best]])), - fitted = function() { - return(coefs[[best]][xs] * environment(get("fit1"))[["X"]][, xs, drop = FALSE]) - }) - class(basses) <- c("bm_cwlin", "bm_lin", "bm") - fit <- fit + nu * basses$fitted() - u <- get("ngradient")(get("y"), fit, weights) - mrisk[(mstop + 1)] <- get("triskfct")(get("y"), fit) - ens[[(mstop + 1)]] <- basses - xselect[(mstop + 1)] <- xs - nuisance[[(mstop +1)]] <- family@nuisance() - mstop <- mstop + 1}, - envir <- ENV[[best]]) - } - - #all other baselearner - else{ - risks <- list() - for( i in mods){ - risks[[i]] <- evalq({ - ss_new <- lapply(get("blfit", envir = environment(get("basefit"))), - function(x) x(u)) - sapply(ss_new, function(x) get("riskfct")(get("y"), fit + nu * x$fitted(), weights)) - }, envir = ENV[[i]]) - } - - best <- which.min(vapply(risks, min, - FUN.VALUE = numeric(1))) - - evalq({ - ss <- lapply(get("blfit", envir = environment(get("basefit"))), - function(x) x(u)) - xselect[mstop + 1] <- which.min(sapply(ss, function(x) - get("riskfct")(get("y"), fit + nu * x$fitted(), weights))) - fit <- fit + nu * ss[[tail(xselect, 1)]]$fitted() - u <- get("ngradient")(get("y"), fit, weights) - mrisk[(mstop + 1)] <- get("triskfct")(get("y"), fit) - ens[[(mstop + 1)]] <- ss[[tail(xselect, 1)]] - nuisance[[(mstop + 1)]] <- family@nuisance() - mstop <- mstop + 1}, - envir = ENV[[best]]) - - } - - combined_risk[(length(combined_risk) + 1)] <- tail(risk(fit[[best]]), 1) - names(combined_risk)[length(combined_risk)] <- names(fit)[best] - - if (trace){ - do_trace(current = length(combined_risk[combined_risk != 0]), - mstart = ifelse(firstRun, length(fit) + 1, - length(combined_risk[combined_risk != 0])), - mstop = ifelse(firstRun, niter - length(fit), niter), - risk = combined_risk[combined_risk != 0]) - } - } - combined_risk <<- combined_risk - return(TRUE) - } - if(method == "inner" ) - iBoost_inner <- function(niter){ - - - #this is the case for boosting from the beginning - 
if(is.null(attr(fit, "combined_risk")) | niter == 0){ - combined_risk <- vapply(fit, risk, numeric(1)) - } - - best <- which(names(fit) == tail(names(combined_risk), 1)) - - - - ENV <- lapply(mods, function(j) environment(fit[[j]]$subset)) - - for (i in seq_len(niter)){ - - ## update value of nuisance parameters - ## use response(fitted()) as this is much quicker than fitted(, type = response) - for( k in mods[-best]){ - assign(names(fit)[best], families[[best]]@response(fitted(fit[[best]])), - environment(get("ngradient", environment(fit[[k]]$subset)))) - } - - risks <- numeric(length(fit)) - for(b in 1:length(fit)){ - st <- mstop(fit[[b]]) - fit[[b]][st + 1] - #risks[b] <- evalq({riskfct(y, fit, weights)}, envir = ENV[[b]]) - risks[b] <- ENV[[b]][["riskfct"]](ENV[[b]][["y"]], - ENV[[b]][["fit"]], - ENV[[b]][["weights"]]) - fit[[b]][st] - - ## fit[[b]][st] is not enough to reduce the model back to beginning, so - ## so all these values have to be reduced, so that they are calculated - ## correctly the next time - evalq({xselect <- xselect[1:mstop]; - mrisk <- mrisk[1:mstop]; - ens <- ens[1:mstop]; - nuisance <- nuisance[1:mstop]}, - environment(fit[[b]]$subset)) - } - - - - best <- which.min(risks) - - ## update value of u, i.e. 
compute ngradient with new nuisance parameters - ENV[[best]][["u"]] <- ENV[[best]][["ngradient"]](ENV[[best]][["y"]], - ENV[[best]][["fit"]], - ENV[[best]][["weights"]]) - - # same as: - # evalq(u <- ngradient(y, fit, weights), environment(fit[[j]]$subset)) - - - ## update selected component by 1 - fit[[best]][mstop(fit[[best]]) + 1] - - - combined_risk[(length(combined_risk) + 1)] <- tail(risk(fit[[best]]), 1) - names(combined_risk)[length(combined_risk)] <- names(fit)[best] - - if (trace){ - do_trace(current = length(combined_risk[combined_risk != 0]), - mstart = ifelse(firstRun, length(fit) + 1, - length(combined_risk[combined_risk != 0])), - mstop = ifelse(firstRun, niter - length(fit), niter), - risk = combined_risk[combined_risk != 0]) - } - combined_risk <<- combined_risk - } - return(TRUE) - } - + if (length(families) == 0) stop(sQuote("families"), " not specified") - + if ("offset" %in% names(list(...))) stop("Do not use argument ", sQuote("offset"), ". Please specify offsets via families") ### Use mu in "families" to specify offset in mu-family etc. 
- + if (is.list(formula)){ if (!all(names(formula) %in% names(families)) || length(unique(names(formula))) != length(names(families))) @@ -339,28 +105,23 @@ mboostLSS_fit <- function(formula, data = list(), families = GaussianLSS(), tmp[[i]] <- formula formula <- tmp } - + mstop <- mstoparg <- control$mstop - control$mstop <- 1 - if (method == "cycling") { + control$mstop <- 0 + if (method == "cyclic") mstop <- check(mstop, "mstop", names(families)) - } else { - #check mstop for inner and outer loss fitting methods - if (length(mstop) != 1 | mstop %% 1 != 0 | mstop < length(families)) - stop(sQuote("mstop"), " has to be an integer larger than ", - length(families)) - } - + + nu <- control$nu nu <- check(nu, "nu", names(families)) - + if (is.list(control$risk) || is.list(control$center) || is.list(control$trace)) stop(sQuote("risk"),", ", sQuote("center"), " and ", sQuote("trace") , " cannot be lists in ", sQuote("boost_control")) - + trace <- control$trace control$trace <- FALSE - + w <- weights if (is.null(weights)){ if (!is.list(response)) { @@ -370,12 +131,12 @@ mboostLSS_fit <- function(formula, data = list(), families = GaussianLSS(), } } weights <- rescale_weights(weights) - + fit <- vector("list", length = length(families)) names(fit) <- names(families) - + mods <- 1:length(fit) - + offset <- vector("list", length = length(mods)) names(offset) <- names(families) for (j in mods){ @@ -409,47 +170,136 @@ mboostLSS_fit <- function(formula, data = list(), families = GaussianLSS(), control=control, weights = w, ...)) } - if (trace) - do_trace(current = 1, mstart = 0, - mstop = max(mstop), - risk = fit[[length(fit)]]$risk()) - - #select the correct fitting method - if (method == "cycling") { - if (any(mstop > 1)){ - firstRun <- TRUE - ## actually go for initial mstop iterations! 
- tmp <- iBoost_cycling(mstop - 1) + + iBoost <- function(niter, method) { + + start <- sapply(fit, mstop) + # initialize combined_risk + combined_risk <- NA + + if (method == "noncyclic") { + ### noncyclical fitting ### + # this is the case for boosting from the beginning + if (is.null(attr(fit, "combined_risk")) | niter == 0) { + combined_risk <- vapply(fit, risk, numeric(1)) + } + + best <- which(names(fit) == tail(names(combined_risk), 1)) + } else { + ### cyclical fitting ### + mvals <- vector("list", length(niter)) + for (j in 1:length(niter)) { + mvals[[j]] <- rep(start[j] + niter[j], max(niter)) + if (niter[j] > 0) + mvals[[j]][1:niter[j]] <- (start[j] + 1):(start[j] + niter[j]) + } } - } - - else { - if (mstop >= length(fit)){ - firstRun <- TRUE - combined_risk <- NULL - tmp <- ifelse(method == "inner", - iBoost_inner(mstop - length(fit)), - iBoost_outer(mstop - length(fit))) + + ENV <- lapply(mods, function(j) environment(fit[[j]]$subset)) + ## main loop starts here ## + for (i in 1:max(niter)){ + if (method == "noncyclic") { + ### noncyclical fitting ### + + ## update value of nuisance parameters + ## use response(fitted()) as this is much quicker than fitted(, type = response) + for( k in mods[-best]) { + assign(names(fit)[best], families[[best]]@response(fitted(fit[[best]])), + environment(get("ngradient", environment(fit[[k]]$subset)))) + } + + risks <- numeric(length(fit)) + for(b in 1:length(fit)){ + st <- mstop(fit[[b]]) + mstop(fit[[b]]) = st + 1 + risks[b] <- tail(risk(fit[[b]]), 1) + #evalq({riskfct(y, fit, weights)}, envir = ENV[[b]]) + #risks[b] <- ENV[[b]][["riskfct"]](ENV[[b]][["y"]], ENV[[b]][["fit"]], ENV[[b]][["weights"]]) + fit[[b]][st] + + ## fit[[b]][st] is not enough to reduce the model back to beginning, so + ## so all these values have to be reduced, so that they are calculated + ## correctly the next time + evalq({ + xselect <- xselect[seq_len(mstop)]; + mrisk <- mrisk[seq_len(mstop + 1)]; + ens <- ens[seq_len(mstop)]; + nuisance 
<- nuisance[seq_len(mstop)] + }, + environment(fit[[b]]$subset)) + } + + best <- which.min(risks) + + ## update value of u, i.e. compute ngradient with new nuisance parameters + evalq({u <- ngradient(y, fit, weights)}, ENV[[best]]) + #ENV[[best]][["u"]] <- ENV[[best]][["ngradient"]](ENV[[best]][["y"]], ENV[[best]][["fit"]], ENV[[best]][["weights"]]) + + ## update selected component by 1 + fit[[best]][mstop(fit[[best]]) + 1] + + ## update risk list + combined_risk[(length(combined_risk) + 1)] <- tail(risk(fit[[best]]), 1) + names(combined_risk)[length(combined_risk)] <- names(fit)[best] + combined_risk <<- combined_risk + + } else { + ### cyclical fitting ### + for (j in mods){ + ## update value of nuisance parameters + ## use response(fitted()) as this is much quicker than fitted(, type = response) + for (k in mods[-j]) + assign(names(fit)[k], families[[k]]@response(fitted(fit[[k]])), + environment(get("ngradient", environment(fit[[j]]$subset)))) + ## update value of u, i.e. compute ngradient with new nuisance parameters + + ENV[[j]][["u"]] <- ENV[[j]][["ngradient"]](ENV[[j]][["y"]], ENV[[j]][["fit"]], + ENV[[j]][["weights"]]) + # same as: + # evalq(u <- ngradient(y, fit, weights), environment(fit[[j]]$subset)) + + ## update j-th component to "m-th" boosting step + fit[[j]][mvals[[j]][i]] + } + } + + if (trace){ + if (method == "noncyclic") { + do_trace(current = length(combined_risk) - length(fit), mstart = sum(start), + risk = combined_risk[-(1:length(fit))], mstop = niter) + } else { + ## which is the current risk? 
rev() needed to get the last + ## list element with maximum length + whichRisk <- names(which.max(rev(lapply(fit, function(x) length(risk(x)))))) + do_trace(current = max(sapply(mvals, function(x) x[i])), + mstart = max(start), + mstop = max(niter), + risk = risk(fit[[whichRisk]])[-1]) + } + + } } + return(TRUE) } - - firstRun <- FALSE - - class(fit) <- c(paste(funchar, "LSS", sep=""), "mboostLSS") - if(method != "cycling"){ + + if (any(mstop > 0)) + tmp <- iBoost(mstop, method = method) + + class(fit) <- c(paste0(funchar, "LSS"), "mboostLSS") + if(method != "cyclic"){ class(fit) <- c("nc_mboostLSS", class(fit)) } - + ### update to a new number of boosting iterations mstop ### i <= mstop means less iterations than current ### i > mstop needs additional computations ### updates take place in THIS ENVIRONMENT, ### some models are CHANGED! - if(method == "cycling") { + if(method == "cyclic") { attr(fit, "subset") <- function(i) { - + i <- check(i, "mstop", names(families)) - + msf <- mstop(fit) niter <- i - msf if (all(niter == 0)) { @@ -458,88 +308,83 @@ mboostLSS_fit <- function(formula, data = list(), families = GaussianLSS(), } else { minStart <- min(msf[niter != 0], i[niter != 0]) } - + ## check if minStart bigger than mstop of parameters that are not touched #if (length(msf[niter == 0]) > 0 && minStart < min(msf[niter == 0])) #minStart <- min(msf[niter == 0]) - + ## reduce models first (when necessary) if (any(msf > minStart)){ cf <- class(fit) class(fit) <- "list" ## needed to use [] operator for lists - + #cat("processed parameters: ", paste(names(fit[msf > minStart]), # collapse = ", "), "\n") - + lapply(fit[msf > minStart], function(obj) obj$subset(minStart)) - + ## remove additional boosting iterations from environments lapply(fit[msf > minStart], function(obj){ - evalq({xselect <- xselect[1:mstop]; - mrisk <- mrisk[1:mstop]; - ens <- ens[1:mstop]; - nuisance <- nuisance[1:mstop]}, + evalq({xselect <- xselect[seq_len(mstop)]; + mrisk <- mrisk[seq_len(mstop 
+ 1)]; + ens <- ens[seq_len(mstop)]; + nuisance <- nuisance[seq_len(mstop)]}, environment(obj$subset)) }) - + class(fit) <- cf if (trace) cat("Model first reduced to mstop = ", minStart, ".\n", "Now continue ...\n", sep ="") } - + ## now increase models (when necessary) if (any(i > minStart)){ ## set negative values to 0 ## (only applicable if some parameters do not need to be touched inc <- ifelse(i - minStart > 0, i - minStart, 0) - tmp <- iBoost_cycling(inc) + tmp <- iBoost(inc, method = method) } - + mstop <<- i } - } + } else { attr(fit, "subset") <- function(i) { - if(i < length(fit)){ - warning(paste("Minimal number of iterations:", length(fit), - "(at least one iteration for each distribution parameter)")) - i <- length(fit) - } msf <- sum(mstop(fit)) niter <- i - msf - + ## check if minStart bigger than mstop of parameters that are not touched #if (length(msf[niter == 0]) > 0 && minStart < min(msf[niter == 0])) #minStart <- min(msf[niter == 0]) - + ## reduce models first (when necessary) if (niter < 0 ){ cf <- class(fit) + d = length(fit) class(fit) <- "list" ## needed to use [] operator for lists - #cat("processed parameters: ", paste(names(fit[msf > minStart]), # collapse = ", "), "\n") - #reduce the combined risk values - combined_risk <<- attr(fit, "combined_risk")()[1:i] - new_stop_value <- table(names(attr(fit, "combined_risk")())) - - + combined_risk <- attr(fit, "combined_risk")() + combined_risk <<- attr(fit, "combined_risk")()[seq_len(i + d)] + new_stop_value <- table(names(attr(fit, "combined_risk")())) - 1 + + for(o in names(new_stop_value)){ fit[[o]]$subset(new_stop_value[o]) } - + ## remove additional boosting iterations from environments lapply(fit, function(obj){ - evalq({xselect <- xselect[1:mstop]; - mrisk <- mrisk[1:mstop]; - ens <- ens[1:mstop]; - nuisance <- nuisance[1:mstop]}, + evalq({xselect <- xselect[seq_len(mstop)]; + mrisk <- mrisk[seq_len(mstop + 1)]; + ens <- ens[seq_len(mstop)]; + nuisance <- nuisance[seq_len(mstop)]}, 
environment(obj$subset)) }) - + #update ALL nuisance parameters to last update ENV <- lapply(mods, function(j) environment(fit[[j]]$subset)) for(j in names(new_stop_value)){ @@ -548,31 +393,24 @@ mboostLSS_fit <- function(formula, data = list(), families = GaussianLSS(), environment(get("ngradient", environment(fit[[j]]$subset)))) } } - + for(k in mods){ evalq(u <- get("ngradient")(get("y"), fit, weights), ENV[[k]]) } - - + + class(fit) <- cf } - + ## now increase models (when necessary) else if (niter > 0){ - if (method == "inner") { - tmp <- iBoost_inner(niter) - } - else { - tmp <- iBoost_outer(niter) - } - - + tmp <- iBoost(niter, method = method) } - + mstop <<- i } } - + ## make call in submodels nicer: cl <- call cl[[1]] <- as.name(gsub("LSS", "", cl[[1]])) @@ -582,9 +420,9 @@ mboostLSS_fit <- function(formula, data = list(), families = GaussianLSS(), ## This is not really nice fit[[i]]$call$family <- families[[i]] } - + attr(fit, "(weights)") <- weights ## attach weights used for fitting - + ## update to new weights; just a fresh start attr(fit, "update") <- function(weights = NULL, oobweights = NULL, risk = NULL, trace = NULL, mstop = NULL) { @@ -610,8 +448,8 @@ mboostLSS_fit <- function(formula, data = list(), families = GaussianLSS(), attr(fit, "call") <- call attr(fit, "data") <- data attr(fit, "families") <- families - if(method != "cycling") + if(method != "cyclic") attr(fit, "combined_risk") <- function() combined_risk - + return(fit) } diff --git a/R/methods.R b/R/methods.R index 228606c..ec52214 100644 --- a/R/methods.R +++ b/R/methods.R @@ -1,7 +1,7 @@ ## Methods "[.mboostLSS" <- function(x, i, return = TRUE, ...) 
{ - stopifnot((length(i) == 1 | length(i) == length(x)) && i > 0) + stopifnot(length(i) == 1 | length(i) == length(x)) attr(x, "subset")(i) if (return) return(x) invisible(NULL) @@ -455,8 +455,7 @@ stabsel.mboostLSS <- function(x, cutoff, q, PFER, if (nsel >= q) break } - #this changes nothing for method = "cycling" but fixes mstop for - #method = "inner" or "outer" + #this changes nothing for method = "cyclic" but fixes mstop for method = "noncyclic" mstop <- check(mstop, "mstop", names(x)) ## complete paths if (any(sapply(xs, length) < mstop)) { @@ -476,7 +475,7 @@ stabsel.mboostLSS <- function(x, cutoff, q, PFER, ret <- unlist(ret) ## compute selection paths - #merging for method cycling + #merging for method cyclic if(!inherits(x, "nc_mboostLSS")){ sequences <- lapply(1:length(xs), function(i) { res <- matrix(FALSE, nrow = length(nms[[i]]), ncol = mstop[[i]]) diff --git a/README.md b/README.md index 4005461..5dd462b 100644 --- a/README.md +++ b/README.md @@ -22,6 +22,8 @@ For installation instructions see below. Instructions on how to use `gamboostLSS` can be found here: - [gamboostLSS tutorial](https://www.jstatsoft.org/article/view/v074i01). +Details on the noncyclical fitting method can be found here: +- [noncyclical fitting](https://arxiv.org/abs/1611.10171). This is a preliminary version currently under review. 
## Issues & Feature Requests diff --git a/appveyor.yml b/appveyor.yml index 181e617..1c78173 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -12,21 +12,13 @@ install: - ps: Bootstrap # Adapt as necessary starting from here - -# environment: -# matrix: -# - TEST_DIR: patch -# - TEST_DIR: pkg - before_build: -# - cp ../travis-tool.sh ./travis-tool.sh -# - cp travis-tool.sh.cmd %TEST_DIR%/travis-tool.sh.cmd -# - cd %TEST_DIR% - bash -c "echo '^travis-tool\.sh\.cmd$' >> .Rbuildignore" -# - dir build_script: - travis-tool.sh install_deps + - travis-tool.sh install_github hofnerb/stabs + - travis-tool.sh install_github boost-R/mboost test_script: - travis-tool.sh run_tests diff --git a/inst/CITATION b/inst/CITATION index 0fa5bd7..cdc0a33 100644 --- a/inst/CITATION +++ b/inst/CITATION @@ -59,4 +59,24 @@ citEntry( ) +citEntry( + entry= "TechReport", + title = "Stability selection for component-wise gradient boosting in multiple dimensions", + author = personList(as.person("Janek Thomas"), + as.person("Andreas Mayr"), + as.person("Bernd Bischl"), + as.person("Matthias Schmid"), + as.person("Adam Smith"), + as.person("Benjamin Hofner")), + year = "2016", + institution = "ArXiv", + header = "To cite the noncyclical fitting method of 'gamboostLSS' use:", + url = "https://arxiv.org/abs/1611.10171", + textVersion = + paste("Thomas, J., Mayr, A., Bischl, B., Schmid, M., Smith, A., and Hofner, B. (2016).", + "Stability selection for component-wise gradient boosting in multiple dimensions.", + "arXiv preprint arXiv:1611.10171.") +) + + citFooter('\nUse ', sQuote('toBibtex(citation("gamboostLSS"))'), ' to extract BibTeX references.') diff --git a/man/cvrisk.Rd b/man/cvrisk.Rd index 0eb483a..240bf53 100644 --- a/man/cvrisk.Rd +++ b/man/cvrisk.Rd @@ -40,13 +40,13 @@ make.grid(max, length.out = 10, min = NULL, log = TRUE, bootstrap samples. 
} \item{grid}{ - If the model was fitted with \code{method = "cycling"}, grid is + If the model was fitted with \code{method = "cyclic"}, grid is a matrix of stopping parameters the empirical risk is to be evaluated for. Each row represents a parameter combination. The number of columns must be - equal to the number of parameters of the GAMLSS family. Per default, + equal to the number of parameters of the GAMLSS family. Per default, \code{make.grid(mstop(object))} is used. - Otherwise (i.e., for \code{method = "inner"} or \code{method = "outer"}) grid + Otherwise (i.e., for \code{method = "noncyclic"}) grid is a vector of mstop values. Per default all steps up to the intial stopping iteration, i.e., \code{1:mstop(object)} are used. } @@ -132,8 +132,7 @@ make.grid(max, length.out = 10, min = NULL, log = TRUE, fold to fold. The heatmap shows only the average risk but in a nicer fashion. - For the noncyclical fitting methods (i.e. \code{method = "inner"} or - \code{method = "outer"}) only the line plot for exists. + For \code{method = "noncyclic"} only the line plot exists. Hofner et al. (2015) provide a detailed description of cross-validation for \code{\link{gamboostLSS}} models and show a diff --git a/man/mboostLSS.Rd b/man/mboostLSS.Rd index bac10ef..c6dfb2d 100644 --- a/man/mboostLSS.Rd +++ b/man/mboostLSS.Rd @@ -18,13 +18,17 @@ } \usage{ mboostLSS(formula, data = list(), families = GaussianLSS(), - control = boost_control(), weights = NULL, method = c("cycling", "inner", "outer"), ...) + control = boost_control(), weights = NULL, + method = c("cyclic", "noncyclic"), ...) glmboostLSS(formula, data = list(), families = GaussianLSS(), - control = boost_control(), weights = NULL, method = c("cycling", "inner", "outer"), ...) + control = boost_control(), weights = NULL, + method = c("cyclic", "noncyclic"), ...) 
gamboostLSS(formula, data = list(), families = GaussianLSS(), - control = boost_control(), weights = NULL, method = c("cycling", "inner", "outer"), ...) + control = boost_control(), weights = NULL, + method = c("cyclic", "noncyclic"), ...) blackboostLSS(formula, data = list(), families = GaussianLSS(), - control = boost_control(), weights = NULL,method = c("cycling", "inner", "outer"), ...) + control = boost_control(), weights = NULL, + method = c("cyclic", "noncyclic"), ...) ## fit function: mboostLSS_fit(formula, data = list(), families = GaussianLSS(), @@ -58,9 +62,9 @@ mboostLSS_fit(formula, data = list(), families = GaussianLSS(), \item{call}{ used to forward the call from \code{mboostLSS}, \code{glmboostLSS}, \code{gamboostLSS} and \code{blackboostLSS}. This argument should not be directly specified by users!} - \item{method}{ fitting method, currently three methods are supported: - \code{"cycling"}, \code{"inner"} and \code{"outer"}. The latter two require a - one dimensional \code{mstop} value.} + \item{method}{ fitting method, currently two methods are supported: + \code{"cyclic"} and \code{"noncyclic"}. The latter requires a one dimensional \code{mstop} + value.} \item{\dots}{Further arguments to be passed to \code{mboostLSS_fit}. In \code{mboostLSS_fit}, \code{\dots} represent further arguments to be passed to \code{\link{mboost}} and \code{\link{mboost_fit}}. So 
@@ -113,12 +117,11 @@ mboostLSS_fit(formula, data = list(), families = GaussianLSS(), specify, e.g., \code{mstop = 100} via \code{\link{boost_control}}. For more-dimensional stopping, one can specify, e.g., \code{mstop = list(mu = 100, sigma = 200)} (see examples). - If \code{method} is set to \code{"inner"} or \code{"outer"}, \code{mstop} has + If \code{method} is set to \code{"noncyclic"}, \code{mstop} has to be a one dimensional integer. Instead of cycling through all distribution - parameters, in each iteration only the best baselearner is used. For - \code{"inner"} one baselearner of every parameter is selected via RSS, the - distribution parameter is then chosen via the loss. For \code{"outer"} the - selection is solely based on the (outer) loss. + parameters, in each iteration only the best baselearner is used. One baselearner of every + parameter is selected via RSS, the distribution parameter is then chosen via the loss. + For details on the noncyclic fitting method see Thomas et al. (2016). To (potentially) stabilize the model estimation by standardizing the negative gradients one can use the argument \code{stabilization} of @@ -152,6 +155,10 @@ Statistical Society, Series C (Applied Statistics), 54, 507-554. Buehlmann, P. and Hothorn, T. (2007), Boosting algorithms: Regularization, prediction and model fitting. Statistical Science, 22(4), 477--505. + +Thomas, J., Mayr, A., Bischl, B., Schmid, M., Smith, A., and Hofner, B. (2016), +Stability selection for component-wise gradient boosting in multiple dimensions. +arXiv preprint arXiv:1611.10171. } \seealso{ diff --git a/man/stabsel.mboostLSS.Rd b/man/stabsel.mboostLSS.Rd index 3f7c8ef..84800a0 100644 --- a/man/stabsel.mboostLSS.Rd +++ b/man/stabsel.mboostLSS.Rd @@ -127,7 +127,7 @@ dat <- data.frame(x1, x2, x3, x4, x5, x6, y) ### linear model with y ~ .
for both components: 400 boosting iterations model <- glmboostLSS(y ~ ., families = NBinomialLSS(), data = dat, control = boost_control(mstop = 400), - center = TRUE, method = "inner") + center = TRUE, method = "noncyclic") \donttest{### Do not test the following code per default on CRAN as it takes some time to run: diff --git a/tests/regtest-noncyclic_fitting.R b/tests/regtest-noncyclic_fitting.R index 09ffc58..a312024 100644 --- a/tests/regtest-noncyclic_fitting.R +++ b/tests/regtest-noncyclic_fitting.R @@ -20,36 +20,23 @@ dat <- data.frame(x1, x2, x3, x4, x5, x6, y) #glmboost model <- glmboostLSS(y ~ ., families = NBinomialLSS(), data = dat, - control = boost_control(mstop = 3), method = "outer") - -model <- glmboostLSS(y ~ ., families = NBinomialLSS(), data = dat, - control = boost_control(mstop = 3), method = "inner") + control = boost_control(mstop = 3), method = "noncyclic") #linear baselearner with bols model <- gamboostLSS(y ~ ., families = NBinomialLSS(), data = dat, - control = boost_control(mstop = 3), method = "outer", - baselearner = "bols") - -model <- gamboostLSS(y ~ ., families = NBinomialLSS(), data = dat, - control = boost_control(mstop = 3), method = "inner", + control = boost_control(mstop = 3), method = "noncyclic", baselearner = "bols") #nonlinear bbs baselearner model <- gamboostLSS(y ~ ., families = NBinomialLSS(), data = dat, - control = boost_control(mstop = 3), method = "outer", + control = boost_control(mstop = 3), method = "noncyclic", baselearner = "bbs") -model <- gamboostLSS(y ~ ., families = NBinomialLSS(), data = dat, - control = boost_control(mstop = 3), method = "inner", - baselearner = "bbs") - - - #reducing model and increasing it afterwards should yield the same fit model <- glmboostLSS(y ~ ., families = NBinomialLSS(), data = dat, - control = boost_control(mstop = 50), method = "outer") + control = boost_control(mstop = 50), method = "noncyclic") m_co <- coef(model) @@ -60,7 +47,7 @@ stopifnot(all.equal(m_co, coef(model))) 
model <- gamboostLSS(y ~ ., families = NBinomialLSS(), data = dat, - control = boost_control(mstop = 50), method = "outer", + control = boost_control(mstop = 50), method = "noncyclic", baselearner = "bols") m_co <- coef(model) @@ -72,7 +59,7 @@ stopifnot(all.equal(m_co, coef(model))) model <- gamboostLSS(y ~ ., families = NBinomialLSS(), data = dat, - control = boost_control(mstop = 50), method = "outer", + control = boost_control(mstop = 50), method = "noncyclic", baselearner = "bbs") m_co <- coef(model) @@ -83,32 +70,8 @@ mstop(model) <- 50 stopifnot(all.equal(m_co, coef(model))) -##inner### - -model <- glmboostLSS(y ~ ., families = NBinomialLSS(), data = dat, - control = boost_control(mstop = 50), method = "inner") - -m_co <- coef(model) - -mstop(model) <- 5 -mstop(model) <- 50 - -stopifnot(all.equal(m_co, coef(model))) - -model <- gamboostLSS(y ~ ., families = NBinomialLSS(), data = dat, - control = boost_control(mstop = 50), method = "inner", - baselearner = "bols") - -m_co <- coef(model) - -mstop(model) <- 5 -mstop(model) <- 50 - -stopifnot(all.equal(m_co, coef(model))) - - model <- gamboostLSS(y ~ ., families = NBinomialLSS(), data = dat, - control = boost_control(mstop = 50), method = "inner", + control = boost_control(mstop = 50), method = "noncyclic", baselearner = "bbs") m_co <- coef(model) @@ -120,7 +83,7 @@ stopifnot(all.equal(m_co, coef(model))) ## check cvrisk for noncyclic models model <- glmboostLSS(y ~ ., families = NBinomialLSS(), data = dat, - control = boost_control(mstop = 3), method = "outer") + control = boost_control(mstop = 3), method = "noncyclic") cvr1 <- cvrisk(model, grid = 1:50, cv(model.weights(model), B = 5)) cvr1 plot(cvr1) @@ -128,29 +91,35 @@ plot(cvr1) risk(model, merge = TRUE) risk(model, merge = FALSE) -model <- glmboostLSS(y ~ ., families = NBinomialLSS(), data = dat, - control = boost_control(mstop = 3), method = "inner") -cvr2 <- cvrisk(model, grid = 1:50, cv(model.weights(model), B = 5)) -cvr2 -plot(cvr2) - -risk(model, 
merge = TRUE) -risk(model, merge = FALSE) - -model <- glmboostLSS(y ~ ., families = NBinomialLSS(), data = dat, - control = boost_control(mstop = 3), method = "cycling") -cvr3 <- cvrisk(model, grid = make.grid(c(mu = 25, sigma = 25)), cv(model.weights(model), B = 5)) -cvr3 -plot(cvr3) -plot(cvr3, type = "lines") - -risk(model, merge = TRUE) -risk(model, merge = FALSE) - -par(mfrow = c(2, 2)) -plot(cvr1) -plot(cvr2) -plot(cvr3, type = "heatmap") -plot(cvr3, type = "lines") +## test that mstop = 0 is possible +compare_models <- function (m1, m2) { + stopifnot(all.equal(coef(m1), coef(m2))) + stopifnot(all.equal(predict(m1), predict(m2))) + stopifnot(all.equal(fitted(m1), fitted(m2))) + stopifnot(all.equal(selected(m1), selected(m2))) + stopifnot(all.equal(risk(m1), risk(m2))) + ## remove obvious differences from objects + m1$control <- m2$control <- NULL + m1$call <- m2$call <- NULL + if (!all.equal(m1, m2)) + stop("Objects of offset model + 1 step and model with 1 step not identical") + invisible(NULL) +} + +# set up models +mod <- glmboostLSS(y ~ ., data = dat, method = "noncyclic", control = boost_control(mstop = 0)) +mod2 <- glmboostLSS(y ~ ., data = dat, method = "noncyclic", control = boost_control(mstop = 1)) +mod3 <- glmboostLSS(y ~ ., data = dat, method = "noncyclic", control = boost_control(mstop = 1)) + +lapply(coef(mod), function(x) stopifnot(is.null(x))) + +mstop(mod3) <- 0 +mapply(compare_models, m1 = mod, m2 = mod3) + +mstop(mod) <- 1 +mapply(compare_models, m1 = mod, m2 = mod2) + +mstop(mod3) <- 1 +mapply(compare_models, m1 = mod, m2 = mod3)