From 80427d4a9f1015237077e75509aeb611661a3e0f Mon Sep 17 00:00:00 2001 From: nikosbosse Date: Mon, 27 Feb 2023 18:13:23 +0100 Subject: [PATCH 01/14] create proposal for a add_transformation function --- R/convenience-functions.R | 45 +++++++++++++++++++++ tests/testthat/test-convenience-functions.R | 12 ++++++ 2 files changed, 57 insertions(+) create mode 100644 R/convenience-functions.R create mode 100644 tests/testthat/test-convenience-functions.R diff --git a/R/convenience-functions.R b/R/convenience-functions.R new file mode 100644 index 000000000..f38e90b36 --- /dev/null +++ b/R/convenience-functions.R @@ -0,0 +1,45 @@ +#' @title Add transformations +#' +#' @description Add transformations of the forecasts and observations for +#' later scoring. For more information on why this might be desirable, check +#' out the linked reference. +#' +#' @inheritParams score +#' @return A data.table that includes the original data as well as a +#' transformation of the original data. There will be one additional column, +#' 'scale', present which will be set to "natural" for the untransformed +#' forecasts. +#' +#' @importFrom data.table ':=' is.data.table copy +#' @author Nikos Bosse \email{nikosbosse@@gmail.com} +#' @export +#' @references Transformation of forecasts for evaluating predictive +#' performance in an epidemiological context +#' Nikos I. Bosse, Sam Abbott, Anne Cori, Edwin van Leeuwen, Johannes Bracher, +#' Sebastian Funk +#' medRxiv 2023.01.23.23284722 +#' \doi{https://doi.org/10.1101/2023.01.23.23284722} +#' # nolint + +#' @keywords check-forecasts +#' @examples +#' add_transformation(example_quantile) + +add_transformation <- function(data, + fun = function(x) {log(x + 1)}, + label = "log", + ...) { + + data <- as.data.table(data) + transformed_data <- data.table::copy(data) + + data[, scale := "natural"] + transformed_data[, scale := label] + + transformed_data[, prediction := fun(prediction, ...)] + transformed_data[, true_value := fun(true_value, ...)] + + out <- rbind(data, transformed_data) + + return(out) +} diff --git a/tests/testthat/test-convenience-functions.R b/tests/testthat/test-convenience-functions.R new file mode 100644 index 000000000..428b7e3be --- /dev/null +++ b/tests/testthat/test-convenience-functions.R @@ -0,0 +1,12 @@ +test_that("function add_transformation works", { + + prediction <- example_quantile$prediction + + one <- suppressWarnings(add_transformation(example_quantile)) + expect_equal(one$prediction, c(prediction, log(predictions + 1))) + + two <- suppressWarnings( + add_transformation(example_quantile, fun = sqrt, label = "sqrt") + ) + expect_equal(two$prediction, c(prediction, sqrt(predictions))) +}) From 710e94d581f2519abfca69ffc74bdfb85336eb6f Mon Sep 17 00:00:00 2001 From: nikosbosse Date: Mon, 27 Feb 2023 18:17:43 +0100 Subject: [PATCH 02/14] add namespace entry and update docs --- NAMESPACE | 1 + man/add_transformation.Rd | 73 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 74 insertions(+) create mode 100644 man/add_transformation.Rd diff --git a/NAMESPACE b/NAMESPACE index 334ea3856..ae01ab5b9 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -3,6 +3,7 @@ S3method(print,scoringutils_check) export(abs_error) export(add_coverage) +export(add_transformation) export(ae_median_quantile) export(ae_median_sample) export(avail_forecasts) diff --git a/man/add_transformation.Rd b/man/add_transformation.Rd new file mode 100644 index 000000000..3952b1377 --- /dev/null +++ b/man/add_transformation.Rd @@ -0,0 +1,73 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/convenience-functions.R +\name{add_transformation} +\alias{add_transformation} +\title{Add transformations} +\usage{ +add_transformation( + data, + fun = function(x) { + log(x + 1) + }, + label = "log", + ... +) +} +\arguments{ +\item{data}{A data.frame or data.table with the predictions and observations. +For scoring using \code{\link[=score]{score()}}, the following columns need to be present: +\itemize{ +\item \code{true_value} - the true observed values +\item \code{prediction} - predictions or predictive samples for one +true value. (You only don't need to provide a prediction column if +you want to score quantile forecasts in a wide range format.)} +For scoring integer and continuous forecasts a \code{sample} column is needed: +\itemize{ +\item \code{sample} - an index to identify the predictive samples in the +prediction column generated by one model for one true value. Only +necessary for continuous and integer forecasts, not for +binary predictions.} +For scoring predictions in a quantile-format forecast you should provide +a column called \code{quantile}: +\itemize{ +\item \code{quantile}: quantile to which the prediction corresponds +} + +In addition a \code{model} column is suggested and if not present this will be +flagged and added to the input data with all forecasts assigned as an +"unspecified model"). + +You can check the format of your data using \code{\link[=check_forecasts]{check_forecasts()}} and there +are examples for each format (\link{example_quantile}, \link{example_continuous}, +\link{example_integer}, and \link{example_binary}).} + +\item{...}{additional parameters passed down to \code{\link[=score_quantile]{score_quantile()}} (internal +function used for scoring forecasts in a quantile-based format).} +} +\value{ +A data.table that includes the original data as well as a +transformation of the original data. There will be one additional column, +'scale', present which will be set to "natural" for the untransformed +forecasts. +} +\description{ +Add transformations of the forecasts and observations for +later scoring. For more information on why this might be desirable, check +out the linked reference. +} +\examples{ +add_transformation(example_quantile) +} +\references{ +Transformation of forecasts for evaluating predictive +performance in an epidemiological context +Nikos I. Bosse, Sam Abbott, Anne Cori, Edwin van Leeuwen, Johannes Bracher, +Sebastian Funk +medRxiv 2023.01.23.23284722 +\doi{https://doi.org/10.1101/2023.01.23.23284722} +\url{https://www.medrxiv.org/content/10.1101/2023.01.23.23284722v1} # nolint +} +\author{ +Nikos Bosse \email{nikosbosse@gmail.com} +} +\keyword{check-forecasts} From 21625af3b3b83dad40254cbd80e047b883fd92d9 Mon Sep 17 00:00:00 2001 From: nikosbosse Date: Mon, 27 Feb 2023 18:43:37 +0100 Subject: [PATCH 03/14] fix failing tests --- R/convenience-functions.R | 3 +++ man/add_transformation.Rd | 5 +++++ tests/testthat/test-convenience-functions.R | 6 +++--- 3 files changed, 11 insertions(+), 3 deletions(-) diff --git a/R/convenience-functions.R b/R/convenience-functions.R index f38e90b36..28abada7c 100644 --- a/R/convenience-functions.R +++ b/R/convenience-functions.R @@ -5,6 +5,9 @@ #' out the linked reference. #' #' @inheritParams score +#' @param fun A function used to transform both true values and predictions +#' @param label A string for the newly created 'scale' column to denote the +#' newly transformed values. #' @return A data.table that includes the original data as well as a #' transformation of the original data. There will be one additional column, #' 'scale', present which will be set to "natural" for the untransformed diff --git a/man/add_transformation.Rd b/man/add_transformation.Rd index 3952b1377..377d570d0 100644 --- a/man/add_transformation.Rd +++ b/man/add_transformation.Rd @@ -41,6 +41,11 @@ You can check the format of your data using \code{\link[=check_forecasts]{check_ are examples for each format (\link{example_quantile}, \link{example_continuous}, \link{example_integer}, and \link{example_binary}).} +\item{fun}{A function used to transform both true values and predictions} + +\item{label}{A string for the newly created 'scale' column to denote the +newly transformed values.} + \item{...}{additional parameters passed down to \code{\link[=score_quantile]{score_quantile()}} (internal function used for scoring forecasts in a quantile-based format).} } diff --git a/tests/testthat/test-convenience-functions.R b/tests/testthat/test-convenience-functions.R index 428b7e3be..b4716a86b 100644 --- a/tests/testthat/test-convenience-functions.R +++ b/tests/testthat/test-convenience-functions.R @@ -1,12 +1,12 @@ test_that("function add_transformation works", { - prediction <- example_quantile$prediction + predictions <- example_quantile$prediction one <- suppressWarnings(add_transformation(example_quantile)) - expect_equal(one$prediction, c(prediction, log(predictions + 1))) + expect_equal(one$prediction, c(predictions, log(predictions + 1))) two <- suppressWarnings( add_transformation(example_quantile, fun = sqrt, label = "sqrt") ) - expect_equal(two$prediction, c(prediction, sqrt(predictions))) + expect_equal(two$prediction, c(predictions, sqrt(predictions))) }) From b970ca7884953d43cf0cb2e4cd36824dfbb9aeda Mon Sep 17 00:00:00 2001 From: nikosbosse Date: Fri, 3 Mar 2023 16:32:14 +0100 Subject: [PATCH 04/14] function draft that has specific options --- NAMESPACE | 2 +- R/convenience-functions.R | 54 +++++++++++++++---- ...ansformation.Rd => transform_forecasts.Rd} | 14 ++--- 3 files changed, 53 insertions(+), 17 deletions(-) rename man/{add_transformation.Rd => transform_forecasts.Rd} (94%) diff --git a/NAMESPACE b/NAMESPACE index ae01ab5b9..f84c4dd72 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -3,7 +3,6 @@ S3method(print,scoringutils_check) export(abs_error) export(add_coverage) -export(add_transformation) export(ae_median_quantile) export(ae_median_sample) export(avail_forecasts) @@ -46,6 +45,7 @@ export(squared_error) export(summarise_scores) export(summarize_scores) export(theme_scoringutils) +export(transform_forecasts) importFrom(data.table,"%like%") importFrom(data.table,':=') importFrom(data.table,.I) diff --git a/R/convenience-functions.R b/R/convenience-functions.R index 28abada7c..384d2cd64 100644 --- a/R/convenience-functions.R +++ b/R/convenience-functions.R @@ -28,21 +28,57 @@ #' @examples #' add_transformation(example_quantile) -add_transformation <- function(data, - fun = function(x) {log(x + 1)}, - label = "log", - ...) { +transform_forecasts <- function(data, + transform = c("log", "sqrt"), + fun = NULL, + add = TRUE, + label = transform, + ...) { - data <- as.data.table(data) - transformed_data <- data.table::copy(data) + if (is.null(fun)) { + if (transform == "log") { + if (any(data$true_value < 0 | any(data$prediction < 0))) { + stop("Can't apply log transformation, values < 0 present") + } else if (any(data$true_value == 0 | any(data$prediction == 0))) { + fun <- function(x) {log(x + 1)} + message("Detected zeros in the data, using log(x + 1) as transformation instead.") + } else { + fun <- log + } + } - data[, scale := "natural"] - transformed_data[, scale := label] + if (transform == "sqrt") { + if (any(data$true_value < 0 | any(data$prediction < 0))) { + stop("Can't apply sqrt transformation, values < 0 present") + } else { + fun <- sqrt + } + } + } + + # check if a column called "scale" is already present and if so, only + # restrict to transformations of the original data + if ("scale" %in% colnames(data)) { + if (!("natural" %in% data$scale)) { + stop("If a column 'scale' is present, entries with scale =='natural' are needed") + } + transformed_data <- data.table::copy(data[data$scale == "natural", ]) + } else { + transformed_data <- data.table::copy(data) + } transformed_data[, prediction := fun(prediction, ...)] transformed_data[, true_value := fun(true_value, ...)] - out <- rbind(data, transformed_data) + if (add) { + data <- as.data.table(data) + data[, scale := "natural"] + transformed_data[, scale := label] + out <- rbind(data, transformed_data) + return(out) + } + + return(out) } diff --git a/man/add_transformation.Rd b/man/transform_forecasts.Rd similarity index 94% rename from man/add_transformation.Rd rename to man/transform_forecasts.Rd index 377d570d0..5f4ecd813 100644 --- a/man/add_transformation.Rd +++ b/man/transform_forecasts.Rd @@ -1,15 +1,15 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/convenience-functions.R -\name{add_transformation} -\alias{add_transformation} +\name{transform_forecasts} +\alias{transform_forecasts} \title{Add transformations} \usage{ -add_transformation( +transform_forecasts( data, - fun = function(x) { - log(x + 1) - }, - label = "log", + transform = c("log", "sqrt"), + fun = NULL, + add = TRUE, + label = transform, ... ) } From e977438d379a8e7252dc9e4fc80014aee269f693 Mon Sep 17 00:00:00 2001 From: nikosbosse Date: Sun, 5 Mar 2023 14:54:33 +0100 Subject: [PATCH 05/14] update suggested draft for transform_forecasts, add documentation and tests --- R/convenience-functions.R | 97 +++++++++++++++------ man/transform_forecasts.Rd | 53 ++++++++--- tests/testthat/test-convenience-functions.R | 23 +++-- 3 files changed, 123 insertions(+), 50 deletions(-) diff --git a/R/convenience-functions.R b/R/convenience-functions.R index 384d2cd64..f1331f10b 100644 --- a/R/convenience-functions.R +++ b/R/convenience-functions.R @@ -1,14 +1,37 @@ #' @title Add transformations #' #' @description Add transformations of the forecasts and observations for -#' later scoring. For more information on why this might be desirable, check -#' out the linked reference. +#' later scoring. +#' +#' @details There are a few reasons, depending on the circumstances, for +#' why this might be desirable (check out the linked reference for more info). +#' In epidemiology, for example, it may be useful to log-transform incidence +#' counts before evaluating forecasts using scores such as the weighted interval +#' score (WIS) or the continuous ranked probability score (CRPS). +#' Log-transforming forecasts and observations changes the interpretation of +#' the score from a measure of absolute distance between forecast and +#' observation to a score that evaluates a forecast of the exponential growth +#' rate. Another motivation can be to apply a variance-stabilising +#' transformation or to standardise incidence counts by population. +#' +#' Note that if you want to apply a transformation, it is important to transform +#' the forecasts and observations and then apply the score. Applying a +#' transformation after the score risks losing propriety of the proper scoring +#' rule. #' #' @inheritParams score #' @param fun A function used to transform both true values and predictions +#' @param add whether or not to add a transformed version of the data to the +#' currently existing data (default is TRUE). If selected, the data gets +#' transformed and appended to the existing data frame, making it possible to +#' use the outcome directly in `score()`. An additional column, 'scale', gets +#' created that denotes which rows or untransformed ('scale' has the value +#' "natural") and which have been transformed ('scale' has the value passed to +#' the argument `label`). #' @param label A string for the newly created 'scale' column to denote the -#' newly transformed values. -#' @return A data.table that includes the original data as well as a +#' newly transformed values. Only relevant if `add = TRUE`. +#' @return A data.table with either a transformed version of the data, or one +#' with both the untransformed and the transformed data. includes the original data as well as a #' transformation of the original data. There will be one additional column, #' 'scale', present which will be set to "natural" for the untransformed #' forecasts. @@ -26,36 +49,54 @@ #' @keywords check-forecasts #' @examples -#' add_transformation(example_quantile) +#' +#' # replace negative values with zero +#' transformed <- example_quantile |> +#' transform_forecasts(fun = function(x) {pmax(0, x)}, add = FALSE) +#' +#' # add log transformed forecasts (produces a warning as some values are zero) +#' transform_forecasts(transformed) +#' +#' # specifying an offset manually for the log transformation removes the warning +#' transform_forecasts(transformed, offset = 1) + transform_forecasts <- function(data, - transform = c("log", "sqrt"), - fun = NULL, + fun = log, add = TRUE, - label = transform, + label = "log", ...) { - if (is.null(fun)) { - if (transform == "log") { - if (any(data$true_value < 0 | any(data$prediction < 0))) { - stop("Can't apply log transformation, values < 0 present") - } else if (any(data$true_value == 0 | any(data$prediction == 0))) { - fun <- function(x) {log(x + 1)} - message("Detected zeros in the data, using log(x + 1) as transformation instead.") - } else { - fun <- log - } + if (identical(fun, log)) { + if (any(data$true_value < 0, na.rm = TRUE) | + any(data$prediction < 0, na.rm = TRUE)) { + stop("Can't apply log transformation, values < 0 present") } - if (transform == "sqrt") { - if (any(data$true_value < 0 | any(data$prediction < 0))) { - stop("Can't apply sqrt transformation, values < 0 present") - } else { - fun <- sqrt - } + zeros_present <- (any(data$true_value == 0, na.rm = TRUE) | + any(data$prediction == 0, na.rm = TRUE)) + if (!("offset" %in% names(list(...))) && zeros_present) { + offset_default <- 1 + warning( + paste0( + "Detected zeros in the data, using log(x + 1) as transformation instead. ", + "You can specify offset = 1 (or any different offset) to remove this warning." + ) + ) + } else if (!("offset" %in% names(list(...)))) { + offset_default <- 0 } + fun <- function(x, offset = offset_default) {log(x + offset)} } + if (identical(fun, sqrt)) { + if (any(data$true_value < 0, na.rm = TRUE) | + any(data$prediction < 0, na.rm = TRUE)) { + stop("Can't apply sqrt transformation, values < 0 present") + } + } + + # check if a column called "scale" is already present and if so, only # restrict to transformations of the original data if ("scale" %in% colnames(data)) { @@ -75,10 +116,8 @@ transform_forecasts <- function(data, data[, scale := "natural"] transformed_data[, scale := label] out <- rbind(data, transformed_data) - return(out) + return(out[]) } - - - - return(out) + out <- transformed_data + return(out[]) } diff --git a/man/transform_forecasts.Rd b/man/transform_forecasts.Rd index 5f4ecd813..8a5ef21bc 100644 --- a/man/transform_forecasts.Rd +++ b/man/transform_forecasts.Rd @@ -4,14 +4,7 @@ \alias{transform_forecasts} \title{Add transformations} \usage{ -transform_forecasts( - data, - transform = c("log", "sqrt"), - fun = NULL, - add = TRUE, - label = transform, - ... -) +transform_forecasts(data, fun = log, add = TRUE, label = "log", ...) } \arguments{ \item{data}{A data.frame or data.table with the predictions and observations. @@ -43,25 +36,59 @@ are examples for each format (\link{example_quantile}, \link{example_continuous} \item{fun}{A function used to transform both true values and predictions} +\item{add}{whether or not to add a transformed version of the data to the +currently existing data (default is TRUE). If selected, the data gets +transformed and appended to the existing data frame, making it possible to +use the outcome directly in \code{score()}. An additional column, 'scale', gets +created that denotes which rows or untransformed ('scale' has the value +"natural") and which have been transformed ('scale' has the value passed to +the argument \code{label}).} + \item{label}{A string for the newly created 'scale' column to denote the -newly transformed values.} +newly transformed values. Only relevant if \code{add = TRUE}.} \item{...}{additional parameters passed down to \code{\link[=score_quantile]{score_quantile()}} (internal function used for scoring forecasts in a quantile-based format).} } \value{ -A data.table that includes the original data as well as a +A data.table with either a transformed version of the data, or one +with both the untransformed and the transformed data. includes the original data as well as a transformation of the original data. There will be one additional column, 'scale', present which will be set to "natural" for the untransformed forecasts. } \description{ Add transformations of the forecasts and observations for -later scoring. For more information on why this might be desirable, check -out the linked reference. +later scoring. +} +\details{ +There are a few reasons, depending on the circumstances, for +why this might be desirable (check out the linked reference for more info). +In epidemiology, for example, it may be useful to log-transform incidence +counts before evaluating forecasts using scores such as the weighted interval +score (WIS) or the continuous ranked probability score (CRPS). +Log-transforming forecasts and observations changes the interpretation of +the score from a measure of absolute distance between forecast and +observation to a score that evaluates a forecast of the exponential growth +rate. Another motivation can be to apply a variance-stabilising +transformation or to standardise incidence counts by population. + +Note that if you want to apply a transformation, it is important to transform +the forecasts and observations and then apply the score. Applying a +transformation after the score risks losing propriety of the proper scoring +rule. } \examples{ -add_transformation(example_quantile) + +# replace negative values with zero +transformed <- example_quantile |> + transform_forecasts(fun = function(x) {pmax(0, x)}, add = FALSE) + +# add log transformed forecasts (produces a warning as some values are zero) +transform_forecasts(transformed) + +# specifying an offset manually for the log transformation removes the warning +transform_forecasts(transformed, offset = 1) } \references{ Transformation of forecasts for evaluating predictive diff --git a/tests/testthat/test-convenience-functions.R b/tests/testthat/test-convenience-functions.R index b4716a86b..859ece267 100644 --- a/tests/testthat/test-convenience-functions.R +++ b/tests/testthat/test-convenience-functions.R @@ -1,12 +1,19 @@ -test_that("function add_transformation works", { +test_that("function transform_forecasts works", { - predictions <- example_quantile$prediction + predictions_original <- example_quantile$prediction + predictions <- transform_forecasts( + example_quantile, + fun = function(x) pmax(0, x), + add = FALSE + ) - one <- suppressWarnings(add_transformation(example_quantile)) - expect_equal(one$prediction, c(predictions, log(predictions + 1))) + expect_equal(predictions$prediction, pmax(0, predictions_original)) - two <- suppressWarnings( - add_transformation(example_quantile, fun = sqrt, label = "sqrt") - ) - expect_equal(two$prediction, c(predictions, sqrt(predictions))) + one <- transform_forecasts(predictions, offset = 1) + expect_equal(one$prediction, + c(predictions$prediction, log(predictions$prediction + 1))) + + two <- transform_forecasts(predictions, fun = sqrt, label = "sqrt") + expect_equal(two$prediction, + c(predictions$prediction, sqrt(predictions$prediction))) }) From 59c0809e2de1546a0183d149169e2a329643a938 Mon Sep 17 00:00:00 2001 From: nikosbosse Date: Sun, 5 Mar 2023 15:03:54 +0100 Subject: [PATCH 06/14] add example for adding multiple transformations --- R/convenience-functions.R | 6 +++++- man/transform_forecasts.Rd | 5 +++++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/R/convenience-functions.R b/R/convenience-functions.R index f1331f10b..6cc95dcaa 100644 --- a/R/convenience-functions.R +++ b/R/convenience-functions.R @@ -59,7 +59,11 @@ #' #' # specifying an offset manually for the log transformation removes the warning #' transform_forecasts(transformed, offset = 1) - +#' +#' # adding multiple transformations +#' transformed |> +#' transform_forecasts(offset = 1) |> +#' transform_forecasts(fun = sqrt, label = "sqrt") transform_forecasts <- function(data, fun = log, diff --git a/man/transform_forecasts.Rd b/man/transform_forecasts.Rd index 8a5ef21bc..7ddb0d8e6 100644 --- a/man/transform_forecasts.Rd +++ b/man/transform_forecasts.Rd @@ -89,6 +89,11 @@ transform_forecasts(transformed) # specifying an offset manually for the log transformation removes the warning transform_forecasts(transformed, offset = 1) + +# adding multiple transformations +transformed |> + transform_forecasts(offset = 1) |> + transform_forecasts(fun = sqrt, label = "sqrt") } \references{ Transformation of forecasts for evaluating predictive From 149c3543b7309a63c2bc9635ca2f65a8e615d173 Mon Sep 17 00:00:00 2001 From: nikosbosse Date: Sun, 5 Mar 2023 15:36:48 +0100 Subject: [PATCH 07/14] update examples, handle adding multiple transformations --- R/convenience-functions.R | 29 ++++++++++++++++----- man/transform_forecasts.Rd | 10 +++++-- tests/testthat/test-convenience-functions.R | 10 +++++++ 3 files changed, 41 insertions(+), 8 deletions(-) diff --git a/R/convenience-functions.R b/R/convenience-functions.R index 6cc95dcaa..841ee20bc 100644 --- a/R/convenience-functions.R +++ b/R/convenience-functions.R @@ -20,7 +20,11 @@ #' rule. #' #' @inheritParams score -#' @param fun A function used to transform both true values and predictions +#' @param fun A function used to transform both true values and predictions. +#' If `fun = log` it automatically checks internally whether there are any +#' zero values. If so, the function tries to apply `log(x + 1)` instead and +#' gives a warning. You can pass an argument `offset = 1` (or any other value) +#' to the function to avoid the warning. #' @param add whether or not to add a transformed version of the data to the #' currently existing data (default is TRUE). If selected, the data gets #' transformed and appended to the existing data frame, making it possible to @@ -50,8 +54,9 @@ #' @keywords check-forecasts #' @examples #' +#' library(magrittr) #(pipe operator) #' # replace negative values with zero -#' transformed <- example_quantile |> +#' transformed <- example_quantile %>% #' transform_forecasts(fun = function(x) {pmax(0, x)}, add = FALSE) #' #' # add log transformed forecasts (produces a warning as some values are zero) @@ -61,9 +66,11 @@ #' transform_forecasts(transformed, offset = 1) #' #' # adding multiple transformations -#' transformed |> -#' transform_forecasts(offset = 1) |> -#' transform_forecasts(fun = sqrt, label = "sqrt") +#' transformed %>% +#' transform_forecasts(offset = 1) %>% +#' transform_forecasts(fun = sqrt, label = "sqrt") %>% +#' score() %>% +#' summarise_scores(by = c("model", "scale")) transform_forecasts <- function(data, fun = log, @@ -117,7 +124,17 @@ transform_forecasts <- function(data, if (add) { data <- as.data.table(data) - data[, scale := "natural"] + + if (!("scale" %in% colnames(data))) { + data[, scale := "natural"] + } else if (label %in% data$scale) { + warning( + paste0( + "Adding new transformations with label '", label, + "', even though that entry is already present in column 'scale'." + ) + ) + } transformed_data[, scale := label] out <- rbind(data, transformed_data) return(out[]) diff --git a/man/transform_forecasts.Rd b/man/transform_forecasts.Rd index 7ddb0d8e6..c7e545613 100644 --- a/man/transform_forecasts.Rd +++ b/man/transform_forecasts.Rd @@ -34,7 +34,11 @@ You can check the format of your data using \code{\link[=check_forecasts]{check_ are examples for each format (\link{example_quantile}, \link{example_continuous}, \link{example_integer}, and \link{example_binary}).} -\item{fun}{A function used to transform both true values and predictions} +\item{fun}{A function used to transform both true values and predictions. +If \code{fun = log} it automatically checks internally whether there are any +zero values. If so, the function tries to apply \code{log(x + 1)} instead and +gives a warning. You can pass an argument \code{offset = 1} (or any other value) +to the function to avoid the warning.} \item{add}{whether or not to add a transformed version of the data to the currently existing data (default is TRUE). If selected, the data gets @@ -93,7 +97,9 @@ transform_forecasts(transformed, offset = 1) # adding multiple transformations transformed |> transform_forecasts(offset = 1) |> - transform_forecasts(fun = sqrt, label = "sqrt") + transform_forecasts(fun = sqrt, label = "sqrt") |> + score() |> + summarise_scores(by = c("model", "scale")) } \references{ Transformation of forecasts for evaluating predictive diff --git a/tests/testthat/test-convenience-functions.R b/tests/testthat/test-convenience-functions.R index 859ece267..5eba0cbd4 100644 --- a/tests/testthat/test-convenience-functions.R +++ b/tests/testthat/test-convenience-functions.R @@ -16,4 +16,14 @@ test_that("function transform_forecasts works", { two <- transform_forecasts(predictions, fun = sqrt, label = "sqrt") expect_equal(two$prediction, c(predictions$prediction, sqrt(predictions$prediction))) + + + # expect a warning if existing transformation is overwritte + expect_warning( + transform_forecasts(one, fun = sqrt) + ) + + # multiple transformations + three <- transform_forecasts(one, fun = sqrt, label = "sqrt") + expect_equal(unique(three$scale), c("natural", "log", "sqrt")) }) From 0dc87f6718d3ba84405547162c86714d032f505d Mon Sep 17 00:00:00 2001 From: nikosbosse Date: Sun, 5 Mar 2023 16:29:10 +0100 Subject: [PATCH 08/14] update doc --- man/transform_forecasts.Rd | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/man/transform_forecasts.Rd b/man/transform_forecasts.Rd index c7e545613..65ea0ae1f 100644 --- a/man/transform_forecasts.Rd +++ b/man/transform_forecasts.Rd @@ -84,8 +84,9 @@ rule. } \examples{ +library(magrittr) #(pipe operator) # replace negative values with zero -transformed <- example_quantile |> +transformed <- example_quantile \%>\% transform_forecasts(fun = function(x) {pmax(0, x)}, add = FALSE) # add log transformed forecasts (produces a warning as some values are zero) @@ -95,10 +96,10 @@ transform_forecasts(transformed) transform_forecasts(transformed, offset = 1) # adding multiple transformations -transformed |> - transform_forecasts(offset = 1) |> - transform_forecasts(fun = sqrt, label = "sqrt") |> - score() |> +transformed \%>\% + transform_forecasts(offset = 1) \%>\% + transform_forecasts(fun = sqrt, label = "sqrt") \%>\% + score() \%>\% summarise_scores(by = c("model", "scale")) } \references{ From 70d0e30c7ad4fae5d9b1e09058964fa7c7af6d90 Mon Sep 17 00:00:00 2001 From: nikosbosse Date: Mon, 6 Mar 2023 09:10:57 +0100 Subject: [PATCH 09/14] update documentation --- R/convenience-functions.R | 6 +++++- man/transform_forecasts.Rd | 8 +++++--- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/R/convenience-functions.R b/R/convenience-functions.R index 841ee20bc..c001b025c 100644 --- a/R/convenience-functions.R +++ b/R/convenience-functions.R @@ -34,6 +34,10 @@ #' the argument `label`). #' @param label A string for the newly created 'scale' column to denote the #' newly transformed values. Only relevant if `add = TRUE`. +#' @param ... Additional parameters to pass to the function you supplied. If +#' you're using `fun = log`, one parameter added by default is the option to +#' set `offset = 1` (or any other number) to apply the function +#' `log (1 + offset)` instead (useful if you have e.g zero values in the data). #' @return A data.table with either a transformed version of the data, or one #' with both the untransformed and the transformed data. includes the original data as well as a #' transformation of the original data. There will be one additional column, @@ -60,7 +64,7 @@ #' transform_forecasts(fun = function(x) {pmax(0, x)}, add = FALSE) #' #' # add log transformed forecasts (produces a warning as some values are zero) -#' transform_forecasts(transformed) +#' transform_forecasts(transformed, fun = log) #' #' # specifying an offset manually for the log transformation removes the warning #' transform_forecasts(transformed, offset = 1) diff --git a/man/transform_forecasts.Rd b/man/transform_forecasts.Rd index 65ea0ae1f..fa34d0c06 100644 --- a/man/transform_forecasts.Rd +++ b/man/transform_forecasts.Rd @@ -51,8 +51,10 @@ the argument \code{label}).} \item{label}{A string for the newly created 'scale' column to denote the newly transformed values. Only relevant if \code{add = TRUE}.} -\item{...}{additional parameters passed down to \code{\link[=score_quantile]{score_quantile()}} (internal -function used for scoring forecasts in a quantile-based format).} +\item{...}{Additional parameters to pass to the function you supplied. If +you're using \code{fun = log}, one parameter added by default is the option to +set \code{offset = 1} (or any other number) to apply the function +\code{log (1 + offset)} instead (useful if you have e.g zero values in the data).} } \value{ A data.table with either a transformed version of the data, or one @@ -90,7 +92,7 @@ transformed <- example_quantile \%>\% transform_forecasts(fun = function(x) {pmax(0, x)}, add = FALSE) # add log transformed forecasts (produces a warning as some values are zero) -transform_forecasts(transformed) +transform_forecasts(transformed, fun = log) # specifying an offset manually for the log transformation removes the warning transform_forecasts(transformed, offset = 1) From 6dba7e396afe3703eea7f394f647f884cedd890c Mon Sep 17 00:00:00 2001 From: nikosbosse Date: Fri, 24 Mar 2023 18:13:54 +0100 Subject: [PATCH 10/14] add a log_shift() function that can be called from transform_forecasts() --- DESCRIPTION | 2 +- NAMESPACE | 1 + NEWS.md | 8 ++ R/convenience-functions.R | 128 +++++++++++++------- man/log_shift.Rd | 52 ++++++++ man/transform_forecasts.Rd | 27 ++--- tests/testthat/test-convenience-functions.R | 2 +- 7 files changed, 162 insertions(+), 58 deletions(-) create mode 100644 man/log_shift.Rd diff --git a/DESCRIPTION b/DESCRIPTION index 85b588f2b..d5a9adad9 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: scoringutils Title: Utilities for Scoring and Assessing Predictions -Version: 1.1.1 +Version: 1.1.2 Language: en-GB Authors@R: c( person(given = "Nikos", diff --git a/NAMESPACE b/NAMESPACE index f84c4dd72..68eb44f95 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -17,6 +17,7 @@ export(crps_sample) export(dss_sample) export(find_duplicates) export(interval_score) +export(log_shift) export(logs_binary) export(logs_sample) export(mad_sample) diff --git a/NEWS.md b/NEWS.md index 79ab29fd9..1635621ca 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,11 @@ +# scoringutils 1.1.2 + +## Feature updates + +- added a new function, `transform_forecasts()` to make it easy to transform forecasts before scoring them, as suggested in Bosse et al. (2023), https://www.medrxiv.org/content/10.1101/2023.01.23.23284722v1. +- added another function, `log_shift()` that implements the default transformation function. The function allows to truncate negative values and add an offset before applying the logarithm. + + # scoringutils 1.1.1 - added a small change to `interval_score()` which explicitly converts the logical vector to a numeric one. This should happen implicitly anyway, but is now done explicitly in order to avoid issues that may come up if the input vector has a type that doesn't allow the implict conversion. diff --git a/R/convenience-functions.R b/R/convenience-functions.R index c001b025c..ab196e718 100644 --- a/R/convenience-functions.R +++ b/R/convenience-functions.R @@ -1,7 +1,6 @@ -#' @title Add transformations +#' @title Transform forecasts and observed values #' -#' @description Add transformations of the forecasts and observations for -#' later scoring. +#' @description Function to transform forecasts and true values before scoring. #' #' @details There are a few reasons, depending on the circumstances, for #' why this might be desirable (check out the linked reference for more info). @@ -25,15 +24,15 @@ #' zero values. If so, the function tries to apply `log(x + 1)` instead and #' gives a warning. You can pass an argument `offset = 1` (or any other value) #' to the function to avoid the warning. -#' @param add whether or not to add a transformed version of the data to the -#' currently existing data (default is TRUE). If selected, the data gets +#' @param append whether or not to append a transformed version of the data to +#' the currently existing data (default is TRUE). If selected, the data gets #' transformed and appended to the existing data frame, making it possible to #' use the outcome directly in `score()`. An additional column, 'scale', gets #' created that denotes which rows or untransformed ('scale' has the value #' "natural") and which have been transformed ('scale' has the value passed to #' the argument `label`). #' @param label A string for the newly created 'scale' column to denote the -#' newly transformed values. Only relevant if `add = TRUE`. +#' newly transformed values. Only relevant if `append = TRUE`. #' @param ... Additional parameters to pass to the function you supplied. If #' you're using `fun = log`, one parameter added by default is the option to #' set `offset = 1` (or any other number) to apply the function @@ -58,52 +57,28 @@ #' @keywords check-forecasts #' @examples #' -#' library(magrittr) #(pipe operator) -#' # replace negative values with zero -#' transformed <- example_quantile %>% -#' transform_forecasts(fun = function(x) {pmax(0, x)}, add = FALSE) -#' #' # add log transformed forecasts (produces a warning as some values are zero) -#' transform_forecasts(transformed, fun = log) +#' transform_forecasts(example_quantile, truncate = TRUE, append = FALSE) #' #' # specifying an offset manually for the log transformation removes the warning -#' transform_forecasts(transformed, offset = 1) +#' transform_forecasts(example_quantile, truncate = TRUE, offset = 1) #' #' # adding multiple transformations -#' transformed %>% +#' library(magrittr) # pipe operator +#' example_quantile %>% +#' # truncate all negative values for both log and sqrt +#' transform_forecasts(fun = function(x) pmax(0, x), append = FALSE) %>% #' transform_forecasts(offset = 1) %>% #' transform_forecasts(fun = sqrt, label = "sqrt") %>% #' score() %>% #' summarise_scores(by = c("model", "scale")) transform_forecasts <- function(data, - fun = log, - add = TRUE, + fun = log_shift, + append = TRUE, label = "log", ...) { - if (identical(fun, log)) { - if (any(data$true_value < 0, na.rm = TRUE) | - any(data$prediction < 0, na.rm = TRUE)) { - stop("Can't apply log transformation, values < 0 present") - } - - zeros_present <- (any(data$true_value == 0, na.rm = TRUE) | - any(data$prediction == 0, na.rm = TRUE)) - if (!("offset" %in% names(list(...))) && zeros_present) { - offset_default <- 1 - warning( - paste0( - "Detected zeros in the data, using log(x + 1) as transformation instead. ", - "You can specify offset = 1 (or any different offset) to remove this warning." - ) - ) - } else if (!("offset" %in% names(list(...)))) { - offset_default <- 0 - } - fun <- function(x, offset = offset_default) {log(x + offset)} - } - if (identical(fun, sqrt)) { if (any(data$true_value < 0, na.rm = TRUE) | any(data$prediction < 0, na.rm = TRUE)) { @@ -111,7 +86,6 @@ transform_forecasts <- function(data, } } - # check if a column called "scale" is already present and if so, only # restrict to transformations of the original data if ("scale" %in% colnames(data)) { @@ -126,7 +100,7 @@ transform_forecasts <- function(data, transformed_data[, prediction := fun(prediction, ...)] transformed_data[, true_value := fun(true_value, ...)] - if (add) { + if (append) { data <- as.data.table(data) if (!("scale" %in% colnames(data))) { @@ -134,7 +108,7 @@ transform_forecasts <- function(data, } else if (label %in% data$scale) { warning( paste0( - "Adding new transformations with label '", label, + "Appending new transformations with label '", label, "', even though that entry is already present in column 'scale'." ) ) @@ -146,3 +120,75 @@ transform_forecasts <- function(data, out <- transformed_data return(out[]) } + + + + + + +#' @title Log transformation with an additive shift +#' +#' @description Function that shifts a value by some offset and then applies the +#' natural logarithm to it. +#' +#' @details The output is computed as log(x + offset). +#' +#' @param x vector of input values to be transformed +#' @param offset number to add to the input value before taking the natural +#' logarithm +#' @param truncate whether to truncate negative values to 0. Default is FALSE. +#' @param base a positive or complex number: the base with respect to which +#' logarithms are computed. Defaults to e = exp(1). +#' @return A numeric vector with transformed values +#' @export +#' @references Transformation of forecasts for evaluating predictive +#' performance in an epidemiological context +#' Nikos I. Bosse, Sam Abbott, Anne Cori, Edwin van Leeuwen, Johannes Bracher, +#' Sebastian Funk +#' medRxiv 2023.01.23.23284722 +#' \doi{https://doi.org/10.1101/2023.01.23.23284722} +#' # nolint + +#' @keywords check-forecasts +#' @examples +#' +#' log_shift(1:10) +#' log_shift(0:9, offset = 1) +#' log_shift(-1:9, offset = 1, truncate = TRUE) +#' +#' transform_forecasts( +#' example_quantile, +#' fun = log_shift, +#' offset = 1, +#' truncate = TRUE +#' ) + +log_shift <- function( + x, + offset = 0, + truncate = FALSE, + base = exp(1) +) { + + if (truncate) { + x <- pmax(0, x) + } + + if (any (x < 0, na.rm = TRUE)) { + w <- paste( + "Detected input values < 0.", + "Try truncating negative values (use truncate = TRUE)" + ) + warning(w) + } + + if (any(x == 0, na.rm = TRUE) && offset == 0) { + w <- paste0( + "Detected zeros in input values.", + "Try specifying offset = 1 (or any other offset)." + ) + warning(w) + } + log(x + offset, base = base) +} + diff --git a/man/log_shift.Rd b/man/log_shift.Rd new file mode 100644 index 000000000..24b4a9d98 --- /dev/null +++ b/man/log_shift.Rd @@ -0,0 +1,52 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/convenience-functions.R +\name{log_shift} +\alias{log_shift} +\title{Log transformation with an additive shift} +\usage{ +log_shift(x, offset = 0, truncate = FALSE, base = exp(1)) +} +\arguments{ +\item{x}{vector of input values to be transformed} + +\item{offset}{number to add to the input value before taking the natural +logarithm} + +\item{truncate}{whether to truncate negative values to 0. Default is FALSE.} + +\item{base}{a positive or complex number: the base with respect to which +logarithms are computed. Defaults to e = exp(1).} +} +\value{ +A numeric vector with transformed values +} +\description{ +Function that shifts a value by some offset and then applies the +natural logarithm to it. +} +\details{ +The output is computed as log(x + offset). +} +\examples{ + +log_shift(1:10) +log_shift(0:9, offset = 1) +log_shift(-1:9, offset = 1, truncate = TRUE) + +transform_forecasts( + example_quantile, + fun = log_shift, + offset = 1, + truncate = TRUE + ) +} +\references{ +Transformation of forecasts for evaluating predictive +performance in an epidemiological context +Nikos I. Bosse, Sam Abbott, Anne Cori, Edwin van Leeuwen, Johannes Bracher, +Sebastian Funk +medRxiv 2023.01.23.23284722 +\doi{https://doi.org/10.1101/2023.01.23.23284722} +\url{https://www.medrxiv.org/content/10.1101/2023.01.23.23284722v1} # nolint +} +\keyword{check-forecasts} diff --git a/man/transform_forecasts.Rd b/man/transform_forecasts.Rd index fa34d0c06..5193e090c 100644 --- a/man/transform_forecasts.Rd +++ b/man/transform_forecasts.Rd @@ -2,9 +2,9 @@ % Please edit documentation in R/convenience-functions.R \name{transform_forecasts} \alias{transform_forecasts} -\title{Add transformations} +\title{Transform forecasts and observed values} \usage{ -transform_forecasts(data, fun = log, add = TRUE, label = "log", ...) +transform_forecasts(data, fun = log_shift, append = TRUE, label = "log", ...) } \arguments{ \item{data}{A data.frame or data.table with the predictions and observations. @@ -40,8 +40,8 @@ zero values. If so, the function tries to apply \code{log(x + 1)} instead and gives a warning. You can pass an argument \code{offset = 1} (or any other value) to the function to avoid the warning.} -\item{add}{whether or not to add a transformed version of the data to the -currently existing data (default is TRUE). If selected, the data gets +\item{append}{whether or not to append a transformed version of the data to +the currently existing data (default is TRUE). If selected, the data gets transformed and appended to the existing data frame, making it possible to use the outcome directly in \code{score()}. An additional column, 'scale', gets created that denotes which rows or untransformed ('scale' has the value @@ -49,7 +49,7 @@ created that denotes which rows or untransformed ('scale' has the value the argument \code{label}).} \item{label}{A string for the newly created 'scale' column to denote the -newly transformed values. Only relevant if \code{add = TRUE}.} +newly transformed values. Only relevant if \code{append = TRUE}.} \item{...}{Additional parameters to pass to the function you supplied. If you're using \code{fun = log}, one parameter added by default is the option to @@ -64,8 +64,7 @@ transformation of the original data. There will be one additional column, forecasts. } \description{ -Add transformations of the forecasts and observations for -later scoring. +Function to transform forecasts and true values before scoring. } \details{ There are a few reasons, depending on the circumstances, for @@ -86,19 +85,17 @@ rule. } \examples{ -library(magrittr) #(pipe operator) -# replace negative values with zero -transformed <- example_quantile \%>\% - transform_forecasts(fun = function(x) {pmax(0, x)}, add = FALSE) - # add log transformed forecasts (produces a warning as some values are zero) -transform_forecasts(transformed, fun = log) +transform_forecasts(example_quantile, truncate = TRUE, append = FALSE) # specifying an offset manually for the log transformation removes the warning -transform_forecasts(transformed, offset = 1) +transform_forecasts(example_quantile, truncate = TRUE, offset = 1) # adding multiple transformations -transformed \%>\% +library(magrittr) # pipe operator +example_quantile \%>\% + # truncate all negative values for both log and sqrt + transform_forecasts(fun = function(x) pmax(0, x), append = FALSE) \%>\% transform_forecasts(offset = 1) \%>\% transform_forecasts(fun = sqrt, label = "sqrt") \%>\% score() \%>\% diff --git a/tests/testthat/test-convenience-functions.R b/tests/testthat/test-convenience-functions.R index 5eba0cbd4..ded7ad398 100644 --- a/tests/testthat/test-convenience-functions.R +++ b/tests/testthat/test-convenience-functions.R @@ -4,7 +4,7 @@ test_that("function transform_forecasts works", { predictions <- transform_forecasts( example_quantile, fun = function(x) pmax(0, x), - add = FALSE + append = FALSE ) expect_equal(predictions$prediction, pmax(0, predictions_original)) From 846d1e929ffdc60076e1f9e27f3e569b76091cc6 Mon Sep 17 00:00:00 2001 From: nikosbosse Date: Fri, 24 Mar 2023 23:58:58 +0100 Subject: [PATCH 11/14] update code to enable transforming existing natural scale values without adding anything --- R/convenience-functions.R | 88 +++++++++++---------- man/log_shift.Rd | 3 +- man/transform_forecasts.Rd | 21 +++-- tests/testthat/test-convenience-functions.R | 11 ++- 4 files changed, 67 insertions(+), 56 deletions(-) diff --git a/R/convenience-functions.R b/R/convenience-functions.R index ab196e718..8e3ab248c 100644 --- a/R/convenience-functions.R +++ b/R/convenience-functions.R @@ -20,23 +20,20 @@ #' #' @inheritParams score #' @param fun A function used to transform both true values and predictions. -#' If `fun = log` it automatically checks internally whether there are any -#' zero values. If so, the function tries to apply `log(x + 1)` instead and -#' gives a warning. You can pass an argument `offset = 1` (or any other value) -#' to the function to avoid the warning. +#' The default function is [log_shift()], a custom function that is essentially +#' the same as [log()], but has two additional arguments (`offset` and +#' `truncate`) that allow you to truncate negative values to zero and add an +#' offset before applying the logarithm. #' @param append whether or not to append a transformed version of the data to #' the currently existing data (default is TRUE). If selected, the data gets #' transformed and appended to the existing data frame, making it possible to -#' use the outcome directly in `score()`. An additional column, 'scale', gets +#' use the outcome directly in [score()]. An additional column, 'scale', gets #' created that denotes which rows or untransformed ('scale' has the value #' "natural") and which have been transformed ('scale' has the value passed to #' the argument `label`). #' @param label A string for the newly created 'scale' column to denote the #' newly transformed values. Only relevant if `append = TRUE`. -#' @param ... Additional parameters to pass to the function you supplied. If -#' you're using `fun = log`, one parameter added by default is the option to -#' set `offset = 1` (or any other number) to apply the function -#' `log (1 + offset)` instead (useful if you have e.g zero values in the data). +#' @param ... Additional parameters to pass to the function you supplied. #' @return A data.table with either a transformed version of the data, or one #' with both the untransformed and the transformed data. includes the original data as well as a #' transformation of the original data. There will be one additional column, @@ -66,12 +63,14 @@ #' # adding multiple transformations #' library(magrittr) # pipe operator #' example_quantile %>% -#' # truncate all negative values for both log and sqrt +#' transform_forecasts(offset = 1, truncate = TRUE) %>% +#' # manually truncate all negative values before applying sqrt #' transform_forecasts(fun = function(x) pmax(0, x), append = FALSE) %>% -#' transform_forecasts(offset = 1) %>% #' transform_forecasts(fun = sqrt, label = "sqrt") %>% #' score() %>% #' summarise_scores(by = c("model", "scale")) +#' +#' transform_forecasts <- function(data, fun = log_shift, @@ -79,46 +78,48 @@ transform_forecasts <- function(data, label = "log", ...) { - if (identical(fun, sqrt)) { - if (any(data$true_value < 0, na.rm = TRUE) | - any(data$prediction < 0, na.rm = TRUE)) { - stop("Can't apply sqrt transformation, values < 0 present") - } - } + original_data <- as.data.table(data) + scale_col_present <- ("scale" %in% colnames(original_data)) - # check if a column called "scale" is already present and if so, only - # restrict to transformations of the original data - if ("scale" %in% colnames(data)) { - if (!("natural" %in% data$scale)) { - stop("If a column 'scale' is present, entries with scale =='natural' are needed") + # Error handling + if (scale_col_present) { + if (!("natural" %in% original_data$scale)) { + stop("If a column 'scale' is present, entries with scale =='natural' are required for the transformation") + } + if (append && (label %in% original_data$scale)) { + w <- paste0( + "Appending new transformations with label '", label, + "', even though that entry is already present in column 'scale'." + ) + warning(w) } - transformed_data <- data.table::copy(data[data$scale == "natural", ]) - } else { - transformed_data <- data.table::copy(data) } - transformed_data[, prediction := fun(prediction, ...)] - transformed_data[, true_value := fun(true_value, ...)] - if (append) { - data <- as.data.table(data) - - if (!("scale" %in% colnames(data))) { - data[, scale := "natural"] - } else if (label %in% data$scale) { - warning( - paste0( - "Appending new transformations with label '", label, - "', even though that entry is already present in column 'scale'." - ) - ) + if (scale_col_present) { + transformed_data <- data.table::copy(original_data)[scale == "natural"] + } else { + transformed_data <- data.table::copy(original_data) + original_data[, scale := "natural"] } + transformed_data[, prediction := fun(prediction, ...)] + transformed_data[, true_value := fun(true_value, ...)] transformed_data[, scale := label] - out <- rbind(data, transformed_data) + out <- rbind(original_data, transformed_data) return(out[]) } - out <- transformed_data - return(out[]) + + # check if a column called "scale" is already present and if so, only + # restrict to transformations of the original data + if (scale_col_present) { + original_data[scale == "natural", prediction := fun(prediction, ...)] + original_data[scale == "natural", true_value := fun(true_value, ...)] + original_data[scale == "natural", scale := label] + } else { + original_data[, prediction := fun(prediction, ...)] + original_data[, true_value := fun(true_value, ...)] + } + return(original_data[]) } @@ -131,7 +132,8 @@ transform_forecasts <- function(data, #' @description Function that shifts a value by some offset and then applies the #' natural logarithm to it. #' -#' @details The output is computed as log(x + offset). +#' @details The output is computed as log(x + offset) (or +#' log(pmax(0, x) + offset)) if `truncate = TRUE`. #' #' @param x vector of input values to be transformed #' @param offset number to add to the input value before taking the natural diff --git a/man/log_shift.Rd b/man/log_shift.Rd index 24b4a9d98..29543c7ba 100644 --- a/man/log_shift.Rd +++ b/man/log_shift.Rd @@ -25,7 +25,8 @@ Function that shifts a value by some offset and then applies the natural logarithm to it. } \details{ -The output is computed as log(x + offset). +The output is computed as log(x + offset) (or +log(pmax(0, x) + offset)) if \code{truncate = TRUE}. } \examples{ diff --git a/man/transform_forecasts.Rd b/man/transform_forecasts.Rd index 5193e090c..51328e6fd 100644 --- a/man/transform_forecasts.Rd +++ b/man/transform_forecasts.Rd @@ -35,15 +35,15 @@ are examples for each format (\link{example_quantile}, \link{example_continuous} \link{example_integer}, and \link{example_binary}).} \item{fun}{A function used to transform both true values and predictions. -If \code{fun = log} it automatically checks internally whether there are any -zero values. If so, the function tries to apply \code{log(x + 1)} instead and -gives a warning. You can pass an argument \code{offset = 1} (or any other value) -to the function to avoid the warning.} +The default function is \code{\link[=log_shift]{log_shift()}}, a custom function that is essentially +the same as \code{\link[=log]{log()}}, but has two additional arguments (\code{offset} and +\code{truncate}) that allow you to truncate negative values to zero and add an +offset before applying the logarithm.} \item{append}{whether or not to append a transformed version of the data to the currently existing data (default is TRUE). If selected, the data gets transformed and appended to the existing data frame, making it possible to -use the outcome directly in \code{score()}. An additional column, 'scale', gets +use the outcome directly in \code{\link[=score]{score()}}. An additional column, 'scale', gets created that denotes which rows or untransformed ('scale' has the value "natural") and which have been transformed ('scale' has the value passed to the argument \code{label}).} @@ -51,10 +51,7 @@ the argument \code{label}).} \item{label}{A string for the newly created 'scale' column to denote the newly transformed values. Only relevant if \code{append = TRUE}.} -\item{...}{Additional parameters to pass to the function you supplied. If -you're using \code{fun = log}, one parameter added by default is the option to -set \code{offset = 1} (or any other number) to apply the function -\code{log (1 + offset)} instead (useful if you have e.g zero values in the data).} +\item{...}{Additional parameters to pass to the function you supplied.} } \value{ A data.table with either a transformed version of the data, or one @@ -94,12 +91,14 @@ transform_forecasts(example_quantile, truncate = TRUE, offset = 1) # adding multiple transformations library(magrittr) # pipe operator example_quantile \%>\% - # truncate all negative values for both log and sqrt + transform_forecasts(offset = 1, truncate = TRUE) \%>\% + # manually truncate all negative values before applying sqrt transform_forecasts(fun = function(x) pmax(0, x), append = FALSE) \%>\% - transform_forecasts(offset = 1) \%>\% transform_forecasts(fun = sqrt, label = "sqrt") \%>\% score() \%>\% summarise_scores(by = c("model", "scale")) + + } \references{ Transformation of forecasts for evaluating predictive diff --git a/tests/testthat/test-convenience-functions.R b/tests/testthat/test-convenience-functions.R index ded7ad398..b4bdecc58 100644 --- a/tests/testthat/test-convenience-functions.R +++ b/tests/testthat/test-convenience-functions.R @@ -18,7 +18,7 @@ test_that("function transform_forecasts works", { c(predictions$prediction, sqrt(predictions$prediction))) - # expect a warning if existing transformation is overwritte + # expect a warning if existing transformation is overwritten expect_warning( transform_forecasts(one, fun = sqrt) ) @@ -26,4 +26,13 @@ test_that("function transform_forecasts works", { # multiple transformations three <- transform_forecasts(one, fun = sqrt, label = "sqrt") expect_equal(unique(three$scale), c("natural", "log", "sqrt")) + + # multiple transformations without append + four <- transform_forecasts(two, fun = log_shift, offset = 1, append = FALSE) + compare <- c( + one$prediction[one$scale == "log"], + three$prediction[three$scale == "sqrt"] + ) + + expect_equal(four$prediction, compare) }) From a4f2037ba7bd71f5ca64dfb6b581361cf1894ec2 Mon Sep 17 00:00:00 2001 From: nikosbosse Date: Sat, 25 Mar 2023 00:22:09 +0100 Subject: [PATCH 12/14] fix failing example --- R/convenience-functions.R | 4 +--- man/transform_forecasts.Rd | 4 +--- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/R/convenience-functions.R b/R/convenience-functions.R index 8e3ab248c..5bc682379 100644 --- a/R/convenience-functions.R +++ b/R/convenience-functions.R @@ -65,12 +65,10 @@ #' example_quantile %>% #' transform_forecasts(offset = 1, truncate = TRUE) %>% #' # manually truncate all negative values before applying sqrt -#' transform_forecasts(fun = function(x) pmax(0, x), append = FALSE) %>% +#' transform_forecasts(fun = function(x) pmax(0, x), append = FALSE, label = "natural") %>% #' transform_forecasts(fun = sqrt, label = "sqrt") %>% #' score() %>% #' summarise_scores(by = c("model", "scale")) -#' -#' transform_forecasts <- function(data, fun = log_shift, diff --git a/man/transform_forecasts.Rd b/man/transform_forecasts.Rd index 51328e6fd..eec88c21a 100644 --- a/man/transform_forecasts.Rd +++ b/man/transform_forecasts.Rd @@ -93,12 +93,10 @@ library(magrittr) # pipe operator example_quantile \%>\% transform_forecasts(offset = 1, truncate = TRUE) \%>\% # manually truncate all negative values before applying sqrt - transform_forecasts(fun = function(x) pmax(0, x), append = FALSE) \%>\% + transform_forecasts(fun = function(x) pmax(0, x), append = FALSE, label = "natural") \%>\% transform_forecasts(fun = sqrt, label = "sqrt") \%>\% score() \%>\% summarise_scores(by = c("model", "scale")) - - } \references{ Transformation of forecasts for evaluating predictive From fd3f55deabe6680e9b03b4d8bafff65af07573b1 Mon Sep 17 00:00:00 2001 From: nikosbosse Date: Thu, 30 Mar 2023 23:36:09 +0200 Subject: [PATCH 13/14] update documentation, rename fucnction argument and lint code --- R/convenience-functions.R | 63 +++++++++++++++++++------------------- man/log_shift.Rd | 11 ++++--- man/transform_forecasts.Rd | 17 ++++++---- 3 files changed, 49 insertions(+), 42 deletions(-) diff --git a/R/convenience-functions.R b/R/convenience-functions.R index 5bc682379..6f9286c52 100644 --- a/R/convenience-functions.R +++ b/R/convenience-functions.R @@ -22,7 +22,7 @@ #' @param fun A function used to transform both true values and predictions. #' The default function is [log_shift()], a custom function that is essentially #' the same as [log()], but has two additional arguments (`offset` and -#' `truncate`) that allow you to truncate negative values to zero and add an +#' `negative_to_zero`) that allow you to truncate negative values to zero and add an #' offset before applying the logarithm. #' @param append whether or not to append a transformed version of the data to #' the currently existing data (default is TRUE). If selected, the data gets @@ -55,17 +55,22 @@ #' @examples #' #' # add log transformed forecasts (produces a warning as some values are zero) -#' transform_forecasts(example_quantile, truncate = TRUE, append = FALSE) +#' transform_forecasts(example_quantile, negative_to_zero = TRUE, append = FALSE) #' #' # specifying an offset manually for the log transformation removes the warning -#' transform_forecasts(example_quantile, truncate = TRUE, offset = 1) +#' transform_forecasts(example_quantile, negative_to_zero = TRUE, offset = 1) +#' +#' # truncating forecasts manually before sqrt +#' library(magrittr) # pipe operator +#' example_quantile %>% +#' transform_forecasts(fun = function(x) pmax(0, x), append = FALSE) %>% +#' transform_forecasts(fun = sqrt, label = "sqrt") #' #' # adding multiple transformations #' library(magrittr) # pipe operator #' example_quantile %>% -#' transform_forecasts(offset = 1, truncate = TRUE) %>% -#' # manually truncate all negative values before applying sqrt -#' transform_forecasts(fun = function(x) pmax(0, x), append = FALSE, label = "natural") %>% +#' transform_forecasts(fun = function(x) pmax(0, x), append = FALSE) %>% +#' transform_forecasts(offset = 1) %>% #' transform_forecasts(fun = sqrt, label = "sqrt") %>% #' score() %>% #' summarise_scores(by = c("model", "scale")) @@ -75,18 +80,20 @@ transform_forecasts <- function(data, append = TRUE, label = "log", ...) { - original_data <- as.data.table(data) scale_col_present <- ("scale" %in% colnames(original_data)) # Error handling if (scale_col_present) { if (!("natural" %in% original_data$scale)) { - stop("If a column 'scale' is present, entries with scale =='natural' are required for the transformation") + stop( + "If a column 'scale' is present, entries with scale =='natural' are required for the transformation" + ) } if (append && (label %in% original_data$scale)) { w <- paste0( - "Appending new transformations with label '", label, + "Appending new transformations with label '", + label, "', even though that entry is already present in column 'scale'." ) warning(w) @@ -95,7 +102,8 @@ transform_forecasts <- function(data, if (append) { if (scale_col_present) { - transformed_data <- data.table::copy(original_data)[scale == "natural"] + transformed_data <- + data.table::copy(original_data)[scale == "natural"] } else { transformed_data <- data.table::copy(original_data) original_data[, scale := "natural"] @@ -131,12 +139,13 @@ transform_forecasts <- function(data, #' natural logarithm to it. #' #' @details The output is computed as log(x + offset) (or -#' log(pmax(0, x) + offset)) if `truncate = TRUE`. +#' log(pmax(0, x) + offset)) if `negative_to_zero = TRUE`. #' #' @param x vector of input values to be transformed #' @param offset number to add to the input value before taking the natural #' logarithm -#' @param truncate whether to truncate negative values to 0. Default is FALSE. +#' @param negative_to_zero whether or not to replace all negative values with +#' zero before applying the log transformation. Default is FALSE. #' @param base a positive or complex number: the base with respect to which #' logarithms are computed. Defaults to e = exp(1). #' @return A numeric vector with transformed values @@ -154,41 +163,33 @@ transform_forecasts <- function(data, #' #' log_shift(1:10) #' log_shift(0:9, offset = 1) -#' log_shift(-1:9, offset = 1, truncate = TRUE) +#' log_shift(-1:9, offset = 1, negative_to_zero = TRUE) #' #' transform_forecasts( #' example_quantile, #' fun = log_shift, #' offset = 1, -#' truncate = TRUE +#' negative_to_zero = TRUE #' ) -log_shift <- function( - x, - offset = 0, - truncate = FALSE, - base = exp(1) -) { - - if (truncate) { +log_shift <- function(x, + offset = 0, + negative_to_zero = FALSE, + base = exp(1)) { + if (negative_to_zero) { x <- pmax(0, x) } if (any (x < 0, na.rm = TRUE)) { - w <- paste( - "Detected input values < 0.", - "Try truncating negative values (use truncate = TRUE)" - ) + w <- paste("Detected input values < 0.", + "Try truncating negative values (use negative_to_zero = TRUE)") warning(w) } if (any(x == 0, na.rm = TRUE) && offset == 0) { - w <- paste0( - "Detected zeros in input values.", - "Try specifying offset = 1 (or any other offset)." - ) + w <- paste0("Detected zeros in input values.", + "Try specifying offset = 1 (or any other offset).") warning(w) } log(x + offset, base = base) } - diff --git a/man/log_shift.Rd b/man/log_shift.Rd index 29543c7ba..78ea80567 100644 --- a/man/log_shift.Rd +++ b/man/log_shift.Rd @@ -4,7 +4,7 @@ \alias{log_shift} \title{Log transformation with an additive shift} \usage{ -log_shift(x, offset = 0, truncate = FALSE, base = exp(1)) +log_shift(x, offset = 0, negative_to_zero = FALSE, base = exp(1)) } \arguments{ \item{x}{vector of input values to be transformed} @@ -12,7 +12,8 @@ log_shift(x, offset = 0, truncate = FALSE, base = exp(1)) \item{offset}{number to add to the input value before taking the natural logarithm} -\item{truncate}{whether to truncate negative values to 0. Default is FALSE.} +\item{negative_to_zero}{whether or not to replace all negative values with +zero before applying the log transformation. Default is FALSE.} \item{base}{a positive or complex number: the base with respect to which logarithms are computed. Defaults to e = exp(1).} @@ -26,19 +27,19 @@ natural logarithm to it. } \details{ The output is computed as log(x + offset) (or -log(pmax(0, x) + offset)) if \code{truncate = TRUE}. +log(pmax(0, x) + offset)) if \code{negative_to_zero = TRUE}. } \examples{ log_shift(1:10) log_shift(0:9, offset = 1) -log_shift(-1:9, offset = 1, truncate = TRUE) +log_shift(-1:9, offset = 1, negative_to_zero = TRUE) transform_forecasts( example_quantile, fun = log_shift, offset = 1, - truncate = TRUE + negative_to_zero = TRUE ) } \references{ diff --git a/man/transform_forecasts.Rd b/man/transform_forecasts.Rd index eec88c21a..39a89c4ef 100644 --- a/man/transform_forecasts.Rd +++ b/man/transform_forecasts.Rd @@ -37,7 +37,7 @@ are examples for each format (\link{example_quantile}, \link{example_continuous} \item{fun}{A function used to transform both true values and predictions. The default function is \code{\link[=log_shift]{log_shift()}}, a custom function that is essentially the same as \code{\link[=log]{log()}}, but has two additional arguments (\code{offset} and -\code{truncate}) that allow you to truncate negative values to zero and add an +\code{negative_to_zero}) that allow you to truncate negative values to zero and add an offset before applying the logarithm.} \item{append}{whether or not to append a transformed version of the data to @@ -83,17 +83,22 @@ rule. \examples{ # add log transformed forecasts (produces a warning as some values are zero) -transform_forecasts(example_quantile, truncate = TRUE, append = FALSE) +transform_forecasts(example_quantile, negative_to_zero = TRUE, append = FALSE) # specifying an offset manually for the log transformation removes the warning -transform_forecasts(example_quantile, truncate = TRUE, offset = 1) +transform_forecasts(example_quantile, negative_to_zero = TRUE, offset = 1) + +# truncating forecasts manually before sqrt +library(magrittr) # pipe operator +example_quantile \%>\% + transform_forecasts(fun = function(x) pmax(0, x), append = FALSE) \%>\% + transform_forecasts(fun = sqrt, label = "sqrt") # adding multiple transformations library(magrittr) # pipe operator example_quantile \%>\% - transform_forecasts(offset = 1, truncate = TRUE) \%>\% - # manually truncate all negative values before applying sqrt - transform_forecasts(fun = function(x) pmax(0, x), append = FALSE, label = "natural") \%>\% + transform_forecasts(fun = function(x) pmax(0, x), append = FALSE) \%>\% + transform_forecasts(offset = 1) \%>\% transform_forecasts(fun = sqrt, label = "sqrt") \%>\% score() \%>\% summarise_scores(by = c("model", "scale")) From 4a72b1d7bc0005de1ca8455c697d5c4dee72357c Mon Sep 17 00:00:00 2001 From: nikosbosse Date: Mon, 10 Apr 2023 12:13:23 +0200 Subject: [PATCH 14/14] update documentation --- R/convenience-functions.R | 24 +++++++++++++++++++----- man/transform_forecasts.Rd | 23 ++++++++++++++++++----- 2 files changed, 37 insertions(+), 10 deletions(-) diff --git a/R/convenience-functions.R b/R/convenience-functions.R index 6f9286c52..e660511d3 100644 --- a/R/convenience-functions.R +++ b/R/convenience-functions.R @@ -22,8 +22,8 @@ #' @param fun A function used to transform both true values and predictions. #' The default function is [log_shift()], a custom function that is essentially #' the same as [log()], but has two additional arguments (`offset` and -#' `negative_to_zero`) that allow you to truncate negative values to zero and add an -#' offset before applying the logarithm. +#' `negative_to_zero`) that allow you to first truncate negative values to zero +#' and then add an offset before applying the logarithm. #' @param append whether or not to append a transformed version of the data to #' the currently existing data (default is TRUE). If selected, the data gets #' transformed and appended to the existing data frame, making it possible to @@ -54,14 +54,28 @@ #' @keywords check-forecasts #' @examples #' +#' library(magrittr) # pipe operator +#' #' # add log transformed forecasts (produces a warning as some values are zero) +#' # negative values need to be handled (here by replacing them with 0) +#' example_quantile %>% +#' .[, true_value := ifelse(true_value < 0, 0, true_value)] %>% +#' transform_forecasts(append = FALSE) +#' +#' # alternatively: #' transform_forecasts(example_quantile, negative_to_zero = TRUE, append = FALSE) #' #' # specifying an offset manually for the log transformation removes the warning -#' transform_forecasts(example_quantile, negative_to_zero = TRUE, offset = 1) +#' example_quantile %>% +#' .[, true_value := ifelse(true_value < 0, 0, true_value)] %>% +#' transform_forecasts(offset = 1, append = FALSE) #' #' # truncating forecasts manually before sqrt -#' library(magrittr) # pipe operator +#' example_quantile %>% +#' .[, true_value := ifelse(true_value < 0, 0, true_value)] %>% +#' transform_forecasts(fun = sqrt, label = "sqrt") +#' +#' # alternatively, this achieves the same #' example_quantile %>% #' transform_forecasts(fun = function(x) pmax(0, x), append = FALSE) %>% #' transform_forecasts(fun = sqrt, label = "sqrt") @@ -69,7 +83,7 @@ #' # adding multiple transformations #' library(magrittr) # pipe operator #' example_quantile %>% -#' transform_forecasts(fun = function(x) pmax(0, x), append = FALSE) %>% +#' .[, true_value := ifelse(true_value < 0, 0, true_value)] %>% #' transform_forecasts(offset = 1) %>% #' transform_forecasts(fun = sqrt, label = "sqrt") %>% #' score() %>% diff --git a/man/transform_forecasts.Rd b/man/transform_forecasts.Rd index 39a89c4ef..afc6c09e1 100644 --- a/man/transform_forecasts.Rd +++ b/man/transform_forecasts.Rd @@ -37,8 +37,8 @@ are examples for each format (\link{example_quantile}, \link{example_continuous} \item{fun}{A function used to transform both true values and predictions. The default function is \code{\link[=log_shift]{log_shift()}}, a custom function that is essentially the same as \code{\link[=log]{log()}}, but has two additional arguments (\code{offset} and -\code{negative_to_zero}) that allow you to truncate negative values to zero and add an -offset before applying the logarithm.} +\code{negative_to_zero}) that allow you to first truncate negative values to zero +and then add an offset before applying the logarithm.} \item{append}{whether or not to append a transformed version of the data to the currently existing data (default is TRUE). If selected, the data gets @@ -82,14 +82,27 @@ rule. } \examples{ +library(magrittr) # pipe operator + # add log transformed forecasts (produces a warning as some values are zero) +example_quantile \%>\% + .[, true_value := ifelse(true_value < 0, 0, true_value)] \%>\% + transform_forecasts(append = FALSE) + +# alternatively: transform_forecasts(example_quantile, negative_to_zero = TRUE, append = FALSE) # specifying an offset manually for the log transformation removes the warning -transform_forecasts(example_quantile, negative_to_zero = TRUE, offset = 1) +example_quantile \%>\% + .[, true_value := ifelse(true_value < 0, 0, true_value)] \%>\% + transform_forecasts(offset = 1, append = FALSE) # truncating forecasts manually before sqrt -library(magrittr) # pipe operator +example_quantile \%>\% + .[, true_value := ifelse(true_value < 0, 0, true_value)] \%>\% + transform_forecasts(fun = sqrt, label = "sqrt") + +# alternatively, this achieves the same example_quantile \%>\% transform_forecasts(fun = function(x) pmax(0, x), append = FALSE) \%>\% transform_forecasts(fun = sqrt, label = "sqrt") @@ -97,7 +110,7 @@ example_quantile \%>\% # adding multiple transformations library(magrittr) # pipe operator example_quantile \%>\% - transform_forecasts(fun = function(x) pmax(0, x), append = FALSE) \%>\% + .[, true_value := ifelse(true_value < 0, 0, true_value)] \%>\% transform_forecasts(offset = 1) \%>\% transform_forecasts(fun = sqrt, label = "sqrt") \%>\% score() \%>\%