diff --git a/DESCRIPTION b/DESCRIPTION index 85b588f2b..d5a9adad9 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: scoringutils Title: Utilities for Scoring and Assessing Predictions -Version: 1.1.1 +Version: 1.1.2 Language: en-GB Authors@R: c( person(given = "Nikos", diff --git a/NAMESPACE b/NAMESPACE index 334ea3856..68eb44f95 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -17,6 +17,7 @@ export(crps_sample) export(dss_sample) export(find_duplicates) export(interval_score) +export(log_shift) export(logs_binary) export(logs_sample) export(mad_sample) @@ -45,6 +46,7 @@ export(squared_error) export(summarise_scores) export(summarize_scores) export(theme_scoringutils) +export(transform_forecasts) importFrom(data.table,"%like%") importFrom(data.table,':=') importFrom(data.table,.I) diff --git a/NEWS.md b/NEWS.md index 79ab29fd9..1635621ca 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,11 @@ +# scoringutils 1.1.2 + +## Feature updates + +- added a new function, `transform_forecasts()` to make it easy to transform forecasts before scoring them, as suggested in Bosse et al. (2023), https://www.medrxiv.org/content/10.1101/2023.01.23.23284722v1. +- added another function, `log_shift()` that implements the default transformation function. The function allows to truncate negative values and add an offset before applying the logarithm. + + # scoringutils 1.1.1 - added a small change to `interval_score()` which explicitly converts the logical vector to a numeric one. This should happen implicitly anyway, but is now done explicitly in order to avoid issues that may come up if the input vector has a type that doesn't allow the implict conversion. diff --git a/R/convenience-functions.R b/R/convenience-functions.R new file mode 100644 index 000000000..e660511d3 --- /dev/null +++ b/R/convenience-functions.R @@ -0,0 +1,209 @@ +#' @title Transform forecasts and observed values +#' +#' @description Function to transform forecasts and true values before scoring. +#' +#' @details There are a few reasons, depending on the circumstances, for +#' why this might be desirable (check out the linked reference for more info). +#' In epidemiology, for example, it may be useful to log-transform incidence +#' counts before evaluating forecasts using scores such as the weighted interval +#' score (WIS) or the continuous ranked probability score (CRPS). +#' Log-transforming forecasts and observations changes the interpretation of +#' the score from a measure of absolute distance between forecast and +#' observation to a score that evaluates a forecast of the exponential growth +#' rate. Another motivation can be to apply a variance-stabilising +#' transformation or to standardise incidence counts by population. +#' +#' Note that if you want to apply a transformation, it is important to transform +#' the forecasts and observations and then apply the score. Applying a +#' transformation after the score risks losing propriety of the proper scoring +#' rule. +#' +#' @inheritParams score +#' @param fun A function used to transform both true values and predictions. +#' The default function is [log_shift()], a custom function that is essentially +#' the same as [log()], but has two additional arguments (`offset` and +#' `negative_to_zero`) that allow you to first truncate negative values to zero +#' and then add an offset before applying the logarithm. +#' @param append whether or not to append a transformed version of the data to +#' the currently existing data (default is TRUE). If selected, the data gets +#' transformed and appended to the existing data frame, making it possible to +#' use the outcome directly in [score()]. An additional column, 'scale', gets +#' created that denotes which rows or untransformed ('scale' has the value +#' "natural") and which have been transformed ('scale' has the value passed to +#' the argument `label`). +#' @param label A string for the newly created 'scale' column to denote the +#' newly transformed values. Only relevant if `append = TRUE`. +#' @param ... Additional parameters to pass to the function you supplied. +#' @return A data.table with either a transformed version of the data, or one +#' with both the untransformed and the transformed data. includes the original data as well as a +#' transformation of the original data. There will be one additional column, +#' 'scale', present which will be set to "natural" for the untransformed +#' forecasts. +#' +#' @importFrom data.table ':=' is.data.table copy +#' @author Nikos Bosse \email{nikosbosse@@gmail.com} +#' @export +#' @references Transformation of forecasts for evaluating predictive +#' performance in an epidemiological context +#' Nikos I. Bosse, Sam Abbott, Anne Cori, Edwin van Leeuwen, Johannes Bracher, +#' Sebastian Funk +#' medRxiv 2023.01.23.23284722 +#' \doi{https://doi.org/10.1101/2023.01.23.23284722} +#' # nolint + +#' @keywords check-forecasts +#' @examples +#' +#' library(magrittr) # pipe operator +#' +#' # add log transformed forecasts (produces a warning as some values are zero) +#' # negative values need to be handled (here by replacing them with 0) +#' example_quantile %>% +#' .[, true_value := ifelse(true_value < 0, 0, true_value)] %>% +#' transform_forecasts(append = FALSE) +#' +#' # alternatively: +#' transform_forecasts(example_quantile, negative_to_zero = TRUE, append = FALSE) +#' +#' # specifying an offset manually for the log transformation removes the warning +#' example_quantile %>% +#' .[, true_value := ifelse(true_value < 0, 0, true_value)] %>% +#' transform_forecasts(offset = 1, append = FALSE) +#' +#' # truncating forecasts manually before sqrt +#' example_quantile %>% +#' .[, true_value := ifelse(true_value < 0, 0, true_value)] %>% +#' transform_forecasts(fun = sqrt, label = "sqrt") +#' +#' # alternatively, this achieves the same +#' example_quantile %>% +#' transform_forecasts(fun = function(x) pmax(0, x), append = FALSE) %>% +#' transform_forecasts(fun = sqrt, label = "sqrt") +#' +#' # adding multiple transformations +#' library(magrittr) # pipe operator +#' example_quantile %>% +#' .[, true_value := ifelse(true_value < 0, 0, true_value)] %>% +#' transform_forecasts(offset = 1) %>% +#' transform_forecasts(fun = sqrt, label = "sqrt") %>% +#' score() %>% +#' summarise_scores(by = c("model", "scale")) + +transform_forecasts <- function(data, + fun = log_shift, + append = TRUE, + label = "log", + ...) { + original_data <- as.data.table(data) + scale_col_present <- ("scale" %in% colnames(original_data)) + + # Error handling + if (scale_col_present) { + if (!("natural" %in% original_data$scale)) { + stop( + "If a column 'scale' is present, entries with scale =='natural' are required for the transformation" + ) + } + if (append && (label %in% original_data$scale)) { + w <- paste0( + "Appending new transformations with label '", + label, + "', even though that entry is already present in column 'scale'." + ) + warning(w) + } + } + + if (append) { + if (scale_col_present) { + transformed_data <- + data.table::copy(original_data)[scale == "natural"] + } else { + transformed_data <- data.table::copy(original_data) + original_data[, scale := "natural"] + } + transformed_data[, prediction := fun(prediction, ...)] + transformed_data[, true_value := fun(true_value, ...)] + transformed_data[, scale := label] + out <- rbind(original_data, transformed_data) + return(out[]) + } + + # check if a column called "scale" is already present and if so, only + # restrict to transformations of the original data + if (scale_col_present) { + original_data[scale == "natural", prediction := fun(prediction, ...)] + original_data[scale == "natural", true_value := fun(true_value, ...)] + original_data[scale == "natural", scale := label] + } else { + original_data[, prediction := fun(prediction, ...)] + original_data[, true_value := fun(true_value, ...)] + } + return(original_data[]) +} + + + + + + +#' @title Log transformation with an additive shift +#' +#' @description Function that shifts a value by some offset and then applies the +#' natural logarithm to it. +#' +#' @details The output is computed as log(x + offset) (or +#' log(pmax(0, x) + offset)) if `negative_to_zero = TRUE`. +#' +#' @param x vector of input values to be transformed +#' @param offset number to add to the input value before taking the natural +#' logarithm +#' @param negative_to_zero whether or not to replace all negative values with +#' zero before applying the log transformation. Default is FALSE. +#' @param base a positive or complex number: the base with respect to which +#' logarithms are computed. Defaults to e = exp(1). +#' @return A numeric vector with transformed values +#' @export +#' @references Transformation of forecasts for evaluating predictive +#' performance in an epidemiological context +#' Nikos I. Bosse, Sam Abbott, Anne Cori, Edwin van Leeuwen, Johannes Bracher, +#' Sebastian Funk +#' medRxiv 2023.01.23.23284722 +#' \doi{https://doi.org/10.1101/2023.01.23.23284722} +#' # nolint + +#' @keywords check-forecasts +#' @examples +#' +#' log_shift(1:10) +#' log_shift(0:9, offset = 1) +#' log_shift(-1:9, offset = 1, negative_to_zero = TRUE) +#' +#' transform_forecasts( +#' example_quantile, +#' fun = log_shift, +#' offset = 1, +#' negative_to_zero = TRUE +#' ) + +log_shift <- function(x, + offset = 0, + negative_to_zero = FALSE, + base = exp(1)) { + if (negative_to_zero) { + x <- pmax(0, x) + } + + if (any (x < 0, na.rm = TRUE)) { + w <- paste("Detected input values < 0.", + "Try truncating negative values (use negative_to_zero = TRUE)") + warning(w) + } + + if (any(x == 0, na.rm = TRUE) && offset == 0) { + w <- paste0("Detected zeros in input values.", + "Try specifying offset = 1 (or any other offset).") + warning(w) + } + log(x + offset, base = base) +} diff --git a/man/log_shift.Rd b/man/log_shift.Rd new file mode 100644 index 000000000..78ea80567 --- /dev/null +++ b/man/log_shift.Rd @@ -0,0 +1,54 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/convenience-functions.R +\name{log_shift} +\alias{log_shift} +\title{Log transformation with an additive shift} +\usage{ +log_shift(x, offset = 0, negative_to_zero = FALSE, base = exp(1)) +} +\arguments{ +\item{x}{vector of input values to be transformed} + +\item{offset}{number to add to the input value before taking the natural +logarithm} + +\item{negative_to_zero}{whether or not to replace all negative values with +zero before applying the log transformation. Default is FALSE.} + +\item{base}{a positive or complex number: the base with respect to which +logarithms are computed. Defaults to e = exp(1).} +} +\value{ +A numeric vector with transformed values +} +\description{ +Function that shifts a value by some offset and then applies the +natural logarithm to it. +} +\details{ +The output is computed as log(x + offset) (or +log(pmax(0, x) + offset)) if \code{negative_to_zero = TRUE}. +} +\examples{ + +log_shift(1:10) +log_shift(0:9, offset = 1) +log_shift(-1:9, offset = 1, negative_to_zero = TRUE) + +transform_forecasts( + example_quantile, + fun = log_shift, + offset = 1, + negative_to_zero = TRUE + ) +} +\references{ +Transformation of forecasts for evaluating predictive +performance in an epidemiological context +Nikos I. Bosse, Sam Abbott, Anne Cori, Edwin van Leeuwen, Johannes Bracher, +Sebastian Funk +medRxiv 2023.01.23.23284722 +\doi{https://doi.org/10.1101/2023.01.23.23284722} +\url{https://www.medrxiv.org/content/10.1101/2023.01.23.23284722v1} # nolint +} +\keyword{check-forecasts} diff --git a/man/transform_forecasts.Rd b/man/transform_forecasts.Rd new file mode 100644 index 000000000..afc6c09e1 --- /dev/null +++ b/man/transform_forecasts.Rd @@ -0,0 +1,131 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/convenience-functions.R +\name{transform_forecasts} +\alias{transform_forecasts} +\title{Transform forecasts and observed values} +\usage{ +transform_forecasts(data, fun = log_shift, append = TRUE, label = "log", ...) +} +\arguments{ +\item{data}{A data.frame or data.table with the predictions and observations. +For scoring using \code{\link[=score]{score()}}, the following columns need to be present: +\itemize{ +\item \code{true_value} - the true observed values +\item \code{prediction} - predictions or predictive samples for one +true value. (You only don't need to provide a prediction column if +you want to score quantile forecasts in a wide range format.)} +For scoring integer and continuous forecasts a \code{sample} column is needed: +\itemize{ +\item \code{sample} - an index to identify the predictive samples in the +prediction column generated by one model for one true value. Only +necessary for continuous and integer forecasts, not for +binary predictions.} +For scoring predictions in a quantile-format forecast you should provide +a column called \code{quantile}: +\itemize{ +\item \code{quantile}: quantile to which the prediction corresponds +} + +In addition a \code{model} column is suggested and if not present this will be +flagged and added to the input data with all forecasts assigned as an +"unspecified model"). + +You can check the format of your data using \code{\link[=check_forecasts]{check_forecasts()}} and there +are examples for each format (\link{example_quantile}, \link{example_continuous}, +\link{example_integer}, and \link{example_binary}).} + +\item{fun}{A function used to transform both true values and predictions. +The default function is \code{\link[=log_shift]{log_shift()}}, a custom function that is essentially +the same as \code{\link[=log]{log()}}, but has two additional arguments (\code{offset} and +\code{negative_to_zero}) that allow you to first truncate negative values to zero +and then add an offset before applying the logarithm.} + +\item{append}{whether or not to append a transformed version of the data to +the currently existing data (default is TRUE). If selected, the data gets +transformed and appended to the existing data frame, making it possible to +use the outcome directly in \code{\link[=score]{score()}}. An additional column, 'scale', gets +created that denotes which rows or untransformed ('scale' has the value +"natural") and which have been transformed ('scale' has the value passed to +the argument \code{label}).} + +\item{label}{A string for the newly created 'scale' column to denote the +newly transformed values. Only relevant if \code{append = TRUE}.} + +\item{...}{Additional parameters to pass to the function you supplied.} +} +\value{ +A data.table with either a transformed version of the data, or one +with both the untransformed and the transformed data. includes the original data as well as a +transformation of the original data. There will be one additional column, +'scale', present which will be set to "natural" for the untransformed +forecasts. +} +\description{ +Function to transform forecasts and true values before scoring. +} +\details{ +There are a few reasons, depending on the circumstances, for +why this might be desirable (check out the linked reference for more info). +In epidemiology, for example, it may be useful to log-transform incidence +counts before evaluating forecasts using scores such as the weighted interval +score (WIS) or the continuous ranked probability score (CRPS). +Log-transforming forecasts and observations changes the interpretation of +the score from a measure of absolute distance between forecast and +observation to a score that evaluates a forecast of the exponential growth +rate. Another motivation can be to apply a variance-stabilising +transformation or to standardise incidence counts by population. + +Note that if you want to apply a transformation, it is important to transform +the forecasts and observations and then apply the score. Applying a +transformation after the score risks losing propriety of the proper scoring +rule. +} +\examples{ + +library(magrittr) # pipe operator + +# add log transformed forecasts (produces a warning as some values are zero) +example_quantile \%>\% + .[, true_value := ifelse(true_value < 0, 0, true_value)] \%>\% + transform_forecasts(append = FALSE) + +# alternatively: +transform_forecasts(example_quantile, negative_to_zero = TRUE, append = FALSE) + +# specifying an offset manually for the log transformation removes the warning +example_quantile \%>\% + .[, true_value := ifelse(true_value < 0, 0, true_value)] \%>\% + transform_forecasts(offset = 1, append = FALSE) + +# truncating forecasts manually before sqrt +example_quantile \%>\% + .[, true_value := ifelse(true_value < 0, 0, true_value)] \%>\% + transform_forecasts(fun = sqrt, label = "sqrt") + +# alternatively, this achieves the same +example_quantile \%>\% + transform_forecasts(fun = function(x) pmax(0, x), append = FALSE) \%>\% + transform_forecasts(fun = sqrt, label = "sqrt") + +# adding multiple transformations +library(magrittr) # pipe operator +example_quantile \%>\% + .[, true_value := ifelse(true_value < 0, 0, true_value)] \%>\% + transform_forecasts(offset = 1) \%>\% + transform_forecasts(fun = sqrt, label = "sqrt") \%>\% + score() \%>\% + summarise_scores(by = c("model", "scale")) +} +\references{ +Transformation of forecasts for evaluating predictive +performance in an epidemiological context +Nikos I. Bosse, Sam Abbott, Anne Cori, Edwin van Leeuwen, Johannes Bracher, +Sebastian Funk +medRxiv 2023.01.23.23284722 +\doi{https://doi.org/10.1101/2023.01.23.23284722} +\url{https://www.medrxiv.org/content/10.1101/2023.01.23.23284722v1} # nolint +} +\author{ +Nikos Bosse \email{nikosbosse@gmail.com} +} +\keyword{check-forecasts} diff --git a/tests/testthat/test-convenience-functions.R b/tests/testthat/test-convenience-functions.R new file mode 100644 index 000000000..b4bdecc58 --- /dev/null +++ b/tests/testthat/test-convenience-functions.R @@ -0,0 +1,38 @@ +test_that("function transform_forecasts works", { + + predictions_original <- example_quantile$prediction + predictions <- transform_forecasts( + example_quantile, + fun = function(x) pmax(0, x), + append = FALSE + ) + + expect_equal(predictions$prediction, pmax(0, predictions_original)) + + one <- transform_forecasts(predictions, offset = 1) + expect_equal(one$prediction, + c(predictions$prediction, log(predictions$prediction + 1))) + + two <- transform_forecasts(predictions, fun = sqrt, label = "sqrt") + expect_equal(two$prediction, + c(predictions$prediction, sqrt(predictions$prediction))) + + + # expect a warning if existing transformation is overwritten + expect_warning( + transform_forecasts(one, fun = sqrt) + ) + + # multiple transformations + three <- transform_forecasts(one, fun = sqrt, label = "sqrt") + expect_equal(unique(three$scale), c("natural", "log", "sqrt")) + + # multiple transformations without append + four <- transform_forecasts(two, fun = log_shift, offset = 1, append = FALSE) + compare <- c( + one$prediction[one$scale == "log"], + three$prediction[three$scale == "sqrt"] + ) + + expect_equal(four$prediction, compare) +})