From 8d3c8e23f739db32ed17a02d2af1460dd9d88806 Mon Sep 17 00:00:00 2001 From: nikosbosse Date: Mon, 27 Feb 2023 18:32:48 +0100 Subject: [PATCH 01/16] first draft proposal of function to set forecast unit manually --- R/convenience-functions.R | 46 +++++++++++++++++++++++++++++++++++++++ R/utils.R | 35 ++++++++++++++++++++++++----- 2 files changed, 75 insertions(+), 6 deletions(-) diff --git a/R/convenience-functions.R b/R/convenience-functions.R index f38e90b36..3d973cff7 100644 --- a/R/convenience-functions.R +++ b/R/convenience-functions.R @@ -43,3 +43,49 @@ add_transformation <- function(data, return(out) } + + + + +#' @title Set unit of a single forecast manually +#' +#' @description Helper function to set the unit of a single forecast manually. +#' This simple function keeps the columns specified in `forecast_unit` (plus +#' additional protected columns, e.g. for true values, predictions or quantile +#' levels) and removes duplicate rows. +#' +#' @inheritParams score +#' @return A data.table that includes the original data as well as a +#' transformation of the original data. There will be one additional column, +#' 'scale', present which will be set to "natural" for the untransformed +#' forecasts. +#' +#' @importFrom data.table ':=' is.data.table copy +#' @author Nikos Bosse \email{nikosbosse@@gmail.com} +#' @export +#' @references Transformation of forecasts for evaluating predictive +#' performance in an epidemiological context +#' Nikos I. Bosse, Sam Abbott, Anne Cori, Edwin van Leeuwen, Johannes Bracher, +#' Sebastian Funk +#' medRxiv 2023.01.23.23284722 +#' \doi{https://doi.org/10.1101/2023.01.23.23284722} +#' # nolint + +#' @keywords check-forecasts +#' @examples +#' add_transformation(example_quantile) + +set_forecast_unit <- function(data, + forecast_unit = NULL) { + keep_cols <- intersect( + colnames(data), + get_protected_columns(data) + ) + keep_cols <- c(keep_cols, forecast_unit) + out <- unique(data[, .SD, .SDcols = keep_cols])[] + return(out) +} + + + + diff --git a/R/utils.R b/R/utils.R index 37e87a0df..576d5614c 100644 --- a/R/utils.R +++ b/R/utils.R @@ -242,12 +242,7 @@ get_target_type <- function(data) { get_forecast_unit <- function(data, prediction_type) { - protected_columns <- c( - "prediction", "true_value", "sample", "quantile", "upper", "lower", - "pit_value", - "range", "boundary", available_metrics(), - names(data)[grepl("coverage_", names(data))] - ) + protected_columns <- get_protected_columns(data) if (!missing(prediction_type)) { if (prediction_type == "quantile") { protected_columns <- setdiff(protected_columns, "sample") @@ -256,3 +251,31 @@ get_forecast_unit <- function(data, prediction_type) { forecast_unit <- setdiff(colnames(data), protected_columns) return(forecast_unit) } + + +#' @title Get protected columns from a data frame +#' +#' @description Helper function to get the names of protected columns +#' +#' +#' @inheritParams check_forecasts +#' +#' @return A character vector with the column names that define the unit of +#' a single forecast +#' +#' @keywords internal + +get_protected_columns <- function(data) { + protected_columns <- c( + "prediction", "true_value", "sample", "quantile", "upper", "lower", + "pit_value", + "range", "boundary", available_metrics(), + names(data)[grepl("coverage_", names(data))] + ) + + return(protected_columns) +} + + + + From 66c3e724178fe5c816b3d13c57c867403ab933dd Mon Sep 17 00:00:00 2001 From: nikosbosse Date: Thu, 2 Mar 2023 12:40:04 +0100 Subject: [PATCH 02/16] add some error handling for set_forecast_unit --- R/convenience-functions.R | 27 ++++++++++++++++++++------- 1 file changed, 20 insertions(+), 7 deletions(-) diff --git a/R/convenience-functions.R b/R/convenience-functions.R index 3d973cff7..3fc9af601 100644 --- a/R/convenience-functions.R +++ b/R/convenience-functions.R @@ -55,10 +55,8 @@ add_transformation <- function(data, #' levels) and removes duplicate rows. #' #' @inheritParams score -#' @return A data.table that includes the original data as well as a -#' transformation of the original data. There will be one additional column, -#' 'scale', present which will be set to "natural" for the untransformed -#' forecasts. +#' @return A data.table with only those columns kept that are relevant to +#' scoring or denote the unit of a single forecast as specified by the user. #' #' @importFrom data.table ':=' is.data.table copy #' @author Nikos Bosse \email{nikosbosse@@gmail.com} @@ -71,14 +69,28 @@ add_transformation <- function(data, #' \doi{https://doi.org/10.1101/2023.01.23.23284722} #' # nolint -#' @keywords check-forecasts +#' @keywords data-handling #' @examples -#' add_transformation(example_quantile) +#' set_forecast_unit <- function(data, forecast_unit = NULL) { + + datacols <- colnames(data) + missing <- forecast_unit[!(forecast_unit %in% datacols)] + + if (length(missing) > 0) { + warning( + paste0( + "Column(s) '", + missing, + "' are not columns of the data" + ) + ) + } + keep_cols <- intersect( - colnames(data), + datacols, get_protected_columns(data) ) keep_cols <- c(keep_cols, forecast_unit) @@ -89,3 +101,4 @@ set_forecast_unit <- function(data, + From b4ec2174e16116c1e0d958fd94b390a08dbd8a8c Mon Sep 17 00:00:00 2001 From: nikosbosse Date: Wed, 12 Jul 2023 15:38:21 +0200 Subject: [PATCH 03/16] replace a hard-coded vector with protected columns by a function --- R/utils.R | 30 +++++++++++++++++++----- man/get_protected_columns.Rd | 44 ++++++++++++++++++++++++++++++++++++ tests/testthat/test-utils.R | 34 ++++++++++++++++++++++++++++ 3 files changed, 102 insertions(+), 6 deletions(-) create mode 100644 man/get_protected_columns.Rd create mode 100644 tests/testthat/test-utils.R diff --git a/R/utils.R b/R/utils.R index ae3e6f382..edbb69753 100644 --- a/R/utils.R +++ b/R/utils.R @@ -242,12 +242,7 @@ get_target_type <- function(data) { get_forecast_unit <- function(data, prediction_type) { - protected_columns <- c( - "prediction", "true_value", "sample", "quantile", "upper", "lower", - "pit_value", - "range", "boundary", available_metrics(), - grep("coverage_", names(data), fixed = TRUE, value = TRUE) - ) + protected_columns <- get_protected_columns(data) if (!missing(prediction_type)) { if (prediction_type == "quantile") { protected_columns <- setdiff(protected_columns, "sample") @@ -256,3 +251,26 @@ get_forecast_unit <- function(data, prediction_type) { forecast_unit <- setdiff(colnames(data), protected_columns) return(forecast_unit) } + + +#' @title Get protected columns from a data frame +#' +#' @description Helper function to get the names of all columns in a data frame +#' that are protected columns. +#' +#' @inheritParams check_forecasts +#' +#' @return A character vector with the names of protected columns in the data +#' +#' @keywords internal + +get_protected_columns <- function(data) { + protected_columns <- c( + "prediction", "true_value", "sample", "quantile", "upper", "lower", + "pit_value", + "range", "boundary", available_metrics(), + names(data)[grepl("coverage_", names(data))] + ) + + return(protected_columns) +} diff --git a/man/get_protected_columns.Rd b/man/get_protected_columns.Rd new file mode 100644 index 000000000..79af67598 --- /dev/null +++ b/man/get_protected_columns.Rd @@ -0,0 +1,44 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utils.R +\name{get_protected_columns} +\alias{get_protected_columns} +\title{Get protected columns from a data frame} +\usage{ +get_protected_columns(data) +} +\arguments{ +\item{data}{A data.frame or data.table with the predictions and observations. +For scoring using \code{\link[=score]{score()}}, the following columns need to be present: +\itemize{ +\item \code{true_value} - the true observed values +\item \code{prediction} - predictions or predictive samples for one +true value. (You only don't need to provide a prediction column if +you want to score quantile forecasts in a wide range format.)} +For scoring integer and continuous forecasts a \code{sample} column is needed: +\itemize{ +\item \code{sample} - an index to identify the predictive samples in the +prediction column generated by one model for one true value. Only +necessary for continuous and integer forecasts, not for +binary predictions.} +For scoring predictions in a quantile-format forecast you should provide +a column called \code{quantile}: +\itemize{ +\item \code{quantile}: quantile to which the prediction corresponds +} + +In addition a \code{model} column is suggested and if not present this will be +flagged and added to the input data with all forecasts assigned as an +"unspecified model"). + +You can check the format of your data using \code{\link[=check_forecasts]{check_forecasts()}} and there +are examples for each format (\link{example_quantile}, \link{example_continuous}, +\link{example_integer}, and \link{example_binary}).} +} +\value{ +A character vector with the names of protected columns in the data +} +\description{ +Helper function to get the names of all columns in a data frame +that are protected columns. +} +\keyword{internal} diff --git a/tests/testthat/test-utils.R b/tests/testthat/test-utils.R new file mode 100644 index 000000000..904d89572 --- /dev/null +++ b/tests/testthat/test-utils.R @@ -0,0 +1,34 @@ +test_that("get_protected columns returns the correct result", { + + data <- example_quantile + manual <- protected_columns <- c( + "prediction", "true_value", "sample", "quantile", "upper", "lower", + "pit_value", + "range", "boundary", available_metrics(), + grep("coverage_", names(data), fixed = TRUE, value = TRUE) + ) + auto <- get_protected_columns(data) + expect_equal(manual, auto) + + + data <- example_binary + manual <- protected_columns <- c( + "prediction", "true_value", "sample", "quantile", "upper", "lower", + "pit_value", + "range", "boundary", available_metrics(), + grep("coverage_", names(data), fixed = TRUE, value = TRUE) + ) + auto <- get_protected_columns(data) + expect_equal(manual, auto) + + data <- example_continuous + manual <- protected_columns <- c( + "prediction", "true_value", "sample", "quantile", "upper", "lower", + "pit_value", + "range", "boundary", available_metrics(), + grep("coverage_", names(data), fixed = TRUE, value = TRUE) + ) + auto <- get_protected_columns(data) + expect_equal(manual, auto) + +}) From 281377d5736cc21e826ea640920c8a4557a3aba8 Mon Sep 17 00:00:00 2001 From: nikosbosse Date: Wed, 12 Jul 2023 16:00:57 +0200 Subject: [PATCH 04/16] add tests for new set_forecast_unit function --- NAMESPACE | 1 + man/set_forecast_unit.Rd | 59 +++++++++++++++++++++ tests/testthat/test-convenience-functions.R | 23 ++++++++ 3 files changed, 83 insertions(+) create mode 100644 man/set_forecast_unit.Rd diff --git a/NAMESPACE b/NAMESPACE index 437b12630..f7ba26377 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -42,6 +42,7 @@ export(quantile_score) export(sample_to_quantile) export(score) export(se_mean_sample) +export(set_forecast_unit) export(squared_error) export(summarise_scores) export(summarize_scores) diff --git a/man/set_forecast_unit.Rd b/man/set_forecast_unit.Rd new file mode 100644 index 000000000..a2f13dd14 --- /dev/null +++ b/man/set_forecast_unit.Rd @@ -0,0 +1,59 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/convenience-functions.R +\name{set_forecast_unit} +\alias{set_forecast_unit} +\title{Set unit of a single forecast manually} +\usage{ +set_forecast_unit(data, forecast_unit = NULL) +} +\arguments{ +\item{data}{A data.frame or data.table with the predictions and observations. +For scoring using \code{\link[=score]{score()}}, the following columns need to be present: +\itemize{ +\item \code{true_value} - the true observed values +\item \code{prediction} - predictions or predictive samples for one +true value. (You only don't need to provide a prediction column if +you want to score quantile forecasts in a wide range format.)} +For scoring integer and continuous forecasts a \code{sample} column is needed: +\itemize{ +\item \code{sample} - an index to identify the predictive samples in the +prediction column generated by one model for one true value. Only +necessary for continuous and integer forecasts, not for +binary predictions.} +For scoring predictions in a quantile-format forecast you should provide +a column called \code{quantile}: +\itemize{ +\item \code{quantile}: quantile to which the prediction corresponds +} + +In addition a \code{model} column is suggested and if not present this will be +flagged and added to the input data with all forecasts assigned as an +"unspecified model"). + +You can check the format of your data using \code{\link[=check_forecasts]{check_forecasts()}} and there +are examples for each format (\link{example_quantile}, \link{example_continuous}, +\link{example_integer}, and \link{example_binary}).} +} +\value{ +A data.table with only those columns kept that are relevant to +scoring or denote the unit of a single forecast as specified by the user. +} +\description{ +Helper function to set the unit of a single forecast manually. +This simple function keeps the columns specified in \code{forecast_unit} (plus +additional protected columns, e.g. for true values, predictions or quantile +levels) and removes duplicate rows. +} +\references{ +Transformation of forecasts for evaluating predictive +performance in an epidemiological context +Nikos I. Bosse, Sam Abbott, Anne Cori, Edwin van Leeuwen, Johannes Bracher, +Sebastian Funk +medRxiv 2023.01.23.23284722 +\doi{https://doi.org/10.1101/2023.01.23.23284722} +\url{https://www.medrxiv.org/content/10.1101/2023.01.23.23284722v1} # nolint +} +\author{ +Nikos Bosse \email{nikosbosse@gmail.com} +} +\keyword{data-handling} diff --git a/tests/testthat/test-convenience-functions.R b/tests/testthat/test-convenience-functions.R index b4bdecc58..8f895f30e 100644 --- a/tests/testthat/test-convenience-functions.R +++ b/tests/testthat/test-convenience-functions.R @@ -36,3 +36,26 @@ test_that("function transform_forecasts works", { expect_equal(four$prediction, compare) }) + + + +test_that("function set_forecast_unit() works", { + + # some columns in the example data have duplicated information. So we can remove + # these and see whether the result stays the same. + + scores1 <- suppressMessages(score(example_quantile)) + scores1 <- scores1[order(location, target_end_date, target_type, horizon, model), ] + + ex2 <- set_forecast_unit( + example_quantile, + c("location", "target_end_date", "target_type", "horizon", "model") + ) + scores2 <- suppressMessages(score(ex2)) + scores2 <- scores2[order(location, target_end_date, target_type, horizon, model), ] + + expect_equal(scores1$interval_score, scores2$interval_score) +}) + + + From 7f2cef93680f450e62155457864ad8ecb517c8ba Mon Sep 17 00:00:00 2001 From: nikosbosse Date: Wed, 12 Jul 2023 16:03:29 +0200 Subject: [PATCH 05/16] update version number --- DESCRIPTION | 2 +- NEWS.md | 7 +++++++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index b52246bfb..5377be989 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: scoringutils Title: Utilities for Scoring and Assessing Predictions -Version: 1.1.4 +Version: 1.1.5 Language: en-GB Authors@R: c( person(given = "Nikos", diff --git a/NEWS.md b/NEWS.md index e8fe3b6ca..db7f5270c 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,10 @@ +# scoringutils 1.1.5 + +## Feature updates +- Added a new function, `set_forecast_unit()` that allows the user to set the +forecast unit manually. They can then pipe the result directly into +`check_forecasts()` and check the result. + # scoringutils 1.1.4 ## Package updates From 24e97ea91dbcae65e265112ff1ca34f0bf9bad15 Mon Sep 17 00:00:00 2001 From: nikosbosse Date: Wed, 12 Jul 2023 16:05:59 +0200 Subject: [PATCH 06/16] add example to set_forecast_unit() function --- R/convenience-functions.R | 4 ++++ man/set_forecast_unit.Rd | 7 +++++++ 2 files changed, 11 insertions(+) diff --git a/R/convenience-functions.R b/R/convenience-functions.R index 1dfba777a..c40a5e910 100644 --- a/R/convenience-functions.R +++ b/R/convenience-functions.R @@ -237,6 +237,10 @@ log_shift <- function(x, #' @keywords data-handling #' @examples +#' set_forecast_unit( +#' example_quantile, +#' c("location", "target_end_date", "target_type", "horizon", "model") +#' ) #' set_forecast_unit <- function(data, diff --git a/man/set_forecast_unit.Rd b/man/set_forecast_unit.Rd index a2f13dd14..609292bb2 100644 --- a/man/set_forecast_unit.Rd +++ b/man/set_forecast_unit.Rd @@ -43,6 +43,13 @@ Helper function to set the unit of a single forecast manually. This simple function keeps the columns specified in \code{forecast_unit} (plus additional protected columns, e.g. for true values, predictions or quantile levels) and removes duplicate rows. +} +\examples{ +set_forecast_unit( + example_quantile, + c("location", "target_end_date", "target_type", "horizon", "model") +) + } \references{ Transformation of forecasts for evaluating predictive From 36b3b7b93befcebdf944176ca053becd7f8d8ffc Mon Sep 17 00:00:00 2001 From: nikosbosse Date: Wed, 12 Jul 2023 16:27:08 +0200 Subject: [PATCH 07/16] fix documentation --- R/convenience-functions.R | 2 ++ man/set_forecast_unit.Rd | 3 +++ 2 files changed, 5 insertions(+) diff --git a/R/convenience-functions.R b/R/convenience-functions.R index c40a5e910..433588ed3 100644 --- a/R/convenience-functions.R +++ b/R/convenience-functions.R @@ -221,6 +221,8 @@ log_shift <- function(x, #' levels) and removes duplicate rows. #' #' @inheritParams score +#' @param forecast_unit character vector with the names of the columns that +#' uniquely identify a single forecast #' @return A data.table with only those columns kept that are relevant to #' scoring or denote the unit of a single forecast as specified by the user. #' diff --git a/man/set_forecast_unit.Rd b/man/set_forecast_unit.Rd index 609292bb2..389809688 100644 --- a/man/set_forecast_unit.Rd +++ b/man/set_forecast_unit.Rd @@ -33,6 +33,9 @@ flagged and added to the input data with all forecasts assigned as an You can check the format of your data using \code{\link[=check_forecasts]{check_forecasts()}} and there are examples for each format (\link{example_quantile}, \link{example_continuous}, \link{example_integer}, and \link{example_binary}).} + +\item{forecast_unit}{character vector with the names of the columns that +uniquely identify a single forecast} } \value{ A data.table with only those columns kept that are relevant to From d586813dd5fd7a8ab5308b567cb4b6e7fd0f9bbe Mon Sep 17 00:00:00 2001 From: nikosbosse Date: Fri, 14 Jul 2023 10:53:49 +0200 Subject: [PATCH 08/16] update interal get_protected_columns() function to only return present columns --- R/utils.R | 11 +++++++++-- tests/testthat/test-utils.R | 10 ++++++---- 2 files changed, 15 insertions(+), 6 deletions(-) diff --git a/R/utils.R b/R/utils.R index edbb69753..3b25bf6d8 100644 --- a/R/utils.R +++ b/R/utils.R @@ -265,12 +265,19 @@ get_forecast_unit <- function(data, prediction_type) { #' @keywords internal get_protected_columns <- function(data) { + + datacols <- colnames(data) protected_columns <- c( "prediction", "true_value", "sample", "quantile", "upper", "lower", - "pit_value", - "range", "boundary", available_metrics(), + "pit_value", "range", "boundary", available_metrics(), names(data)[grepl("coverage_", names(data))] ) + # only return protected columns that are present + protected_columns <- intersect( + datacols, + protected_columns + ) + return(protected_columns) } diff --git a/tests/testthat/test-utils.R b/tests/testthat/test-utils.R index 904d89572..2831715a5 100644 --- a/tests/testthat/test-utils.R +++ b/tests/testthat/test-utils.R @@ -7,8 +7,9 @@ test_that("get_protected columns returns the correct result", { "range", "boundary", available_metrics(), grep("coverage_", names(data), fixed = TRUE, value = TRUE) ) + manual <- intersect(manual, colnames(example_quantile)) auto <- get_protected_columns(data) - expect_equal(manual, auto) + expect_equal(sort(manual), sort(auto)) data <- example_binary @@ -18,8 +19,9 @@ test_that("get_protected columns returns the correct result", { "range", "boundary", available_metrics(), grep("coverage_", names(data), fixed = TRUE, value = TRUE) ) + manual <- intersect(manual, colnames(example_binary)) auto <- get_protected_columns(data) - expect_equal(manual, auto) + expect_equal(sort(manual), sort(auto)) data <- example_continuous manual <- protected_columns <- c( @@ -28,7 +30,7 @@ test_that("get_protected columns returns the correct result", { "range", "boundary", available_metrics(), grep("coverage_", names(data), fixed = TRUE, value = TRUE) ) + manual <- intersect(manual, colnames(example_continuous)) auto <- get_protected_columns(data) - expect_equal(manual, auto) - + expect_equal(sort(manual), sort(auto)) }) From a6effb48f4408083f03b51dd9929c892de96a05d Mon Sep 17 00:00:00 2001 From: nikosbosse Date: Fri, 14 Jul 2023 10:58:40 +0200 Subject: [PATCH 09/16] add tests for set_forecast_unit() --- tests/testthat/test-convenience-functions.R | 23 +++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/tests/testthat/test-convenience-functions.R b/tests/testthat/test-convenience-functions.R index 8f895f30e..c68bc9208 100644 --- a/tests/testthat/test-convenience-functions.R +++ b/tests/testthat/test-convenience-functions.R @@ -58,4 +58,27 @@ test_that("function set_forecast_unit() works", { }) +test_that("function set_forecast_unit() gives warning when column is not there", { + + expect_warning( + set_forecast_unit( + example_quantile, + c("location", "target_end_date", "target_type", "horizon", "model", "test") + ) + ) +}) + + +test_that("function get_forecast_unit() and set_forecast_unit() work together", { + + fu_set <- c("location", "target_end_date", "target_type", "horizon", "model") + + ex <- set_forecast_unit( + example_binary, + fu + ) + + fu_get <- get_forecast_unit(ex) + expect_equal(fu_set, fu_get) +}) From 247de8307b3f66cef57258ec9108ef367041abc2 Mon Sep 17 00:00:00 2001 From: nikosbosse Date: Fri, 14 Jul 2023 10:59:39 +0200 Subject: [PATCH 10/16] Clean convenience-functions file, handle columns not present in set_forecast_unit() --- R/convenience-functions.R | 43 ++++++++++++++++----------------------- man/set_forecast_unit.Rd | 14 ++++++++++--- 2 files changed, 29 insertions(+), 28 deletions(-) diff --git a/R/convenience-functions.R b/R/convenience-functions.R index 433588ed3..0b811e6ce 100644 --- a/R/convenience-functions.R +++ b/R/convenience-functions.R @@ -193,9 +193,7 @@ transform_forecasts <- function(data, #' offset = 1 #' ) -log_shift <- function(x, - offset = 0, - base = exp(1)) { +log_shift <- function(x, offset = 0, base = exp(1)) { if (any(x < 0, na.rm = TRUE)) { w <- paste("Detected input values < 0.") @@ -211,18 +209,24 @@ log_shift <- function(x, } - - #' @title Set unit of a single forecast manually #' -#' @description Helper function to set the unit of a single forecast manually. +#' @description Helper function to set the unit of a single forecast (i.e. the +#' combination of columns that uniquely define a single forecast) manually. #' This simple function keeps the columns specified in `forecast_unit` (plus #' additional protected columns, e.g. for true values, predictions or quantile #' levels) and removes duplicate rows. +#' If not done manually, `scoringutils` attempts to determine the unit +#' of a single automatically by simply assuming that all column names are +#' relevant to determine the forecast unit. This can lead to unexpected +#' behaviour, so setting the forecast unit explicitly can help make the code +#' easier to debug and easier to read. When used as part of a workflow, +#' `set_forecast_unit()` can then directly be piped into `check_forecasts()` to +#' check everything is in order. #' #' @inheritParams score -#' @param forecast_unit character vector with the names of the columns that -#' uniquely identify a single forecast +#' @param forecast_unit Character vector with the names of the columns that +#' uniquely identify a single forecast. #' @return A data.table with only those columns kept that are relevant to #' scoring or denote the unit of a single forecast as specified by the user. #' @@ -245,32 +249,21 @@ log_shift <- function(x, #' ) #' -set_forecast_unit <- function(data, - forecast_unit = NULL) { +set_forecast_unit <- function(data, forecast_unit = NULL) { datacols <- colnames(data) missing <- forecast_unit[!(forecast_unit %in% datacols)] if (length(missing) > 0) { warning( - paste0( - "Column(s) '", - missing, - "' are not columns of the data" - ) + "Column(s) '", + missing, + "' are not columns of the data and will be ignored." ) + forecast_unit <- intersect(forecast_unit, datacols) } - keep_cols <- intersect( - datacols, - get_protected_columns(data) - ) - keep_cols <- c(keep_cols, forecast_unit) + keep_cols <- c(get_protected_columns(data), forecast_unit) out <- unique(data[, .SD, .SDcols = keep_cols])[] return(out) } - - - - - diff --git a/man/set_forecast_unit.Rd b/man/set_forecast_unit.Rd index 389809688..5fe8ea582 100644 --- a/man/set_forecast_unit.Rd +++ b/man/set_forecast_unit.Rd @@ -34,18 +34,26 @@ You can check the format of your data using \code{\link[=check_forecasts]{check_ are examples for each format (\link{example_quantile}, \link{example_continuous}, \link{example_integer}, and \link{example_binary}).} -\item{forecast_unit}{character vector with the names of the columns that -uniquely identify a single forecast} +\item{forecast_unit}{Character vector with the names of the columns that +uniquely identify a single forecast.} } \value{ A data.table with only those columns kept that are relevant to scoring or denote the unit of a single forecast as specified by the user. } \description{ -Helper function to set the unit of a single forecast manually. +Helper function to set the unit of a single forecast (i.e. the +combination of columns that uniquely define a single forecast) manually. This simple function keeps the columns specified in \code{forecast_unit} (plus additional protected columns, e.g. for true values, predictions or quantile levels) and removes duplicate rows. +If not done manually, \code{scoringutils} attempts to determine the unit +of a single automatically by simply assuming that all column names are +relevant to determine the forecast unit. This can lead to unexpected +behaviour, so setting the forecast unit explicitly can help make the code +easier to debug and easier to read. When used as part of a workflow, +\code{set_forecast_unit()} can then directly be piped into \code{check_forecasts()} to +check everything is in order. } \examples{ set_forecast_unit( From 3e5ec7f89c92398becc62c8fc7d4a6f72dfc402b Mon Sep 17 00:00:00 2001 From: nikosbosse Date: Fri, 14 Jul 2023 10:59:52 +0200 Subject: [PATCH 11/16] Update NEWS.md file --- NEWS.md | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/NEWS.md b/NEWS.md index db7f5270c..68a2b5a64 100644 --- a/NEWS.md +++ b/NEWS.md @@ -2,8 +2,15 @@ ## Feature updates - Added a new function, `set_forecast_unit()` that allows the user to set the -forecast unit manually. They can then pipe the result directly into -`check_forecasts()` and check the result. +forecast unit manually. The function removes all columns that are not relevant +for uniquely identifying a single forecast. If not done manually, `scoringutils` +attempts to determine the unit +of a single automatically by simply assuming that all column names are +relevant to determine the forecast unit. This can lead to unexpected +behaviour, so setting the forecast unit explicitly can help make the code +easier to debug and easier to read. When used as part of a workflow, +`set_forecast_unit()` can then directly be piped into `check_forecasts()` to +check everything is in order. # scoringutils 1.1.4 From 24b4a44f1411b7931e8c2bfbe1df321d2b8a93b1 Mon Sep 17 00:00:00 2001 From: nikosbosse Date: Fri, 14 Jul 2023 11:09:15 +0200 Subject: [PATCH 12/16] fix tests, remove unnecessary whitespace --- R/convenience-functions.R | 7 ------- man/log_shift.Rd | 1 - man/set_forecast_unit.Rd | 1 - tests/testthat/test-convenience-functions.R | 2 +- 4 files changed, 1 insertion(+), 10 deletions(-) diff --git a/R/convenience-functions.R b/R/convenience-functions.R index 0b811e6ce..997fd5eee 100644 --- a/R/convenience-functions.R +++ b/R/convenience-functions.R @@ -156,10 +156,6 @@ transform_forecasts <- function(data, } - - - - #' @title Log transformation with an additive shift #' #' @description Function that shifts a value by some offset and then applies the @@ -183,7 +179,6 @@ transform_forecasts <- function(data, #' # nolint #' @keywords check-forecasts #' @examples -#' #' log_shift(1:10) #' log_shift(0:9, offset = 1) #' @@ -240,14 +235,12 @@ log_shift <- function(x, offset = 0, base = exp(1)) { #' medRxiv 2023.01.23.23284722 #' \doi{https://doi.org/10.1101/2023.01.23.23284722} #' # nolint - #' @keywords data-handling #' @examples #' set_forecast_unit( #' example_quantile, #' c("location", "target_end_date", "target_type", "horizon", "model") #' ) -#' set_forecast_unit <- function(data, forecast_unit = NULL) { diff --git a/man/log_shift.Rd b/man/log_shift.Rd index ddeb26ca3..66e0ffb53 100644 --- a/man/log_shift.Rd +++ b/man/log_shift.Rd @@ -26,7 +26,6 @@ natural logarithm to it. The output is computed as log(x + offset) } \examples{ - log_shift(1:10) log_shift(0:9, offset = 1) diff --git a/man/set_forecast_unit.Rd b/man/set_forecast_unit.Rd index 5fe8ea582..54b47d055 100644 --- a/man/set_forecast_unit.Rd +++ b/man/set_forecast_unit.Rd @@ -60,7 +60,6 @@ set_forecast_unit( example_quantile, c("location", "target_end_date", "target_type", "horizon", "model") ) - } \references{ Transformation of forecasts for evaluating predictive diff --git a/tests/testthat/test-convenience-functions.R b/tests/testthat/test-convenience-functions.R index c68bc9208..ad7a40550 100644 --- a/tests/testthat/test-convenience-functions.R +++ b/tests/testthat/test-convenience-functions.R @@ -75,7 +75,7 @@ test_that("function get_forecast_unit() and set_forecast_unit() work together", ex <- set_forecast_unit( example_binary, - fu + fu_set ) fu_get <- get_forecast_unit(ex) From 90c60d09ed162f56e3e5ff4c8b39b91facdb4fad Mon Sep 17 00:00:00 2001 From: nikosbosse Date: Fri, 14 Jul 2023 11:11:54 +0200 Subject: [PATCH 13/16] reference issue in news file --- NEWS.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/NEWS.md b/NEWS.md index 68a2b5a64..04cde6894 100644 --- a/NEWS.md +++ b/NEWS.md @@ -8,7 +8,8 @@ attempts to determine the unit of a single automatically by simply assuming that all column names are relevant to determine the forecast unit. This can lead to unexpected behaviour, so setting the forecast unit explicitly can help make the code -easier to debug and easier to read. When used as part of a workflow, +easier to debug and easier to read (see issue #268). +When used as part of a workflow, `set_forecast_unit()` can then directly be piped into `check_forecasts()` to check everything is in order. From edb5c732311f98ce8d3bb19f942617869aaecbb2 Mon Sep 17 00:00:00 2001 From: nikosbosse Date: Fri, 14 Jul 2023 13:08:49 +0200 Subject: [PATCH 14/16] fix linter issue --- R/utils.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/utils.R b/R/utils.R index 3b25bf6d8..029a70da8 100644 --- a/R/utils.R +++ b/R/utils.R @@ -270,7 +270,7 @@ get_protected_columns <- function(data) { protected_columns <- c( "prediction", "true_value", "sample", "quantile", "upper", "lower", "pit_value", "range", "boundary", available_metrics(), - names(data)[grepl("coverage_", names(data))] + grep("coverage_", names(data), fixed = TRUE, value = TRUE) ) # only return protected columns that are present From 627a3a58d11f1ba788dd11e1186af2e06b367f6d Mon Sep 17 00:00:00 2001 From: nikosbosse Date: Fri, 14 Jul 2023 15:32:08 +0200 Subject: [PATCH 15/16] remove default for function argument in set_forecast_unit(), fix some formatting and typos --- NEWS.md | 13 +++---------- R/convenience-functions.R | 16 ++++------------ 2 files changed, 7 insertions(+), 22 deletions(-) diff --git a/NEWS.md b/NEWS.md index 04cde6894..d81135d29 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,16 +1,9 @@ # scoringutils 1.1.5 ## Feature updates -- Added a new function, `set_forecast_unit()` that allows the user to set the -forecast unit manually. The function removes all columns that are not relevant -for uniquely identifying a single forecast. If not done manually, `scoringutils` -attempts to determine the unit -of a single automatically by simply assuming that all column names are -relevant to determine the forecast unit. This can lead to unexpected -behaviour, so setting the forecast unit explicitly can help make the code -easier to debug and easier to read (see issue #268). -When used as part of a workflow, -`set_forecast_unit()` can then directly be piped into `check_forecasts()` to +- Added a new function, `set_forecast_unit()` that allows the user to set the forecast unit manually. The function removes all columns that are not relevant for uniquely identifying a single forecast. If not done manually, `scoringutils` attempts to determine the unit of a single automatically by simply assuming that all column names are +relevant to determine the forecast unit. This can lead to unexpected behaviour, so setting the forecast unit explicitly can help make the code easier to debug and easier to read (see issue #268). +When used as part of a workflow, `set_forecast_unit()` can be directly piped into `check_forecasts()` to check everything is in order. # scoringutils 1.1.4 diff --git a/R/convenience-functions.R b/R/convenience-functions.R index 997fd5eee..f21aeefc8 100644 --- a/R/convenience-functions.R +++ b/R/convenience-functions.R @@ -212,11 +212,11 @@ log_shift <- function(x, offset = 0, base = exp(1)) { #' additional protected columns, e.g. for true values, predictions or quantile #' levels) and removes duplicate rows. #' If not done manually, `scoringutils` attempts to determine the unit -#' of a single automatically by simply assuming that all column names are -#' relevant to determine the forecast unit. This can lead to unexpected +#' of a single forecast automatically by simply assuming that all column names +#' are relevant to determine the forecast unit. This may lead to unexpected #' behaviour, so setting the forecast unit explicitly can help make the code #' easier to debug and easier to read. When used as part of a workflow, -#' `set_forecast_unit()` can then directly be piped into `check_forecasts()` to +#' `set_forecast_unit()` can be directly piped into `check_forecasts()` to #' check everything is in order. #' #' @inheritParams score @@ -226,15 +226,7 @@ log_shift <- function(x, offset = 0, base = exp(1)) { #' scoring or denote the unit of a single forecast as specified by the user. #' #' @importFrom data.table ':=' is.data.table copy -#' @author Nikos Bosse \email{nikosbosse@@gmail.com} #' @export -#' @references Transformation of forecasts for evaluating predictive -#' performance in an epidemiological context -#' Nikos I. Bosse, Sam Abbott, Anne Cori, Edwin van Leeuwen, Johannes Bracher, -#' Sebastian Funk -#' medRxiv 2023.01.23.23284722 -#' \doi{https://doi.org/10.1101/2023.01.23.23284722} -#' # nolint #' @keywords data-handling #' @examples #' set_forecast_unit( @@ -242,7 +234,7 @@ log_shift <- function(x, offset = 0, base = exp(1)) { #' c("location", "target_end_date", "target_type", "horizon", "model") #' ) -set_forecast_unit <- function(data, forecast_unit = NULL) { +set_forecast_unit <- function(data, forecast_unit) { datacols <- colnames(data) missing <- forecast_unit[!(forecast_unit %in% datacols)] From db5c00dd770c4196c98154e0d4fca9f01e8d0c44 Mon Sep 17 00:00:00 2001 From: nikosbosse Date: Fri, 14 Jul 2023 16:02:50 +0200 Subject: [PATCH 16/16] update docs --- man/set_forecast_unit.Rd | 20 ++++---------------- 1 file changed, 4 insertions(+), 16 deletions(-) diff --git a/man/set_forecast_unit.Rd b/man/set_forecast_unit.Rd index 54b47d055..8482a27d7 100644 --- a/man/set_forecast_unit.Rd +++ b/man/set_forecast_unit.Rd @@ -4,7 +4,7 @@ \alias{set_forecast_unit} \title{Set unit of a single forecast manually} \usage{ -set_forecast_unit(data, forecast_unit = NULL) +set_forecast_unit(data, forecast_unit) } \arguments{ \item{data}{A data.frame or data.table with the predictions and observations. @@ -48,11 +48,11 @@ This simple function keeps the columns specified in \code{forecast_unit} (plus additional protected columns, e.g. for true values, predictions or quantile levels) and removes duplicate rows. If not done manually, \code{scoringutils} attempts to determine the unit -of a single automatically by simply assuming that all column names are -relevant to determine the forecast unit. This can lead to unexpected +of a single forecast automatically by simply assuming that all column names +are relevant to determine the forecast unit. This may lead to unexpected behaviour, so setting the forecast unit explicitly can help make the code easier to debug and easier to read. When used as part of a workflow, -\code{set_forecast_unit()} can then directly be piped into \code{check_forecasts()} to +\code{set_forecast_unit()} can be directly piped into \code{check_forecasts()} to check everything is in order. } \examples{ @@ -61,16 +61,4 @@ set_forecast_unit( c("location", "target_end_date", "target_type", "horizon", "model") ) } -\references{ -Transformation of forecasts for evaluating predictive -performance in an epidemiological context -Nikos I. Bosse, Sam Abbott, Anne Cori, Edwin van Leeuwen, Johannes Bracher, -Sebastian Funk -medRxiv 2023.01.23.23284722 -\doi{https://doi.org/10.1101/2023.01.23.23284722} -\url{https://www.medrxiv.org/content/10.1101/2023.01.23.23284722v1} # nolint -} -\author{ -Nikos Bosse \email{nikosbosse@gmail.com} -} \keyword{data-handling}