diff --git a/NAMESPACE b/NAMESPACE index fab2e17d3..d3b6ffe22 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -8,7 +8,6 @@ S3method(score,default) S3method(score,scoringutils_binary) S3method(score,scoringutils_point) S3method(score,scoringutils_quantile) -S3method(score,scoringutils_quantile_new) S3method(score,scoringutils_sample) S3method(validate,default) S3method(validate,scoringutils_binary) @@ -32,6 +31,7 @@ export(crps_sample) export(dispersion) export(dss_sample) export(get_duplicate_forecasts) +export(interval_coverage_deviation_quantile) export(interval_coverage_quantile) export(interval_coverage_sample) export(interval_score) @@ -82,6 +82,7 @@ importFrom(checkmate,assert_data_frame) importFrom(checkmate,assert_data_table) importFrom(checkmate,assert_factor) importFrom(checkmate,assert_list) +importFrom(checkmate,assert_logical) importFrom(checkmate,assert_number) importFrom(checkmate,assert_numeric) importFrom(checkmate,check_atomic_vector) @@ -108,6 +109,7 @@ importFrom(data.table,nafill) importFrom(data.table,rbindlist) importFrom(data.table,setDT) importFrom(data.table,setattr) +importFrom(data.table,setcolorder) importFrom(data.table,setnames) importFrom(ggdist,geom_lineribbon) importFrom(ggplot2,.data) diff --git a/NEWS.md b/NEWS.md index 322b2b25d..a00ab7f95 100644 --- a/NEWS.md +++ b/NEWS.md @@ -22,6 +22,7 @@ The update introduces a lot of breaking changes. If you want to keep using the o - `quantile`: numeric, a vector with quantile-levels. Can alternatively be a matrix of the same shape as `predicted`. - `check_forecasts()` was replaced by a new function `validate()`. `validate()` validates the input and in that sense fulfills the purpose of `check_forecasts()`. It has different methods: `validate.default()` assigns the input a class based on their forecast type. Other methods validate the input specifically for the various forecast types. - The functionality for computing pairwise comparisons was now split from `summarise_scores()`. Instead of doing pairwise comparisons as part of summarising scores, a new function, `add_pairwise_comparison()`, was introduced that takes summarised scores as an input and adds pairwise comparisons to it. +- `add_coverage()` was reworked completely. It's new purpose is now to add coverage information to the raw forecast data (essentially fulfilling some of the functionality that was previously covered by `score_quantile()`) - The function `find_duplicates()` was renamed to `get_duplicate_forecasts()` - Changes to `avail_forecasts()` and `plot_avail_forecasts()`: - The function `avail_forecasts()` was renamed to `available_forecasts()` for consistency with `available_metrics()`. The old function, `avail_forecasts()` is still available as an alias, but will be removed in the future. diff --git a/R/add_coverage.R b/R/add_coverage.R new file mode 100644 index 000000000..684556026 --- /dev/null +++ b/R/add_coverage.R @@ -0,0 +1,83 @@ +#' @title Add Coverage Values to Quantile-Based Forecasts +#' +#' @description Adds interval coverage of central prediction intervals, +#' quantile coverage for predictive quantiles, as well as the deviation between +#' desired and actual coverage to a data.table. Forecasts should be in a +#' quantile format (following the input requirements of `score()`). +#' +#' **Interval coverage** +#' +#' Coverage for a given interval range is defined as the proportion of +#' observations that fall within the corresponding central prediction intervals. +#' Central prediction intervals are symmetric around the median and and formed +#' by two quantiles that denote the lower and upper bound. For example, the 50% +#' central prediction interval is the interval between the 0.25 and 0.75 +#' quantiles of the predictive distribution. +#' +#' The function `add_coverage()` computes the coverage per central prediction +#' interval, so the coverage will always be either `TRUE` (observed value falls +#' within the interval) or `FALSE` (observed value falls outside the interval). +#' You can summarise the coverage values to get the proportion of observations +#' that fall within the central prediction intervals. +#' +#' **Quantile coverage** +#' +#' Quantile coverage for a given quantile is defined as the proportion of +#' observed values that are smaller than the corresponding predictive quantile. +#' For example, the 0.5 quantile coverage is the proportion of observed values +#' that are smaller than the 0.5 quantile of the predictive distribution. +#' +#' **Coverage deviation** +#' +#' The coverage deviation is the difference between the desired coverage and the +#' actual coverage. For example, if the desired coverage is 90% and the actual +#' coverage is 80%, the coverage deviation is -0.1. +#' +#' @inheritParams score +#' @return a data.table with the input and columns "interval_coverage", +#' "interval_coverage_deviation", "quantile_coverage", +#' "quantile_coverage_deviation" added. +#' @importFrom data.table setcolorder +#' @examples +#' library(magrittr) # pipe operator +#' example_quantile %>% +#' add_coverage() +#' @export +#' @keywords scoring +#' @export +add_coverage <- function(data) { + stored_attributes <- get_scoringutils_attributes(data) + data <- validate(data) + forecast_unit <- get_forecast_unit(data) + data_cols <- colnames(data) # store so we can reset column order later + + # what happens if quantiles are not symmetric around the median? + # should things error? Also write tests for that. + interval_data <- quantile_to_interval(data, format = "wide") + interval_data[, interval_coverage := ifelse( + observed <= upper & observed >= lower, + TRUE, + FALSE) + ][, c("lower", "upper", "observed") := NULL] + + data[, range := get_range_from_quantile(quantile)] + + data <- merge(interval_data, data, by = unique(c(forecast_unit, "range"))) + data[, interval_coverage_deviation := interval_coverage - range / 100] + data[, quantile_coverage := observed <= predicted] + data[, quantile_coverage_deviation := quantile_coverage - quantile] + + # reset column order + new_metrics <- c("interval_coverage", "interval_coverage_deviation", + "quantile_coverage", "quantile_coverage_deviation") + setcolorder(data, unique(c(data_cols, "range", new_metrics))) + + # add coverage "metrics" to list of stored metrics + # this makes it possible to use `summarise_scores()` later on + stored_attributes[["metric_names"]] <- c( + stored_attributes[["metric_names"]], + new_metrics + ) + data <- assign_attributes(data, stored_attributes) + return(data[]) +} diff --git a/R/check-input-helpers.R b/R/check-input-helpers.R index cfaa24b2c..95869f02b 100644 --- a/R/check-input-helpers.R +++ b/R/check-input-helpers.R @@ -297,12 +297,22 @@ check_columns_present <- function(data, columns) { } assert_character(columns, min.len = 1) colnames <- colnames(data) + missing <- list() for (x in columns){ if (!(x %in% colnames)) { - msg <- paste0("Column '", x, "' not found in data") - return(msg) + missing[[x]] <- x } } + missing <- unlist(missing) + if (length(missing) > 1) { + msg <- paste0( + "Columns '", paste(missing, collapse = "', '"), "' not found in data" + ) + return(msg) + } else if (length(missing) == 1) { + msg <- paste0("Column '", missing, "' not found in data") + return(msg) + } return(TRUE) } diff --git a/R/convenience-functions.R b/R/convenience-functions.R index 870e4ac3d..31f3ab5ab 100644 --- a/R/convenience-functions.R +++ b/R/convenience-functions.R @@ -235,21 +235,13 @@ log_shift <- function(x, offset = 0, base = exp(1)) { #' example_quantile, #' c("location", "target_end_date", "target_type", "horizon", "model") #' ) - set_forecast_unit <- function(data, forecast_unit) { - - datacols <- colnames(data) - missing <- forecast_unit[!(forecast_unit %in% datacols)] - - if (length(missing) > 0) { - warning( - "Column(s) '", - missing, - "' are not columns of the data and will be ignored." - ) - forecast_unit <- intersect(forecast_unit, datacols) + data <- ensure_data.table(data) + missing <- check_columns_present(data, forecast_unit) + if (!is.logical(missing)) { + warning(missing) + forecast_unit <- intersect(forecast_unit, colnames(data)) } - keep_cols <- c(get_protected_columns(data), forecast_unit) out <- unique(data[, .SD, .SDcols = keep_cols])[] return(out) diff --git a/R/data.R b/R/data.R index 48548ffed..c57e044d8 100644 --- a/R/data.R +++ b/R/data.R @@ -19,7 +19,7 @@ #' \item{model}{name of the model that generated the forecasts} #' \item{horizon}{forecast horizon in weeks} #' } -#' @source \url{https://github.com/covid19-forecast-hub-europe/covid19-forecast-hub-europe/commit/a42867b1ea152c57e25b04f9faa26cfd4bfd8fa6/} # nolint +#' @source \url{https://github.com/covid19-forecast-hub-europe/covid19-forecast-hub-europe/commit/a42867b1ea152c57e25b04f9faa26cfd4bfd8fa6/} "example_quantile" @@ -44,7 +44,7 @@ #' \item{model}{name of the model that generated the forecasts} #' \item{horizon}{forecast horizon in weeks} #' } -#' @source \url{https://github.com/covid19-forecast-hub-europe/covid19-forecast-hub-europe/commit/a42867b1ea152c57e25b04f9faa26cfd4bfd8fa6/} # nolint +#' @source \url{https://github.com/covid19-forecast-hub-europe/covid19-forecast-hub-europe/commit/a42867b1ea152c57e25b04f9faa26cfd4bfd8fa6/} "example_point" @@ -69,7 +69,7 @@ #' \item{predicted}{predicted value} #' \item{sample_id}{id for the corresponding sample} #' } -#' @source \url{https://github.com/covid19-forecast-hub-europe/covid19-forecast-hub-europe/commit/a42867b1ea152c57e25b04f9faa26cfd4bfd8fa6/} # nolint +#' @source \url{https://github.com/covid19-forecast-hub-europe/covid19-forecast-hub-europe/commit/a42867b1ea152c57e25b04f9faa26cfd4bfd8fa6/} "example_continuous" @@ -124,7 +124,7 @@ #' \item{horizon}{forecast horizon in weeks} #' \item{predicted}{predicted value} #' } -#' @source \url{https://github.com/covid19-forecast-hub-europe/covid19-forecast-hub-europe/commit/a42867b1ea152c57e25b04f9faa26cfd4bfd8fa6/} # nolint +#' @source \url{https://github.com/covid19-forecast-hub-europe/covid19-forecast-hub-europe/commit/a42867b1ea152c57e25b04f9faa26cfd4bfd8fa6/} "example_binary" @@ -147,7 +147,7 @@ #' \item{model}{name of the model that generated the forecasts} #' \item{horizon}{forecast horizon in weeks} #' } -#' @source \url{https://github.com/covid19-forecast-hub-europe/covid19-forecast-hub-europe/commit/a42867b1ea152c57e25b04f9faa26cfd4bfd8fa6/} # nolint +#' @source \url{https://github.com/covid19-forecast-hub-europe/covid19-forecast-hub-europe/commit/a42867b1ea152c57e25b04f9faa26cfd4bfd8fa6/} "example_quantile_forecasts_only" @@ -167,7 +167,7 @@ #' \item{observed}{observed values} #' \item{location_name}{name of the country for which a prediction was made} #' } -#' @source \url{https://github.com/covid19-forecast-hub-europe/covid19-forecast-hub-europe/commit/a42867b1ea152c57e25b04f9faa26cfd4bfd8fa6/} # nolint +#' @source \url{https://github.com/covid19-forecast-hub-europe/covid19-forecast-hub-europe/commit/a42867b1ea152c57e25b04f9faa26cfd4bfd8fa6/} "example_truth_only" #' Summary information for selected metrics @@ -216,8 +216,13 @@ #' #' A named list with functions: #' - "wis" = [wis()] +#' - "overprediction" = [overprediction()] +#' - "underprediction" = [underprediction()] +#' - "dispersion" = [dispersion()] #' - "bias" = [bias_quantile()] -#' - "coverage_50" = \(...) {run_safely(..., range = 50, fun = interval_coverage_quantile)} #nolint -#' - "coverage_90" = \(...) {run_safely(..., range = 90, fun = interval_coverage_quantile)} #nolint +#' - "coverage_50" = \(...) {run_safely(..., range = 50, fun = [interval_coverage_quantile][interval_coverage_quantile()])} +#' - "coverage_90" = \(...) {run_safely(..., range = 90, fun = [interval_coverage_quantile][interval_coverage_quantile()])} +#' - "coverage_deviation" = [interval_coverage_deviation_quantile()], +#' - "ae_median" = [ae_median_quantile()] #' @keywords info "metrics_quantile" diff --git a/R/get_-functions.R b/R/get_-functions.R index 22aaa47a9..b55e1ef4f 100644 --- a/R/get_-functions.R +++ b/R/get_-functions.R @@ -193,6 +193,8 @@ get_protected_columns <- function(data = NULL) { protected_columns <- c( "predicted", "observed", "sample_id", "quantile", "upper", "lower", "pit_value", "range", "boundary", "relative_skill", "scaled_rel_skill", + "interval_coverage", "interval_coverage_deviation", + "quantile_coverage", "quantile_coverage_deviation", available_metrics(), grep("coverage_", names(data), fixed = TRUE, value = TRUE) ) diff --git a/R/metrics-quantile.R b/R/metrics-quantile.R index f3598690e..985b00002 100644 --- a/R/metrics-quantile.R +++ b/R/metrics-quantile.R @@ -80,14 +80,21 @@ #' `overprediction`, `underprediction`, and `dispersion.` #' #' @inheritParams interval_score -#' @param predicted vector of size n with the predicted values +#' @param observed numeric vector of size n with the observed values +#' @param predicted numeric nxN matrix of predictive +#' quantiles, n (number of rows) being the number of forecasts (corresponding +#' to the number of observed values) and N +#' (number of columns) the number of quantiles per forecast. +#' If `observed` is just a single number, then predicted can just be a +#' vector of size N. #' @param quantile vector with quantile levels of size N #' @param count_median_twice if TRUE, count the median twice in the score #' @param na.rm if TRUE, ignore NA values when computing the score #' @importFrom stats weighted.mean +#' @importFrom checkmate assert_logical #' @return -#' `wis()`: a numeric vector with WIS values (one per observation), or a list -#' with separate entries if `separate_results` is `TRUE`. +#' `wis()`: a numeric vector with WIS values of size n (one per observation), +#' or a list with separate entries if `separate_results` is `TRUE`. #' @export wis <- function(observed, predicted, @@ -99,6 +106,11 @@ wis <- function(observed, assert_input_quantile(observed, predicted, quantile) reformatted <- quantile_to_interval(observed, predicted, quantile) + assert_logical(separate_results, len = 1) + assert_logical(weigh, len = 1) + assert_logical(count_median_twice, len = 1) + assert_logical(na.rm, len = 1) + if (separate_results) { cols <- c("wis", "dispersion", "underprediction", "overprediction") } else { @@ -205,7 +217,7 @@ interval_coverage_quantile <- function(observed, predicted, quantile, range = 50 if (!all(necessary_quantiles %in% quantile)) { warning( "To compute the coverage for a range of ", range, "%, the quantiles ", - necessary_quantiles, " are required. Returnting `NA`.") + necessary_quantiles, " are required. Returning `NA`.") return(NA) } r <- range @@ -218,6 +230,90 @@ interval_coverage_quantile <- function(observed, predicted, quantile, range = 50 } +#' @title Interval Coverage Deviation (For Quantile-Based Forecasts) +#' @description Check the agreement between desired and actual interval coverage +#' of a forecast. +#' +#' The function is similar to [interval_coverage_quantile()], +#' but looks at all provided prediction intervals instead of only one. It +#' compares nominal coverage (i.e. the desired coverage) with the actual +#' observed coverage. +#' +#' A central symmetric prediction interval is defined by a lower and an +#' upper bound formed by a pair of predictive quantiles. For example, a 50% +#' prediction interval is formed by the 0.25 and 0.75 quantiles of the +#' predictive distribution. Ideally, a forecaster should aim to cover about +#' 50% of all observed values with their 50% prediction intervals, 90% of all +#' observed values with their 90% prediction intervals, and so on. +#' +#' For every prediction interval, the deviation is computed as the difference +#' between the observed coverage and the nominal coverage +#' For a single observed value and a single prediction interval, +#' coverage is always either 0 or 1. This is not the case for a single observed +#' value and multiple prediction intervals, but it still doesn't make that much +#' sense to compare nominal (desired) coverage and actual coverage for a single +#' observation. In that sense coverage deviation only really starts to make +#' sense as a metric when averaged across multiple observations). +#' +#' Positive values of coverage deviation are an indication for underconfidence, +#' i.e. the forecaster could likely have issued a narrower forecast. Negative +#' values are an indication for overconfidence, i.e. the forecasts were too +#' narrow. +#' +#' \deqn{ +#' \textrm{coverage deviation} = +#' \mathbf{1}(\textrm{observed value falls within interval} - +#' \textrm{nominal coverage}) +#' }{ +#' coverage deviation = +#' 1(observed value falls within interval) - nominal coverage +#' } +#' The coverage deviation is then averaged across all prediction intervals. +#' The median is ignored when computing coverage deviation. +#' @inheritParams wis +#' @return A numeric vector of length n with the coverage deviation for each +#' forecast (comprising one or multiple prediction intervals). +#' @export +#' @examples +#' observed <- c(1, -15, 22) +#' predicted <- rbind( +#' c(-1, 0, 1, 2, 3), +#' c(-2, 1, 2, 2, 4), +#' c(-2, 0, 3, 3, 4) +#' ) +#' quantile <- c(0.1, 0.25, 0.5, 0.75, 0.9) +#' interval_coverage_deviation_quantile(observed, predicted, quantile) +interval_coverage_deviation_quantile <- function(observed, predicted, quantile) { + assert_input_quantile(observed, predicted, quantile) + + # transform available quantiles into central interval ranges + available_ranges <- unique(get_range_from_quantile(quantile)) + + # check if all necessary quantiles are available + necessary_quantiles <- unique(c( + (100 - available_ranges) / 2, + 100 - (100 - available_ranges) / 2) / 100 + ) + if (!all(necessary_quantiles %in% quantile)) { + missing <- necessary_quantiles[!necessary_quantiles %in% quantile] + warning( + "To compute coverage deviation, all quantiles must form central ", + "symmetric prediction intervals. Missing quantiles: ", + toString(missing), ". Returning `NA`.") + return(NA) + } + + reformatted <- quantile_to_interval(observed, predicted, quantile)[range != 0] + reformatted[, coverage := ifelse( + observed >= lower & observed <= upper, TRUE, FALSE + )] + reformatted[, coverage_deviation := coverage - range / 100] + out <- reformatted[, .(coverage_deviation = mean(coverage_deviation)), + by = c("forecast_id")] + return(out$coverage_deviation) +} + + #' @title Determines Bias of Quantile Forecasts #' #' @description @@ -364,6 +460,45 @@ bias_quantile_single_vector <- function(observed, predicted, quantile, na.rm) { } +#' @title Absolute Error of the Median (Quantile-based Version) +#' @description +#' Compute the absolute error of the median calculated as +#' \deqn{ +#' \textrm{abs}(\textrm{observed} - \textrm{median prediction}) +#' }{ +#' abs(observed - median_prediction) +#' } +#' The median prediction is the predicted value for which quantile == 0.5, +#' the function therefore requires 0.5 to be among the quantile levels in +#' `quantile`. +#' @inheritParams wis +#' @return numeric vector of length N with the absolute error of the median +#' @seealso [ae_median_sample()], [abs_error()] +#' @importFrom stats median +#' @examples +#' observed <- rnorm(30, mean = 1:30) +#' predicted_values <- matrix(rnorm(30, mean = 1:30)) +#' ae_median_quantile(observed, predicted_values, quantile = 0.5) +#' @export +#' @keywords metric +ae_median_quantile <- function(observed, predicted, quantile) { + assert_input_quantile(observed, predicted, quantile) + if (!any(quantile == 0.5)) { + warning( + "in order to compute the absolute error of the median, `0.5` must be ", + "among the quantiles given. Returning `NA`." + ) + return(NA_real_) + } + if (is.null(dim(predicted))) { + predicted <- matrix(predicted, nrow = 1) + } + predicted <- predicted[, quantile == 0.5] + abs_error_median <- abs(observed - predicted) + return(abs_error_median) +} + + ################################################################################ # Metrics with a one-to-one relationship between input and score ################################################################################ @@ -529,51 +664,3 @@ wis_one_to_one <- function(observed, } } } - - -#' @title Absolute Error of the Median (Quantile-based Version) -#' -#' @description -#' Absolute error of the median calculated as -#' -#' \deqn{ -#' \textrm{abs}(\textrm{observed} - \textrm{prediction}) -#' }{ -#' abs(observed - median_prediction) -#' } -#' -#' The function was created for internal use within [score()], but can also -#' used as a standalone function. -#' -#' @param predicted numeric vector with predictions, corresponding to the -#' quantiles in a second vector, `quantiles`. -#' @param quantiles numeric vector that denotes the quantile for the values -#' in `predicted`. Only those predictions where `quantiles == 0.5` will -#' be kept. If `quantiles` is `NULL`, then all `predicted` and -#' `observed` will be used (this is then the same as [abs_error()]) -#' @return vector with the scoring values -#' @seealso [ae_median_sample()], [abs_error()] -#' @importFrom stats median -#' @inheritParams ae_median_sample -#' @examples -#' observed <- rnorm(30, mean = 1:30) -#' predicted_values <- rnorm(30, mean = 1:30) -#' ae_median_quantile(observed, predicted_values, quantiles = 0.5) -#' @export -#' @keywords metric - -ae_median_quantile <- function(observed, predicted, quantiles = NULL) { - if (!is.null(quantiles)) { - if (!any(quantiles == 0.5) && !anyNA(quantiles)) { - return(NA_real_) - warning( - "in order to compute the absolute error of the median, `0.5` must be ", - "among the quantiles given. Maybe you want to use `abs_error()`?" - ) - } - observed <- observed[quantiles == 0.5] - predicted <- predicted[quantiles == 0.5] - } - abs_error_median <- abs(observed - predicted) - return(abs_error_median) -} diff --git a/R/metrics-sample.R b/R/metrics-sample.R index 73026bfcf..803fb4c4e 100644 --- a/R/metrics-sample.R +++ b/R/metrics-sample.R @@ -96,18 +96,17 @@ bias_sample <- function(observed, predicted) { #' @importFrom stats median #' @examples #' observed <- rnorm(30, mean = 1:30) -#' predicted_values <- rnorm(30, mean = 1:30) +#' predicted_values <- matrix(rnorm(30, mean = 1:30)) #' ae_median_sample(observed, predicted_values) #' @export #' @keywords metric ae_median_sample <- function(observed, predicted) { + assert_input_sample(observed, predicted) median_predictions <- apply( - as.matrix(predicted), MARGIN = 1, FUN = median # this is rowwise + as.matrix(predicted), MARGIN = 1, FUN = median # this is row-wise ) - ae_median <- abs(observed - median_predictions) - return(ae_median) } @@ -118,11 +117,11 @@ ae_median_sample <- function(observed, predicted) { #' Squared error of the mean calculated as #' #' \deqn{ -#' \textrm{mean}(\textrm{observed} - \textrm{prediction})^2 +#' \textrm{mean}(\textrm{observed} - \textrm{mean prediction})^2 #' }{ -#' mean(observed - mean_prediction)^2 +#' mean(observed - mean prediction)^2 #' } -#' +#' The mean prediction is calculated as the mean of the predictive samples. #' @param observed A vector with observed values of size n #' @param predicted nxN matrix of predictive samples, n (number of rows) being #' the number of data points and N (number of columns) the number of Monte @@ -131,12 +130,13 @@ ae_median_sample <- function(observed, predicted) { #' @seealso [squared_error()] #' @examples #' observed <- rnorm(30, mean = 1:30) -#' predicted_values <- rnorm(30, mean = 1:30) +#' predicted_values <- matrix(rnorm(30, mean = 1:30)) #' se_mean_sample(observed, predicted_values) #' @export #' @keywords metric se_mean_sample <- function(observed, predicted) { + assert_input_sample(observed, predicted) mean_predictions <- rowMeans(as.matrix(predicted)) se_mean <- (observed - mean_predictions)^2 diff --git a/R/pairwise-comparisons.R b/R/pairwise-comparisons.R index f23316254..eab561adb 100644 --- a/R/pairwise-comparisons.R +++ b/R/pairwise-comparisons.R @@ -28,7 +28,7 @@ #' #' @param scores A data.table of scores as produced by [score()]. #' @param metric A character vector of length one with the metric to do the -#' comparison on. The default is "auto", meaning that either "interval_score", +#' comparison on. The default is "auto", meaning that either "wis", #' "crps", or "brier_score" will be selected where available. #' @param by character vector with names of columns present in the input #' data.frame. `by` determines how pairwise comparisons will be computed. @@ -366,8 +366,8 @@ compare_two_models <- function(scores, #' @keywords internal infer_rel_skill_metric <- function(scores) { - if ("interval_score" %in% colnames(scores)) { - rel_skill_metric <- "interval_score" + if ("wis" %in% colnames(scores)) { + rel_skill_metric <- "wis" } else if ("crps" %in% colnames(scores)) { rel_skill_metric <- "crps" } else if ("brier_score" %in% colnames(scores)) { diff --git a/R/pit.R b/R/pit.R index 2e00e4b90..ee5e09c7c 100644 --- a/R/pit.R +++ b/R/pit.R @@ -189,18 +189,10 @@ pit <- function(data, data <- remove_na_observed_predicted(data) forecast_type <- get_forecast_type(data) - # if prediction type is quantile, simply extract coverage values from - # score and returned a list with named vectors if (forecast_type == "quantile") { - coverage <- - score(data, metrics = "quantile_coverage") - - coverage <- summarise_scores(coverage, - by = unique(c(by, "quantile")) - ) - # remove all existing attributes and class - coverage <- remove_scoringutils_class(coverage) - + data[, quantile_coverage := (observed <= predicted)] + coverage <- data[, .(quantile_coverage = mean(quantile_coverage)), + by = c(unique(c(by, "quantile")))] coverage <- coverage[order(quantile), .( quantile = c(quantile, 1), @@ -208,7 +200,6 @@ pit <- function(data, ), by = c(get_forecast_unit(coverage)) ] - return(coverage[]) } diff --git a/R/plot.R b/R/plot.R index f90be0599..84ae33eb1 100644 --- a/R/plot.R +++ b/R/plot.R @@ -221,7 +221,7 @@ plot_wis <- function(scores, #' produced by [score()] or [summarise_scores()]. Note that "range" must be included #' in the `by` argument when running [summarise_scores()] #' @param y The variable from the scores you want to show on the y-Axis. -#' This could be something like "interval_score" (the default) or "dispersion" +#' This could be something like "wis" (the default) or "dispersion" #' @param x The variable from the scores you want to show on the x-Axis. #' Usually this will be "model" #' @param colour Character vector of length one used to determine a variable @@ -233,18 +233,18 @@ plot_wis <- function(scores, #' @export #' @examples #' library(ggplot2) -#' scores <- score(example_quantile) -#' scores <- summarise_scores(scores, by = c("model", "target_type", "range")) +#' # scores <- score(example_quantile) +#' # scores <- summarise_scores(scores, by = c("model", "target_type", "range")) #' -#' plot_ranges(scores, x = "model") + -#' facet_wrap(~target_type, scales = "free") +#' # plot_ranges(scores, x = "model") + +#' # facet_wrap(~target_type, scales = "free") #' #' # visualise dispersion instead of interval score -#' plot_ranges(scores, y = "dispersion", x = "model") + -#' facet_wrap(~target_type) +#' # plot_ranges(scores, y = "dispersion", x = "model") + +#' # facet_wrap(~target_type) plot_ranges <- function(scores, - y = "interval_score", + y = "wis", x = "model", colour = "range") { plot <- ggplot( @@ -296,7 +296,7 @@ plot_ranges <- function(scores, #' @export #' @examples #' scores <- score(example_quantile) -#' scores <- summarise_scores(scores, by = c("model", "target_type", "range")) +#' scores <- summarise_scores(scores, by = c("model", "target_type")) #' #' plot_heatmap(scores, x = "target_type", metric = "bias") @@ -582,10 +582,10 @@ make_na <- make_NA #' @importFrom data.table dcast #' @export #' @examples -#' data.table::setDTthreads(1) # only needed to avoid issues on CRAN -#' scores <- score(example_quantile) -#' scores <- summarise_scores(scores, by = c("model", "range")) -#' plot_interval_coverage(scores) +#' # data.table::setDTthreads(1) # only needed to avoid issues on CRAN +#' # scores <- score(example_quantile) +#' # scores <- summarise_scores(scores, by = c("model", "range")) +#' # plot_interval_coverage(scores) plot_interval_coverage <- function(scores, colour = "model") { @@ -613,7 +613,7 @@ plot_interval_coverage <- function(scores, colour = "grey", linetype = "dashed" ) + - geom_line(aes(y = coverage * 100)) + + geom_line(aes(y = interval_coverage * 100)) + theme_scoringutils() + ylab("% Obs inside interval") + xlab("Nominal interval coverage") + @@ -638,9 +638,9 @@ plot_interval_coverage <- function(scores, #' @importFrom data.table dcast #' @export #' @examples -#' scores <- score(example_quantile) -#' scores <- summarise_scores(scores, by = c("model", "quantile")) -#' plot_quantile_coverage(scores) +#' # scores <- score(example_quantile) +#' # scores <- summarise_scores(scores, by = c("model", "quantile")) +#' # plot_quantile_coverage(scores) plot_quantile_coverage <- function(scores, colour = "model") { diff --git a/R/score.R b/R/score.R index 3d1dc7f07..29230e6e0 100644 --- a/R/score.R +++ b/R/score.R @@ -152,18 +152,10 @@ score.scoringutils_binary <- function(data, metrics = metrics_binary, ...) { data <- remove_na_observed_predicted(data) metrics <- validate_metrics(metrics) - # Extract the arguments passed in ... - args <- list(...) - lapply(seq_along(metrics), function(i, ...) { - metric_name <- names(metrics[i]) - fun <- metrics[[i]] - matching_args <- filter_function_args(fun, args) - - data[, (metric_name) := do.call( - fun, c(list(observed, predicted), matching_args) - )] - return() - }, ...) + data <- apply_metrics( + data, metrics, + data$observed, data$predicted, ... + ) setattr(data, "metric_names", names(metrics)) @@ -180,18 +172,10 @@ score.scoringutils_point <- function(data, metrics = metrics_point, ...) { data <- remove_na_observed_predicted(data) metrics <- validate_metrics(metrics) - # Extract the arguments passed in ... - args <- list(...) - lapply(seq_along(metrics), function(i, ...) { - metric_name <- names(metrics[i]) - fun <- metrics[[i]] - matching_args <- filter_function_args(fun, args) - - data[, (metric_name) := do.call( - fun, c(list(observed, predicted), matching_args) - )] - return() - }, ...) + data <- apply_metrics( + data, metrics, + data$observed, data$predicted, ... + ) setattr(data, "metric_names", names(metrics)) @@ -206,101 +190,67 @@ score.scoringutils_sample <- function(data, metrics = metrics_sample, ...) { forecast_unit <- attr(data, "forecast_unit") metrics <- validate_metrics(metrics) - # Extract the arguments passed in ... - args <- list(...) - lapply(seq_along(metrics), function(i, ...) { - metric_name <- names(metrics[i]) - fun <- metrics[[i]] - matching_args <- filter_function_args(fun, args) + # transpose the forecasts that belong to the same forecast unit + d_transposed <- data[, .(predicted = list(predicted), + observed = unique(observed), + scoringutils_N = length(list(sample_id))), + by = forecast_unit] - data[, (metric_name) := do.call( - fun, c(list(unique(observed), t(predicted)), matching_args) - ), by = forecast_unit] - return() - }, - ...) + # split according to number of samples and do calculations for different + # sample lengths separately + d_split <- split(d_transposed, d_transposed$scoringutils_N) - data <- data[ - , lapply(.SD, unique), - .SDcols = colnames(data) %like% paste(names(metrics), collapse = "|"), - by = forecast_unit - ] + split_result <- lapply(d_split, function(data) { + # create a matrix + observed <- data$observed + predicted <- do.call(rbind, data$predicted) + data[, c("observed", "predicted", "scoringutils_N") := NULL] + data <- apply_metrics( + data, metrics, + observed, predicted, ... + ) + return(data) + }) + data <- rbindlist(split_result) setattr(data, "metric_names", names(metrics)) return(data[]) } +#' @importFrom data.table `:=` as.data.table rbindlist %like% #' @rdname score #' @export -score.scoringutils_quantile <- function(data, metrics = NULL, ...) { - data <- validate(data) - data <- remove_na_observed_predicted(data) - forecast_unit <- attr(data, "forecast_unit") - - if (is.null(metrics)) { - metrics <- available_metrics() - } - metrics <- metrics[metrics %in% available_metrics()] - scores <- score_quantile( - data = data, - forecast_unit = forecast_unit, - metrics = metrics, - ... - ) - - setattr(scores, "metric_names", metrics[metrics %in% colnames(scores)]) - # manual hack to make sure that the correct attributes are there. - setattr(scores, "forecast_unit", forecast_unit) - setattr(scores, "forecast_type", "quantile") - scores <- new_scoringutils(scores, "scoringutils_quantile") - - return(scores[]) -} - - -#' @rdname score -#' @export -score.scoringutils_quantile_new <- function(data, metrics = metrics_quantile, ...) { +score.scoringutils_quantile <- function(data, metrics = metrics_quantile, ...) { data <- validate(data) data <- remove_na_observed_predicted(data) forecast_unit <- attr(data, "forecast_unit") metrics <- validate_metrics(metrics) - # Extract the arguments passed in ... - args <- list(...) - # transpose the forecasts that belong to the same forecast unit # make sure the quantiles and predictions are ordered in the same way d_transposed <- data[, .(predicted = list(predicted[order(quantile)]), observed = unique(observed), quantile = list(quantile[order(quantile)]), - N = length(quantile)), by = forecast_unit] + scoringutils_quantile = toString(quantile[order(quantile)])), + by = forecast_unit] # split according to quantile lengths and do calculations for different # quantile lengths separately. The function `wis()` assumes that all # forecasts have the same quantiles - d_split <- split(d_transposed, d_transposed$N) + d_split <- split(d_transposed, d_transposed$scoringutils_quantile) split_result <- lapply(d_split, function(data) { # create a matrix out of the list of predicted values and quantiles observed <- data$observed predicted <- do.call(rbind, data$predicted) quantile <- unlist(unique(data$quantile)) - data[, c("observed", "predicted", "quantile", "N") := NULL] - - # for each metric, compute score - lapply(seq_along(metrics), function(i, ...) { - metric_name <- names(metrics[i]) - fun <- metrics[[i]] - matching_args <- filter_function_args(fun, args) + data[, c("observed", "predicted", "quantile", "scoringutils_quantile") := NULL] - data[, eval(metric_name) := do.call( - fun, c(list(observed), list(predicted), list(quantile), matching_args) - )] - return() - }, - ...) + data <- apply_metrics( + data, metrics, + observed, predicted, quantile, ... + ) return(data) }) @@ -309,3 +259,18 @@ score.scoringutils_quantile_new <- function(data, metrics = metrics_quantile, .. return(data[]) } + +apply_metrics <- function(data, metrics, ...) { + expr <- expression( + data[, (metric_name) := do.call(run_safely, list(..., fun = fun))] + ) + lapply(seq_along(metrics), function(i, data, ...) { + metric_name <- names(metrics[i]) + fun <- metrics[[i]] + eval(expr) + }, data, ...) + return(data) +} + + + diff --git a/R/score_quantile.R b/R/score_quantile.R deleted file mode 100644 index f2da97f04..000000000 --- a/R/score_quantile.R +++ /dev/null @@ -1,167 +0,0 @@ -#' @title Evaluate forecasts in a Quantile-Based Format -#' -#' @inheritParams score -#' @inheritParams interval_score -#' @param count_median_twice logical that controls whether or not to count the -#' median twice when summarising (default is \code{FALSE}). Counting the -#' median twice would conceptually treat it as a 0\% prediction interval, where -#' the median is the lower as well as the upper bound. The alternative is to -#' treat the median as a single quantile forecast instead of an interval. The -#' interval score would then be better understood as an average of quantile -#' scores. -#' @param forecast_unit A character vector with the column names that define -#' the unit of a single forecast, i.e. a forecast was made for a combination -#' of the values in `forecast_unit` -#' -#' @return A data.table with appropriate scores. For more information see -#' [score()] -#' -#' @importFrom data.table ':=' as.data.table rbindlist %like% -#' -#' @author Nikos Bosse \email{nikosbosse@@gmail.com} -#' @inherit score references -#' @keywords internal - -score_quantile <- function(data, - forecast_unit, - metrics, - weigh = TRUE, - count_median_twice = FALSE, - separate_results = TRUE) { - - data <- remove_na_observed_predicted(data) - - # make sure to have both quantile as well as range format -------------------- - range_data <- quantile_to_interval( - data, - keep_quantile_col = FALSE - ) - # adds the range column to the quantile data set - quantile_data <- range_long_to_quantile( - range_data, - keep_range_col = TRUE - ) - - # to deal with point forecasts in a quantile format. This in effect adds - # a third column next to lower and upper after pivoting - range_data[is.na(range), boundary := "point"] - - range_data <- data.table::dcast(range_data, ... ~ boundary, - value.var = "predicted" - ) - - # if we only score point forecasts, it may be true that there are no columns - # upper and lower in the data.frame. If so, these need to be added - if (!all(c("upper", "lower") %in% colnames(range_data))) { - range_data[, c("upper", "lower") := NA] - } - - # set up results data.table that will then be modified throughout ------------ - res <- data.table::copy(range_data) - - # calculate scores on range format ------------------------------------------- - if ("interval_score" %in% metrics) { - # compute separate results if desired - if (separate_results) { - outcols <- c( - "interval_score", "dispersion", - "underprediction", "overprediction" - ) - } else { - outcols <- "interval_score" - } - res <- res[, eval(outcols) := do.call( - scoringutils::interval_score, - list(observed, lower, - upper, range, - weigh, - separate_results = separate_results - ) - )] - } - - # compute coverage for every single observation - if ("coverage" %in% metrics) { - res[, coverage := ifelse(observed <= upper & observed >= lower, 1, 0)] # nolint - res[, coverage_deviation := coverage - range / 100] - } - - # compute bias - if ("bias" %in% metrics) { - res[, bias := bias_range( - range = range, lower = lower, upper = upper, - observed = unique(observed) - ), - by = forecast_unit - ] - } - - # compute absolute and squared error for point forecasts - # these are marked by an NA in range, and a numeric value for point - compute_point <- any( - c("se_point, se_mean, ae_point", "ae_median", "absolute_error") %in% metrics - ) - if (compute_point && "point" %in% colnames(res)) { - res[ - is.na(range) & is.numeric(point), - `:=`(ae_point = abs_error(predicted = point, observed), - se_point = squared_error(predicted = point, observed)) - ] - } - - # calculate scores on quantile format ---------------------------------------- - # compute absolute error of the median - if ("ae_median" %in% metrics) { - quantile_data[, ae_median := ae_median_quantile( - observed, - predicted, - quantile - ), - by = forecast_unit - ] - } - - # compute quantile coverage based on quantile version - if ("quantile_coverage" %in% metrics) { - quantile_data[, quantile_coverage := (observed <= predicted)] - } - - # merge metrics computed on quantile data (i.e. ae_median, quantile_coverage) back - # into metrics computed on range data. One important side effect of this is - # that it controls whether we count the median twice for the interval score - # (row is then duplicated) or only once. However, merge only needs to happen - # if we computed either the interval score or the ae_median or quantile coverage - if (any(c("ae_median", "interval_score", "quantile_coverage") %in% metrics)) { - # delete unnecessary columns before merging back - keep_cols <- unique(c( - forecast_unit, "quantile", "ae_median", "quantile_coverage", - "boundary", "range" - )) - delete_cols <- names(quantile_data)[!(names(quantile_data) %in% keep_cols)] - quantile_data[, eval(delete_cols) := NULL] - - # duplicate median column before merging if median is to be counted twice - # if this is false, then the res will have one entry for every quantile, - # which translates to two rows for every interval, but only one for the median - if (count_median_twice) { - median <- quantile_data[quantile == 0.5, ][, boundary := "upper"] - quantile_data <- data.table::rbindlist(list(quantile_data, median)) - } - - # merge back with other metrics - merge_cols <- setdiff(keep_cols, c( - "ae_median", "quantile_coverage", "quantile", - "boundary" - )) - # specify all.x = TRUE as the point forecasts got deleted when - # going from range to quantile above - res <- merge(res, quantile_data, by = merge_cols, all.x = TRUE) - } - - # delete internal columns before returning result - res <- delete_columns( - res, c("upper", "lower", "boundary", "point", "observed") - ) - - return(res[]) -} diff --git a/R/summarise_scores.R b/R/summarise_scores.R index e62bac789..032026402 100644 --- a/R/summarise_scores.R +++ b/R/summarise_scores.R @@ -306,78 +306,3 @@ check_summary_params <- function(scores, } return(relative_skill) } - - - -#' @title Add coverage of central prediction intervals -#' -#' @description Adds a column with the coverage of central prediction intervals -#' to unsummarised scores as produced by [score()] -#' -#' The coverage values that are added are computed according to the values -#' specified in `by`. If, for example, `by = "model"`, then there will be one -#' coverage value for every model and [add_coverage()] will compute the coverage -#' for every model across the values present in all other columns which define -#' the unit of a single forecast. -#' -#' @inheritParams summarise_scores -#' @param by character vector with column names to add the coverage for. -#' @param ranges numeric vector of the ranges of the central prediction intervals -#' for which coverage values shall be added. -#' @return a data.table with unsummarised scores with columns added for the -#' coverage of the central prediction intervals. While the overall data.table -#' is still unsummarised, note that for the coverage columns some level of -#' summary is present according to the value specified in `by`. -#' @examples -#' library(magrittr) # pipe operator -#' score(example_quantile) %>% -#' add_coverage(by = c("model", "target_type")) %>% -#' summarise_scores(by = c("model", "target_type")) %>% -#' summarise_scores(fun = signif, digits = 2) -#' @export -#' @keywords scoring - -add_coverage <- function(scores, - by = NULL, - ranges = c(50, 90)) { - - stored_attributes <- get_scoringutils_attributes(scores) - if (!is.null(attr(scores, "unsummarised_scores"))) { - scores <- attr(scores, "unsummarised_scores") - } - - if (is.null(by) && !is.null(stored_attributes[["scoringutils_by"]])) { - by <- stored_attributes[["scoringutils_by"]] - } else if (is.null(by)) { - # Need to check this again. - # (mentioned in https://github.com/epiforecasts/scoringutils/issues/346) - by <- get_forecast_unit(scores) - } - - summarised_scores <- summarise_scores( - scores, - by = c(by, "range") - )[range %in% ranges] - - - # create cast formula - cast_formula <- - paste( - paste(by, collapse = "+"), - "~", - "paste0('coverage_', range)" - ) - - coverages <- dcast( - summarised_scores, - value.var = "coverage", - formula = cast_formula - ) - - scores_with_coverage <- merge(scores, coverages, by = by) - scores_with_coverage <- assign_attributes( - scores_with_coverage, stored_attributes - ) - - return(scores_with_coverage[]) -} diff --git a/R/utils.R b/R/utils.R index 3de1a9eb8..53a2d800e 100644 --- a/R/utils.R +++ b/R/utils.R @@ -5,7 +5,8 @@ #' @keywords info available_metrics <- function() { - return(unique(scoringutils::metrics$Name)) + return(unique(c(scoringutils::metrics$Name, + "wis", "coverage_50", "coverage_90"))) } @@ -251,3 +252,21 @@ run_safely <- function(..., fun) { return(result) } + +#' Ensure That an Object is a Data Table +#' @description This function ensures that an object is a data table. +#' If the object is not a data table, it is converted to one. If the object +#' is a data table, a copy of the object is returned. +#' @param data An object to ensure is a data table +#' @return A data table +#' @keywords internal +#' @importFrom data.table copy is.data.table as.data.table +ensure_data.table <- function(data) { + if (!is.data.table(data)) { + data <- as.data.table(data) + } else { + data <- copy(data) + } + return(data) +} + diff --git a/R/utils_data_handling.R b/R/utils_data_handling.R index cb9470e87..1b1302dbf 100644 --- a/R/utils_data_handling.R +++ b/R/utils_data_handling.R @@ -101,12 +101,7 @@ merge_pred_and_obs <- function(forecasts, observations, sample_to_quantile <- function(data, quantiles = c(0.05, 0.25, 0.5, 0.75, 0.95), type = 7) { - if (!is.data.table(data)) { - data <- data.table::as.data.table(data) - } else { - data <- copy(data) - } - + data <- ensure_data.table(data) reserved_columns <- c("predicted", "sample_id") by <- setdiff(colnames(data), reserved_columns) @@ -208,19 +203,10 @@ quantile_to_interval.data.frame <- function(dt, format = "long", keep_quantile_col = FALSE, ...) { - if (!is.data.table(dt)) { - dt <- data.table::as.data.table(dt) - } else { - # use copy to avoid - dt <- copy(dt) - } + dt <- ensure_data.table(dt) dt[, boundary := ifelse(quantile <= 0.5, "lower", "upper")] - dt[, range := ifelse( - boundary == "lower", - round((1 - 2 * quantile) * 100, 10), - round((2 * quantile - 1) * 100, 10) - )] + dt[, range := get_range_from_quantile(quantile)] # add median quantile median <- dt[quantile == 0.5, ] @@ -234,6 +220,10 @@ quantile_to_interval.data.frame <- function(dt, if (format == "wide") { delete_columns(dt, "quantile") dt <- dcast(dt, ... ~ boundary, value.var = "predicted") + # if there are NA values in `predicted`, this introduces a column "NA" + if ("NA" %in% colnames(dt) && all(is.na(dt[["NA"]]))) { + dt[, "NA" := NULL] + } } return(dt[]) } @@ -311,3 +301,25 @@ sample_to_range_long <- function(data, return(data[]) } + +#' Get Range Belonging to a Quantile +#' @description Every quantile can be thought of either as the lower or the +#' upper bound of a symmetric central prediction interval. This helper function +#' returns the range of the central prediction interval to which the quantile +#' belongs. +#' +#' Due to numeric instability that sometimes occurred in the past, ranges are +#' rounded to 10 decimal places. This is not a problem for the vast majority of +#' use cases, but it is something to be aware of. +#' @param quantile a numeric vector of quantile levels of size N +#' @return a numeric vector of interval ranges of size N +#' @keywords internal +get_range_from_quantile <- function(quantile) { + boundary <- ifelse(quantile <= 0.5, "lower", "upper") + range <- ifelse( + boundary == "lower", + round((1 - 2 * quantile) * 100, digits = 10), + round((2 * quantile - 1) * 100, digits = 10) + ) + return(range) +} diff --git a/R/z_globalVariables.R b/R/z_globalVariables.R index b81fb3452..89cfc2eab 100644 --- a/R/z_globalVariables.R +++ b/R/z_globalVariables.R @@ -30,9 +30,12 @@ globalVariables(c( "identifCol", "Interval_Score", "interval_range", + "interval_coverage", + "interval_coverage_deviation", "overprediction", "underprediction", "quantile_coverage", + "quantile_coverage_deviation", "LogS", "log_score", "lower", @@ -59,6 +62,7 @@ globalVariables(c( "rel_to_baseline", "relative_skill", "rn", + "sample_id", "scoringutils_InternalDuplicateCheck", "scoringutils_InternalNumCheck", "se_mean", @@ -72,6 +76,7 @@ globalVariables(c( "var_of_interest", "variable", "weight", + "wis", "wis_component_name", "x", "y", diff --git a/README.Rmd b/README.Rmd index 77584d145..0c4c41223 100644 --- a/README.Rmd +++ b/README.Rmd @@ -91,8 +91,8 @@ Forecasts can be easily and quickly scored using the `score()` function. `score( example_quantile %>% set_forecast_unit(c("location", "target_end_date", "target_type", "horizon", "model")) %>% validate() %>% + add_coverage() %>% score() %>% - add_coverage(ranges = c(50, 90), by = c("model", "target_type")) %>% summarise_scores( by = c("model", "target_type") ) %>% diff --git a/README.md b/README.md index 2b0e1d1f9..0a5f93881 100644 --- a/README.md +++ b/README.md @@ -129,8 +129,8 @@ details. Finally we summarise these scores by model and target type. example_quantile %>% set_forecast_unit(c("location", "target_end_date", "target_type", "horizon", "model")) %>% validate() %>% + add_coverage() %>% score() %>% - add_coverage(ranges = c(50, 90), by = c("model", "target_type")) %>% summarise_scores( by = c("model", "target_type") ) %>% @@ -144,15 +144,15 @@ example_quantile %>% kable() ``` -| model | target_type | interval_score | dispersion | underprediction | overprediction | coverage_deviation | bias | ae_median | coverage_50 | coverage_90 | relative_skill | scaled_rel_skill | -|:----------------------|:------------|---------------:|-----------:|----------------:|---------------:|-------------------:|--------:|----------:|------------:|------------:|---------------:|-----------------:| -| EuroCOVIDhub-baseline | Cases | 28000 | 4100 | 10000.0 | 14000.0 | -0.110 | 0.0980 | 38000 | 0.33 | 0.82 | 1.30 | 1.6 | -| EuroCOVIDhub-baseline | Deaths | 160 | 91 | 2.1 | 66.0 | 0.120 | 0.3400 | 230 | 0.66 | 1.00 | 2.30 | 3.8 | -| EuroCOVIDhub-ensemble | Cases | 18000 | 3700 | 4200.0 | 10000.0 | -0.098 | -0.0560 | 24000 | 0.39 | 0.80 | 0.82 | 1.0 | -| EuroCOVIDhub-ensemble | Deaths | 41 | 30 | 4.1 | 7.1 | 0.200 | 0.0730 | 53 | 0.88 | 1.00 | 0.60 | 1.0 | -| UMass-MechBayes | Deaths | 53 | 27 | 17.0 | 9.0 | -0.023 | -0.0220 | 78 | 0.46 | 0.88 | 0.75 | 1.3 | -| epiforecasts-EpiNow2 | Cases | 21000 | 5700 | 3300.0 | 12000.0 | -0.067 | -0.0790 | 28000 | 0.47 | 0.79 | 0.95 | 1.2 | -| epiforecasts-EpiNow2 | Deaths | 67 | 32 | 16.0 | 19.0 | -0.043 | -0.0051 | 100 | 0.42 | 0.91 | 0.98 | 1.6 | +| model | target_type | wis | overprediction | underprediction | dispersion | bias | coverage_50 | coverage_90 | coverage_deviation | ae_median | relative_skill | scaled_rel_skill | +|:----------------------|:------------|------:|---------------:|----------------:|-----------:|--------:|------------:|------------:|-------------------:|----------:|---------------:|-----------------:| +| EuroCOVIDhub-baseline | Cases | 28000 | 14000.0 | 10000.0 | 4100 | 0.0980 | 0.33 | 0.82 | -0.120 | 38000 | 1.30 | 1.6 | +| EuroCOVIDhub-baseline | Deaths | 160 | 66.0 | 2.1 | 91 | 0.3400 | 0.66 | 1.00 | 0.120 | 230 | 2.30 | 3.8 | +| EuroCOVIDhub-ensemble | Cases | 18000 | 10000.0 | 4200.0 | 3700 | -0.0560 | 0.39 | 0.80 | -0.100 | 24000 | 0.82 | 1.0 | +| EuroCOVIDhub-ensemble | Deaths | 41 | 7.1 | 4.1 | 30 | 0.0730 | 0.88 | 1.00 | 0.200 | 53 | 0.60 | 1.0 | +| UMass-MechBayes | Deaths | 53 | 9.0 | 17.0 | 27 | -0.0220 | 0.46 | 0.88 | -0.025 | 78 | 0.75 | 1.3 | +| epiforecasts-EpiNow2 | Cases | 21000 | 12000.0 | 3300.0 | 5700 | -0.0790 | 0.47 | 0.79 | -0.070 | 28000 | 0.95 | 1.2 | +| epiforecasts-EpiNow2 | Deaths | 67 | 19.0 | 16.0 | 32 | -0.0051 | 0.42 | 0.91 | -0.045 | 100 | 0.98 | 1.6 | `scoringutils` contains additional functionality to transform forecasts, to summarise scores at different levels, to visualise them, and to @@ -174,20 +174,27 @@ example_quantile %>% score %>% summarise_scores(by = c("model", "target_type", "scale")) %>% head() -#> model target_type scale interval_score dispersion -#> 1: EuroCOVIDhub-baseline Cases log 1.169972e+00 0.4373146 -#> 2: EuroCOVIDhub-baseline Cases natural 2.209046e+04 4102.5009443 -#> 3: EuroCOVIDhub-ensemble Cases log 5.500974e-01 0.1011850 -#> 4: EuroCOVIDhub-ensemble Cases natural 1.155071e+04 3663.5245788 -#> 5: epiforecasts-EpiNow2 Cases log 6.005778e-01 0.1066329 -#> 6: epiforecasts-EpiNow2 Cases natural 1.443844e+04 5664.3779484 -#> underprediction overprediction coverage_deviation bias ae_median -#> 1: 3.521964e-01 0.3804607 -0.10940217 0.09726562 1.185905e+00 -#> 2: 1.028497e+04 7702.9836957 -0.10940217 0.09726562 3.208048e+04 -#> 3: 1.356563e-01 0.3132561 -0.09785326 -0.05640625 7.410484e-01 -#> 4: 4.237177e+03 3650.0047554 -0.09785326 -0.05640625 1.770795e+04 -#> 5: 1.858699e-01 0.3080750 -0.06660326 -0.07890625 7.656591e-01 -#> 6: 3.260356e+03 5513.7058424 -0.06660326 -0.07890625 2.153070e+04 +#> model target_type scale wis overprediction +#> 1: EuroCOVIDhub-ensemble Cases natural 11550.70664 3650.004755 +#> 2: EuroCOVIDhub-baseline Cases natural 22090.45747 7702.983696 +#> 3: epiforecasts-EpiNow2 Cases natural 14438.43943 5513.705842 +#> 4: EuroCOVIDhub-ensemble Deaths natural 41.42249 7.138247 +#> 5: EuroCOVIDhub-baseline Deaths natural 159.40387 65.899117 +#> 6: UMass-MechBayes Deaths natural 52.65195 8.978601 +#> underprediction dispersion bias coverage_50 coverage_90 +#> 1: 4237.177310 3663.52458 -0.05640625 0.3906250 0.8046875 +#> 2: 10284.972826 4102.50094 0.09726562 0.3281250 0.8203125 +#> 3: 3260.355639 5664.37795 -0.07890625 0.4687500 0.7890625 +#> 4: 4.103261 30.18099 0.07265625 0.8750000 1.0000000 +#> 5: 2.098505 91.40625 0.33906250 0.6640625 1.0000000 +#> 6: 16.800951 26.87239 -0.02234375 0.4609375 0.8750000 +#> coverage_deviation ae_median +#> 1: -0.10230114 17707.95312 +#> 2: -0.11437500 32080.48438 +#> 3: -0.06963068 21530.69531 +#> 4: 0.20380682 53.13281 +#> 5: 0.12142045 233.25781 +#> 6: -0.02488636 78.47656 ``` ## Citation diff --git a/data/metrics_quantile.rda b/data/metrics_quantile.rda index 2f2607933..b5598113d 100644 Binary files a/data/metrics_quantile.rda and b/data/metrics_quantile.rda differ diff --git a/inst/create-list-available-forecasts.R b/inst/create-list-available-forecasts.R index 21a2c8682..07105a7f4 100644 --- a/inst/create-list-available-forecasts.R +++ b/inst/create-list-available-forecasts.R @@ -28,7 +28,9 @@ metrics_quantile <- list( "underprediction" = underprediction, "dispersion" = dispersion, "bias" = bias_quantile, - "coverage_50" = \(...) {run_safely(..., range = 50, fun = interval_coverage_quantile)}, - "coverage_90" = \(...) {run_safely(..., range = 90, fun = interval_coverage_quantile)} + "coverage_50" = interval_coverage_quantile, + "coverage_90" = \(...) {run_safely(..., range = 90, fun = interval_coverage_quantile)}, + "coverage_deviation" = interval_coverage_deviation_quantile, + "ae_median" = ae_median_quantile ) usethis::use_data(metrics_quantile, overwrite = TRUE) diff --git a/man/abs_error.Rd b/man/abs_error.Rd index 099937bfd..197703193 100644 --- a/man/abs_error.Rd +++ b/man/abs_error.Rd @@ -7,10 +7,14 @@ abs_error(observed, predicted) } \arguments{ -\item{observed}{A vector with observed values of size n} +\item{observed}{numeric vector of size n with the observed values} -\item{predicted}{numeric vector with predictions, corresponding to the -quantiles in a second vector, \code{quantiles}.} +\item{predicted}{numeric nxN matrix of predictive +quantiles, n (number of rows) being the number of forecasts (corresponding +to the number of observed values) and N +(number of columns) the number of quantiles per forecast. +If \code{observed} is just a single number, then predicted can just be a +vector of size N.} } \value{ vector with the absolute error diff --git a/man/add_coverage.Rd b/man/add_coverage.Rd index 33990a3bc..d6c82d467 100644 --- a/man/add_coverage.Rd +++ b/man/add_coverage.Rd @@ -1,40 +1,56 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/summarise_scores.R +% Please edit documentation in R/add_coverage.R \name{add_coverage} \alias{add_coverage} -\title{Add coverage of central prediction intervals} +\title{Add Coverage Values to Quantile-Based Forecasts} \usage{ -add_coverage(scores, by = NULL, ranges = c(50, 90)) +add_coverage(data) } \arguments{ -\item{scores}{A data.table of scores as produced by \code{\link[=score]{score()}}.} - -\item{by}{character vector with column names to add the coverage for.} - -\item{ranges}{numeric vector of the ranges of the central prediction intervals -for which coverage values shall be added.} +\item{data}{A data.frame or data.table with predicted and observed values.} } \value{ -a data.table with unsummarised scores with columns added for the -coverage of the central prediction intervals. While the overall data.table -is still unsummarised, note that for the coverage columns some level of -summary is present according to the value specified in \code{by}. +a data.table with the input and columns "interval_coverage", +"interval_coverage_deviation", "quantile_coverage", +"quantile_coverage_deviation" added. } \description{ -Adds a column with the coverage of central prediction intervals -to unsummarised scores as produced by \code{\link[=score]{score()}} +Adds interval coverage of central prediction intervals, +quantile coverage for predictive quantiles, as well as the deviation between +desired and actual coverage to a data.table. Forecasts should be in a +quantile format (following the input requirements of \code{score()}). + +\strong{Interval coverage} + +Coverage for a given interval range is defined as the proportion of +observations that fall within the corresponding central prediction intervals. +Central prediction intervals are symmetric around the median and and formed +by two quantiles that denote the lower and upper bound. For example, the 50\% +central prediction interval is the interval between the 0.25 and 0.75 +quantiles of the predictive distribution. + +The function \code{add_coverage()} computes the coverage per central prediction +interval, so the coverage will always be either \code{TRUE} (observed value falls +within the interval) or \code{FALSE} (observed value falls outside the interval). +You can summarise the coverage values to get the proportion of observations +that fall within the central prediction intervals. + +\strong{Quantile coverage} + +Quantile coverage for a given quantile is defined as the proportion of +observed values that are smaller than the corresponding predictive quantile. +For example, the 0.5 quantile coverage is the proportion of observed values +that are smaller than the 0.5 quantile of the predictive distribution. + +\strong{Coverage deviation} -The coverage values that are added are computed according to the values -specified in \code{by}. If, for example, \code{by = "model"}, then there will be one -coverage value for every model and \code{\link[=add_coverage]{add_coverage()}} will compute the coverage -for every model across the values present in all other columns which define -the unit of a single forecast. +The coverage deviation is the difference between the desired coverage and the +actual coverage. For example, if the desired coverage is 90\% and the actual +coverage is 80\%, the coverage deviation is -0.1. } \examples{ library(magrittr) # pipe operator -score(example_quantile) \%>\% - add_coverage(by = c("model", "target_type")) \%>\% - summarise_scores(by = c("model", "target_type")) \%>\% - summarise_scores(fun = signif, digits = 2) +example_quantile \%>\% + add_coverage() } \keyword{scoring} diff --git a/man/ae_median_quantile.Rd b/man/ae_median_quantile.Rd index 841850235..d96965c7c 100644 --- a/man/ae_median_quantile.Rd +++ b/man/ae_median_quantile.Rd @@ -4,38 +4,38 @@ \alias{ae_median_quantile} \title{Absolute Error of the Median (Quantile-based Version)} \usage{ -ae_median_quantile(observed, predicted, quantiles = NULL) +ae_median_quantile(observed, predicted, quantile) } \arguments{ -\item{observed}{A vector with observed values of size n} +\item{observed}{numeric vector of size n with the observed values} -\item{predicted}{numeric vector with predictions, corresponding to the -quantiles in a second vector, \code{quantiles}.} +\item{predicted}{numeric nxN matrix of predictive +quantiles, n (number of rows) being the number of forecasts (corresponding +to the number of observed values) and N +(number of columns) the number of quantiles per forecast. +If \code{observed} is just a single number, then predicted can just be a +vector of size N.} -\item{quantiles}{numeric vector that denotes the quantile for the values -in \code{predicted}. Only those predictions where \code{quantiles == 0.5} will -be kept. If \code{quantiles} is \code{NULL}, then all \code{predicted} and -\code{observed} will be used (this is then the same as \code{\link[=abs_error]{abs_error()}})} +\item{quantile}{vector with quantile levels of size N} } \value{ -vector with the scoring values +numeric vector of length N with the absolute error of the median } \description{ -Absolute error of the median calculated as - +Compute the absolute error of the median calculated as \deqn{ - \textrm{abs}(\textrm{observed} - \textrm{prediction}) + \textrm{abs}(\textrm{observed} - \textrm{median prediction}) }{ abs(observed - median_prediction) } - -The function was created for internal use within \code{\link[=score]{score()}}, but can also -used as a standalone function. +The median prediction is the predicted value for which quantile == 0.5, +the function therefore requires 0.5 to be among the quantile levels in +\code{quantile}. } \examples{ observed <- rnorm(30, mean = 1:30) -predicted_values <- rnorm(30, mean = 1:30) -ae_median_quantile(observed, predicted_values, quantiles = 0.5) +predicted_values <- matrix(rnorm(30, mean = 1:30)) +ae_median_quantile(observed, predicted_values, quantile = 0.5) } \seealso{ \code{\link[=ae_median_sample]{ae_median_sample()}}, \code{\link[=abs_error]{abs_error()}} diff --git a/man/ae_median_sample.Rd b/man/ae_median_sample.Rd index d446b3300..1e420fcc4 100644 --- a/man/ae_median_sample.Rd +++ b/man/ae_median_sample.Rd @@ -27,7 +27,7 @@ Absolute error of the median calculated as } \examples{ observed <- rnorm(30, mean = 1:30) -predicted_values <- rnorm(30, mean = 1:30) +predicted_values <- matrix(rnorm(30, mean = 1:30)) ae_median_sample(observed, predicted_values) } \seealso{ diff --git a/man/compare_two_models.Rd b/man/compare_two_models.Rd index 39780292e..1d4686a1c 100644 --- a/man/compare_two_models.Rd +++ b/man/compare_two_models.Rd @@ -22,7 +22,7 @@ compare_two_models( \item{name_model2}{character, name of the model to compare against} \item{metric}{A character vector of length one with the metric to do the -comparison on. The default is "auto", meaning that either "interval_score", +comparison on. The default is "auto", meaning that either "wis", "crps", or "brier_score" will be selected where available.} \item{one_sided}{Boolean, default is \code{FALSE}, whether two conduct a one-sided diff --git a/man/ensure_data.table.Rd b/man/ensure_data.table.Rd new file mode 100644 index 000000000..6d2457ee5 --- /dev/null +++ b/man/ensure_data.table.Rd @@ -0,0 +1,20 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utils.R +\name{ensure_data.table} +\alias{ensure_data.table} +\title{Ensure That an Object is a Data Table} +\usage{ +ensure_data.table(data) +} +\arguments{ +\item{data}{An object to ensure is a data table} +} +\value{ +A data table +} +\description{ +This function ensures that an object is a data table. +If the object is not a data table, it is converted to one. If the object +is a data table, a copy of the object is returned. +} +\keyword{internal} diff --git a/man/example_binary.Rd b/man/example_binary.Rd index e7042d6b2..47797b8cd 100644 --- a/man/example_binary.Rd +++ b/man/example_binary.Rd @@ -19,7 +19,7 @@ A data frame with 346 rows and 10 columns: } } \source{ -\url{https://github.com/covid19-forecast-hub-europe/covid19-forecast-hub-europe/commit/a42867b1ea152c57e25b04f9faa26cfd4bfd8fa6/} # nolint +\url{https://github.com/covid19-forecast-hub-europe/covid19-forecast-hub-europe/commit/a42867b1ea152c57e25b04f9faa26cfd4bfd8fa6/} } \usage{ example_binary diff --git a/man/example_continuous.Rd b/man/example_continuous.Rd index d1fba390e..354ebc5d6 100644 --- a/man/example_continuous.Rd +++ b/man/example_continuous.Rd @@ -20,7 +20,7 @@ A data frame with 13,429 rows and 10 columns: } } \source{ -\url{https://github.com/covid19-forecast-hub-europe/covid19-forecast-hub-europe/commit/a42867b1ea152c57e25b04f9faa26cfd4bfd8fa6/} # nolint +\url{https://github.com/covid19-forecast-hub-europe/covid19-forecast-hub-europe/commit/a42867b1ea152c57e25b04f9faa26cfd4bfd8fa6/} } \usage{ example_continuous diff --git a/man/example_point.Rd b/man/example_point.Rd index 62af0e44f..1eb734b76 100644 --- a/man/example_point.Rd +++ b/man/example_point.Rd @@ -19,7 +19,7 @@ A data frame with } } \source{ -\url{https://github.com/covid19-forecast-hub-europe/covid19-forecast-hub-europe/commit/a42867b1ea152c57e25b04f9faa26cfd4bfd8fa6/} # nolint +\url{https://github.com/covid19-forecast-hub-europe/covid19-forecast-hub-europe/commit/a42867b1ea152c57e25b04f9faa26cfd4bfd8fa6/} } \usage{ example_point diff --git a/man/example_quantile.Rd b/man/example_quantile.Rd index 00250e6d0..2582907e9 100644 --- a/man/example_quantile.Rd +++ b/man/example_quantile.Rd @@ -20,7 +20,7 @@ A data frame with } } \source{ -\url{https://github.com/covid19-forecast-hub-europe/covid19-forecast-hub-europe/commit/a42867b1ea152c57e25b04f9faa26cfd4bfd8fa6/} # nolint +\url{https://github.com/covid19-forecast-hub-europe/covid19-forecast-hub-europe/commit/a42867b1ea152c57e25b04f9faa26cfd4bfd8fa6/} } \usage{ example_quantile diff --git a/man/example_quantile_forecasts_only.Rd b/man/example_quantile_forecasts_only.Rd index 3fcaf2722..d789ed1e0 100644 --- a/man/example_quantile_forecasts_only.Rd +++ b/man/example_quantile_forecasts_only.Rd @@ -18,7 +18,7 @@ A data frame with 7,581 rows and 9 columns: } } \source{ -\url{https://github.com/covid19-forecast-hub-europe/covid19-forecast-hub-europe/commit/a42867b1ea152c57e25b04f9faa26cfd4bfd8fa6/} # nolint +\url{https://github.com/covid19-forecast-hub-europe/covid19-forecast-hub-europe/commit/a42867b1ea152c57e25b04f9faa26cfd4bfd8fa6/} } \usage{ example_quantile_forecasts_only diff --git a/man/example_truth_only.Rd b/man/example_truth_only.Rd index 46453ba97..f8ae05afa 100644 --- a/man/example_truth_only.Rd +++ b/man/example_truth_only.Rd @@ -15,7 +15,7 @@ A data frame with 140 rows and 5 columns: } } \source{ -\url{https://github.com/covid19-forecast-hub-europe/covid19-forecast-hub-europe/commit/a42867b1ea152c57e25b04f9faa26cfd4bfd8fa6/} # nolint +\url{https://github.com/covid19-forecast-hub-europe/covid19-forecast-hub-europe/commit/a42867b1ea152c57e25b04f9faa26cfd4bfd8fa6/} } \usage{ example_truth_only diff --git a/man/get_range_from_quantile.Rd b/man/get_range_from_quantile.Rd new file mode 100644 index 000000000..eca15af36 --- /dev/null +++ b/man/get_range_from_quantile.Rd @@ -0,0 +1,25 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utils_data_handling.R +\name{get_range_from_quantile} +\alias{get_range_from_quantile} +\title{Get Range Belonging to a Quantile} +\usage{ +get_range_from_quantile(quantile) +} +\arguments{ +\item{quantile}{a numeric vector of quantile levels of size N} +} +\value{ +a numeric vector of interval ranges of size N +} +\description{ +Every quantile can be thought of either as the lower or the +upper bound of a symmetric central prediction interval. This helper function +returns the range of the central prediction interval to which the quantile +belongs. + +Due to numeric instability that sometimes occurred in the past, ranges are +rounded to 10 decimal places. This is not a problem for the vast majority of +use cases, but it is something to be aware of. +} +\keyword{internal} diff --git a/man/interval_coverage.Rd b/man/interval_coverage.Rd index 8fbfc67d1..74256cc77 100644 --- a/man/interval_coverage.Rd +++ b/man/interval_coverage.Rd @@ -11,9 +11,14 @@ interval_coverage_quantile(observed, predicted, quantile, range = 50) interval_coverage_sample(observed, predicted, range = 50) } \arguments{ -\item{observed}{A vector with observed values of size n} +\item{observed}{numeric vector of size n with the observed values} -\item{predicted}{vector of size n with the predicted values} +\item{predicted}{numeric nxN matrix of predictive +quantiles, n (number of rows) being the number of forecasts (corresponding +to the number of observed values) and N +(number of columns) the number of quantiles per forecast. +If \code{observed} is just a single number, then predicted can just be a +vector of size N.} \item{quantile}{vector with quantile levels of size N} diff --git a/man/interval_coverage_deviation_quantile.Rd b/man/interval_coverage_deviation_quantile.Rd new file mode 100644 index 000000000..9a6029ec7 --- /dev/null +++ b/man/interval_coverage_deviation_quantile.Rd @@ -0,0 +1,75 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/metrics-quantile.R +\name{interval_coverage_deviation_quantile} +\alias{interval_coverage_deviation_quantile} +\title{Interval Coverage Deviation (For Quantile-Based Forecasts)} +\usage{ +interval_coverage_deviation_quantile(observed, predicted, quantile) +} +\arguments{ +\item{observed}{numeric vector of size n with the observed values} + +\item{predicted}{numeric nxN matrix of predictive +quantiles, n (number of rows) being the number of forecasts (corresponding +to the number of observed values) and N +(number of columns) the number of quantiles per forecast. +If \code{observed} is just a single number, then predicted can just be a +vector of size N.} + +\item{quantile}{vector with quantile levels of size N} +} +\value{ +A numeric vector of length n with the coverage deviation for each +forecast (comprising one or multiple prediction intervals). +} +\description{ +Check the agreement between desired and actual interval coverage +of a forecast. + +The function is similar to \code{\link[=interval_coverage_quantile]{interval_coverage_quantile()}}, +but looks at all provided prediction intervals instead of only one. It +compares nominal coverage (i.e. the desired coverage) with the actual +observed coverage. + +A central symmetric prediction interval is defined by a lower and an +upper bound formed by a pair of predictive quantiles. For example, a 50\% +prediction interval is formed by the 0.25 and 0.75 quantiles of the +predictive distribution. Ideally, a forecaster should aim to cover about +50\% of all observed values with their 50\% prediction intervals, 90\% of all +observed values with their 90\% prediction intervals, and so on. + +For every prediction interval, the deviation is computed as the difference +between the observed coverage and the nominal coverage +For a single observed value and a single prediction interval, +coverage is always either 0 or 1. This is not the case for a single observed +value and multiple prediction intervals, but it still doesn't make that much +sense to compare nominal (desired) coverage and actual coverage for a single +observation. In that sense coverage deviation only really starts to make +sense as a metric when averaged across multiple observations). + +Positive values of coverage deviation are an indication for underconfidence, +i.e. the forecaster could likely have issued a narrower forecast. Negative +values are an indication for overconfidence, i.e. the forecasts were too +narrow. + +\deqn{ +\textrm{coverage deviation} = +\mathbf{1}(\textrm{observed value falls within interval} - +\textrm{nominal coverage}) +}{ +coverage deviation = +1(observed value falls within interval) - nominal coverage +} +The coverage deviation is then averaged across all prediction intervals. +The median is ignored when computing coverage deviation. +} +\examples{ +observed <- c(1, -15, 22) +predicted <- rbind( + c(-1, 0, 1, 2, 3), + c(-2, 1, 2, 2, 4), + c(-2, 0, 3, 3, 4) +) +quantile <- c(0.1, 0.25, 0.5, 0.75, 0.9) +interval_coverage_deviation_quantile(observed, predicted, quantile) +} diff --git a/man/metrics_quantile.Rd b/man/metrics_quantile.Rd index 9fda39f03..ea444ee7e 100644 --- a/man/metrics_quantile.Rd +++ b/man/metrics_quantile.Rd @@ -5,7 +5,7 @@ \alias{metrics_quantile} \title{Default metrics for quantile-based forecasts.} \format{ -An object of class \code{list} of length 7. +An object of class \code{list} of length 9. } \usage{ metrics_quantile @@ -14,9 +14,14 @@ metrics_quantile A named list with functions: \itemize{ \item "wis" = \code{\link[=wis]{wis()}} +\item "overprediction" = \code{\link[=overprediction]{overprediction()}} +\item "underprediction" = \code{\link[=underprediction]{underprediction()}} +\item "dispersion" = \code{\link[=dispersion]{dispersion()}} \item "bias" = \code{\link[=bias_quantile]{bias_quantile()}} -\item "coverage_50" = \(...) {run_safely(..., range = 50, fun = interval_coverage_quantile)} #nolint -\item "coverage_90" = \(...) {run_safely(..., range = 90, fun = interval_coverage_quantile)} #nolint +\item "coverage_50" = \(...) {run_safely(..., range = 50, fun = \link[=interval_coverage_quantile]{interval_coverage_quantile})} +\item "coverage_90" = \(...) {run_safely(..., range = 90, fun = \link[=interval_coverage_quantile]{interval_coverage_quantile})} +\item "coverage_deviation" = \code{\link[=interval_coverage_deviation_quantile]{interval_coverage_deviation_quantile()}}, +\item "ae_median" = \code{\link[=ae_median_quantile]{ae_median_quantile()}} } } \keyword{info} diff --git a/man/pairwise_comparison.Rd b/man/pairwise_comparison.Rd index 9288e77fb..d30be1197 100644 --- a/man/pairwise_comparison.Rd +++ b/man/pairwise_comparison.Rd @@ -25,7 +25,7 @@ splitting) and the pairwise comparisons will be computed separately for the split data.frames.} \item{metric}{A character vector of length one with the metric to do the -comparison on. The default is "auto", meaning that either "interval_score", +comparison on. The default is "auto", meaning that either "wis", "crps", or "brier_score" will be selected where available.} \item{baseline}{character vector of length one that denotes the baseline diff --git a/man/pairwise_comparison_one_group.Rd b/man/pairwise_comparison_one_group.Rd index a7d902f15..df59b1472 100644 --- a/man/pairwise_comparison_one_group.Rd +++ b/man/pairwise_comparison_one_group.Rd @@ -10,7 +10,7 @@ pairwise_comparison_one_group(scores, metric, baseline, by, ...) \item{scores}{A data.table of scores as produced by \code{\link[=score]{score()}}.} \item{metric}{A character vector of length one with the metric to do the -comparison on. The default is "auto", meaning that either "interval_score", +comparison on. The default is "auto", meaning that either "wis", "crps", or "brier_score" will be selected where available.} \item{baseline}{character vector of length one that denotes the baseline diff --git a/man/plot_heatmap.Rd b/man/plot_heatmap.Rd index 8b1aac549..837ef4243 100644 --- a/man/plot_heatmap.Rd +++ b/man/plot_heatmap.Rd @@ -29,7 +29,7 @@ different locations. } \examples{ scores <- score(example_quantile) -scores <- summarise_scores(scores, by = c("model", "target_type", "range")) +scores <- summarise_scores(scores, by = c("model", "target_type")) plot_heatmap(scores, x = "target_type", metric = "bias") } diff --git a/man/plot_interval_coverage.Rd b/man/plot_interval_coverage.Rd index 9c7da16fe..6c4c3e985 100644 --- a/man/plot_interval_coverage.Rd +++ b/man/plot_interval_coverage.Rd @@ -21,8 +21,8 @@ ggplot object with a plot of interval coverage Plot interval coverage } \examples{ -data.table::setDTthreads(1) # only needed to avoid issues on CRAN -scores <- score(example_quantile) -scores <- summarise_scores(scores, by = c("model", "range")) -plot_interval_coverage(scores) +# data.table::setDTthreads(1) # only needed to avoid issues on CRAN +# scores <- score(example_quantile) +# scores <- summarise_scores(scores, by = c("model", "range")) +# plot_interval_coverage(scores) } diff --git a/man/plot_quantile_coverage.Rd b/man/plot_quantile_coverage.Rd index 2e6ef489e..c479fb5e3 100644 --- a/man/plot_quantile_coverage.Rd +++ b/man/plot_quantile_coverage.Rd @@ -21,7 +21,7 @@ ggplot object with a plot of interval coverage Plot quantile coverage } \examples{ -scores <- score(example_quantile) -scores <- summarise_scores(scores, by = c("model", "quantile")) -plot_quantile_coverage(scores) +# scores <- score(example_quantile) +# scores <- summarise_scores(scores, by = c("model", "quantile")) +# plot_quantile_coverage(scores) } diff --git a/man/plot_ranges.Rd b/man/plot_ranges.Rd index a4a999ff2..27922b3c0 100644 --- a/man/plot_ranges.Rd +++ b/man/plot_ranges.Rd @@ -4,7 +4,7 @@ \alias{plot_ranges} \title{Plot Metrics by Range of the Prediction Interval} \usage{ -plot_ranges(scores, y = "interval_score", x = "model", colour = "range") +plot_ranges(scores, y = "wis", x = "model", colour = "range") } \arguments{ \item{scores}{A data.frame of scores based on quantile forecasts as @@ -12,7 +12,7 @@ produced by \code{\link[=score]{score()}} or \code{\link[=summarise_scores]{summ in the \code{by} argument when running \code{\link[=summarise_scores]{summarise_scores()}}} \item{y}{The variable from the scores you want to show on the y-Axis. -This could be something like "interval_score" (the default) or "dispersion"} +This could be something like "wis" (the default) or "dispersion"} \item{x}{The variable from the scores you want to show on the x-Axis. Usually this will be "model"} @@ -31,13 +31,13 @@ sharpness / dispersion changes by range. } \examples{ library(ggplot2) -scores <- score(example_quantile) -scores <- summarise_scores(scores, by = c("model", "target_type", "range")) +# scores <- score(example_quantile) +# scores <- summarise_scores(scores, by = c("model", "target_type", "range")) -plot_ranges(scores, x = "model") + - facet_wrap(~target_type, scales = "free") +# plot_ranges(scores, x = "model") + +# facet_wrap(~target_type, scales = "free") # visualise dispersion instead of interval score -plot_ranges(scores, y = "dispersion", x = "model") + - facet_wrap(~target_type) +# plot_ranges(scores, y = "dispersion", x = "model") + +# facet_wrap(~target_type) } diff --git a/man/score.Rd b/man/score.Rd index 5cf7a1b14..c26fbb24c 100644 --- a/man/score.Rd +++ b/man/score.Rd @@ -7,7 +7,6 @@ \alias{score.scoringutils_point} \alias{score.scoringutils_sample} \alias{score.scoringutils_quantile} -\alias{score.scoringutils_quantile_new} \title{Evaluate forecasts in a data.frame format} \usage{ score(data, ...) @@ -20,9 +19,7 @@ score(data, ...) \method{score}{scoringutils_sample}(data, metrics = metrics_sample, ...) -\method{score}{scoringutils_quantile}(data, metrics = NULL, ...) - -\method{score}{scoringutils_quantile_new}(data, metrics = metrics_quantile, ...) +\method{score}{scoringutils_quantile}(data, metrics = metrics_quantile, ...) } \arguments{ \item{data}{A data.frame or data.table with predicted and observed values.} diff --git a/man/score_quantile.Rd b/man/score_quantile.Rd deleted file mode 100644 index 5f51f94ec..000000000 --- a/man/score_quantile.Rd +++ /dev/null @@ -1,62 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/score_quantile.R -\name{score_quantile} -\alias{score_quantile} -\title{Evaluate forecasts in a Quantile-Based Format} -\usage{ -score_quantile( - data, - forecast_unit, - metrics, - weigh = TRUE, - count_median_twice = FALSE, - separate_results = TRUE -) -} -\arguments{ -\item{data}{A data.frame or data.table with predicted and observed values.} - -\item{forecast_unit}{A character vector with the column names that define -the unit of a single forecast, i.e. a forecast was made for a combination -of the values in \code{forecast_unit}} - -\item{metrics}{A named list of scoring functions. Names will be used as -column names in the output. See \code{\link[=metrics_point]{metrics_point()}}, \code{\link[=metrics_binary]{metrics_binary()}}, -\code{metrics_quantile()}, and \code{\link[=metrics_sample]{metrics_sample()}} for more information on the -default metrics used.} - -\item{weigh}{if TRUE, weigh the score by alpha / 2, so it can be averaged -into an interval score that, in the limit, corresponds to CRPS. Alpha is the -decimal value that represents how much is outside a central prediction -interval (e.g. for a 90 percent central prediction interval, alpha is 0.1) -Default: \code{TRUE}.} - -\item{count_median_twice}{logical that controls whether or not to count the -median twice when summarising (default is \code{FALSE}). Counting the -median twice would conceptually treat it as a 0\\% prediction interval, where -the median is the lower as well as the upper bound. The alternative is to -treat the median as a single quantile forecast instead of an interval. The -interval score would then be better understood as an average of quantile -scores.} - -\item{separate_results}{if \code{TRUE} (default is \code{FALSE}), then the separate -parts of the interval score (dispersion penalty, penalties for over- and -under-prediction get returned as separate elements of a list). If you want a -\code{data.frame} instead, simply call \code{\link[=as.data.frame]{as.data.frame()}} on the output.} -} -\value{ -A data.table with appropriate scores. For more information see -\code{\link[=score]{score()}} -} -\description{ -Evaluate forecasts in a Quantile-Based Format -} -\references{ -Bosse NI, Gruson H, Cori A, van Leeuwen E, Funk S, Abbott S -(2022) Evaluating Forecasts with scoringutils in R. -\doi{10.48550/arXiv.2205.07090} -} -\author{ -Nikos Bosse \email{nikosbosse@gmail.com} -} -\keyword{internal} diff --git a/man/se_mean_sample.Rd b/man/se_mean_sample.Rd index d7c7d332f..c2101d4df 100644 --- a/man/se_mean_sample.Rd +++ b/man/se_mean_sample.Rd @@ -20,14 +20,15 @@ vector with the scoring values Squared error of the mean calculated as \deqn{ - \textrm{mean}(\textrm{observed} - \textrm{prediction})^2 + \textrm{mean}(\textrm{observed} - \textrm{mean prediction})^2 }{ - mean(observed - mean_prediction)^2 + mean(observed - mean prediction)^2 } +The mean prediction is calculated as the mean of the predictive samples. } \examples{ observed <- rnorm(30, mean = 1:30) -predicted_values <- rnorm(30, mean = 1:30) +predicted_values <- matrix(rnorm(30, mean = 1:30)) se_mean_sample(observed, predicted_values) } \seealso{ diff --git a/man/wis.Rd b/man/wis.Rd index a76b3c35f..057e9d04a 100644 --- a/man/wis.Rd +++ b/man/wis.Rd @@ -24,9 +24,14 @@ overprediction(observed, predicted, quantile) underprediction(observed, predicted, quantile) } \arguments{ -\item{observed}{A vector with observed values of size n} +\item{observed}{numeric vector of size n with the observed values} -\item{predicted}{vector of size n with the predicted values} +\item{predicted}{numeric nxN matrix of predictive +quantiles, n (number of rows) being the number of forecasts (corresponding +to the number of observed values) and N +(number of columns) the number of quantiles per forecast. +If \code{observed} is just a single number, then predicted can just be a +vector of size N.} \item{quantile}{vector with quantile levels of size N} @@ -46,8 +51,8 @@ Default: \code{TRUE}.} \item{na.rm}{if TRUE, ignore NA values when computing the score} } \value{ -\code{wis()}: a numeric vector with WIS values (one per observation), or a list -with separate entries if \code{separate_results} is \code{TRUE}. +\code{wis()}: a numeric vector with WIS values of size n (one per observation), +or a list with separate entries if \code{separate_results} is \code{TRUE}. \code{dispersion()}: a numeric vector with dispersion values (one per observation) diff --git a/tests/testthat/_snaps/plot_correlation/plot-correlation.svg b/tests/testthat/_snaps/plot_correlation/plot-correlation.svg index f56619980..a33e8a207 100644 --- a/tests/testthat/_snaps/plot_correlation/plot-correlation.svg +++ b/tests/testthat/_snaps/plot_correlation/plot-correlation.svg @@ -25,104 +25,146 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - -1 -0.46 -1 -0.28 -0.15 -1 -0.94 -0.32 --0.03 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +1 +0.94 +1 +0.28 +-0.03 +1 +0.46 +0.32 +0.15 +1 +0.11 +0.22 +-0.35 +0.11 1 --0.34 --0.12 --0.33 --0.25 -1 -0.11 -0.11 --0.35 -0.22 -0.06 -1 -0.99 -0.54 -0.34 -0.9 --0.38 -0.1 -1 +-0.21 +-0.15 +-0.21 +-0.09 +0.01 +1 +-0.41 +-0.32 +-0.36 +-0.09 +0.1 +0.37 +1 +-0.34 +-0.25 +-0.33 +-0.12 +0.06 +0.85 +0.64 +1 +0.99 +0.9 +0.34 +0.54 +0.1 +-0.25 +-0.41 +-0.38 +1 -interval_score -dispersion -underprediction -overprediction -coverage_deviation -bias -ae_median - - - +wis +overprediction +underprediction +dispersion +bias +coverage_50 +coverage_90 +coverage_deviation +ae_median + + + + - - - + + + + - - - + + + + - - - -ae_median -bias -coverage_deviation -overprediction -underprediction -dispersion -interval_score - -0.0 -0.5 + + + + +ae_median +coverage_deviation +coverage_90 +coverage_50 +bias +dispersion +underprediction +overprediction +wis + +0.0 +0.5 1.0 Correlation - - + + - - + + plot__correlation diff --git a/tests/testthat/_snaps/plot_heatmap/plot-heatmap.svg b/tests/testthat/_snaps/plot_heatmap/plot-heatmap.svg index 528aefb09..8c4222d9a 100644 --- a/tests/testthat/_snaps/plot_heatmap/plot-heatmap.svg +++ b/tests/testthat/_snaps/plot_heatmap/plot-heatmap.svg @@ -25,174 +25,20 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + - - - - - - - - - - - -0.1 -0.1 -0.1 -0.1 -0.1 -0.1 -0.1 -0.1 -0.1 -0.1 -0.1 -0.1 --0.06 --0.06 --0.06 --0.06 --0.06 --0.06 --0.06 --0.06 --0.06 -0.06 --0.06 --0.06 --0.08 --0.08 --0.08 --0.08 --0.08 --0.08 --0.08 --0.08 --0.08 --0.08 --0.08 +0.1 -0.08 -0.34 -0.34 -0.34 -0.34 -0.34 -0.34 -0.34 -0.34 -0.34 -0.34 -0.34 -0.34 -0.07 -0.07 -0.07 -0.07 0.07 -0.07 -0.07 -0.07 -0.07 -0.07 -0.07 -0.07 --0.02 --0.02 --0.02 --0.02 --0.02 --0.02 --0.02 --0.02 --0.02 --0.02 --0.02 +0.34 -0.02 -0.01 --0.01 --0.01 --0.01 --0.01 --0.01 --0.01 --0.01 --0.01 --0.01 --0.01 --0.01 diff --git a/tests/testthat/_snaps/plot_interval_coverage/plot-interval-coverage.svg b/tests/testthat/_snaps/plot_interval_coverage/plot-interval-coverage.svg index 548878c34..91848b1dd 100644 --- a/tests/testthat/_snaps/plot_interval_coverage/plot-interval-coverage.svg +++ b/tests/testthat/_snaps/plot_interval_coverage/plot-interval-coverage.svg @@ -57,15 +57,17 @@ 100 Nominal interval coverage % Obs inside interval -model - - - - -EuroCOVIDhub-baseline -EuroCOVIDhub-ensemble -UMass-MechBayes -epiforecasts-EpiNow2 +model + + + + + +EuroCOVIDhub-baseline +EuroCOVIDhub-ensemble +UMass-MechBayes +epiforecasts-EpiNow2 +NA plot_interval_coverage diff --git a/tests/testthat/_snaps/plot_pairwise_comparison/plot-pairwise-comparison-pval.svg b/tests/testthat/_snaps/plot_pairwise_comparison/plot-pairwise-comparison-pval.svg index 22e2408dc..458487011 100644 --- a/tests/testthat/_snaps/plot_pairwise_comparison/plot-pairwise-comparison-pval.svg +++ b/tests/testthat/_snaps/plot_pairwise_comparison/plot-pairwise-comparison-pval.svg @@ -28,8 +28,8 @@ - + @@ -37,8 +37,8 @@ < 0.001 < 0.001 1 -0.298 < 0.001 +0.298 1 0.298 < 0.001 @@ -56,9 +56,9 @@ + - @@ -72,9 +72,9 @@ < 0.001 < 0.001 1 +< 0.001 < 0.001 < 0.001 -< 0.001 1 0.007 < 0.001 diff --git a/tests/testthat/_snaps/plot_pairwise_comparison/plot-pairwise-comparison.svg b/tests/testthat/_snaps/plot_pairwise_comparison/plot-pairwise-comparison.svg index 1ff397cfe..3a7599da4 100644 --- a/tests/testthat/_snaps/plot_pairwise_comparison/plot-pairwise-comparison.svg +++ b/tests/testthat/_snaps/plot_pairwise_comparison/plot-pairwise-comparison.svg @@ -28,8 +28,8 @@ - + @@ -37,8 +37,8 @@ 1.37 1.59 1 -0.86 0.63 +0.86 1 1.16 0.73 @@ -56,9 +56,9 @@ + - @@ -72,9 +72,9 @@ 3.03 3.85 1 +0.26 0.62 0.79 -0.26 1 0.74 1.27 diff --git a/tests/testthat/_snaps/plot_quantile_coverage/plot-quantile-coverage.svg b/tests/testthat/_snaps/plot_quantile_coverage/plot-quantile-coverage.svg index bf686eedb..76808cc67 100644 --- a/tests/testthat/_snaps/plot_quantile_coverage/plot-quantile-coverage.svg +++ b/tests/testthat/_snaps/plot_quantile_coverage/plot-quantile-coverage.svg @@ -57,15 +57,17 @@ 1.00 Quantile % Obs below quantile -model - - - - -EuroCOVIDhub-baseline -EuroCOVIDhub-ensemble -UMass-MechBayes -epiforecasts-EpiNow2 +model + + + + + +EuroCOVIDhub-baseline +EuroCOVIDhub-ensemble +UMass-MechBayes +epiforecasts-EpiNow2 +NA plot_quantile_coverage diff --git a/tests/testthat/_snaps/plot_ranges/plot-ranges-dispersion.svg b/tests/testthat/_snaps/plot_ranges/plot-ranges-dispersion.svg index 812e1600f..4ad667f8a 100644 --- a/tests/testthat/_snaps/plot_ranges/plot-ranges-dispersion.svg +++ b/tests/testthat/_snaps/plot_ranges/plot-ranges-dispersion.svg @@ -25,42 +25,42 @@ - - - - - - - - - - - + - - - - - - - - - - + - - - - - - - - - - - + - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + @@ -82,54 +82,54 @@ - - - - - - - - - - - + - - - - - - - - - - + + - - - - - - - - - - - + - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/tests/testthat/_snaps/plot_ranges/plot-ranges-interval.svg b/tests/testthat/_snaps/plot_ranges/plot-ranges-interval.svg index 15c9ee3e3..98a9a883c 100644 --- a/tests/testthat/_snaps/plot_ranges/plot-ranges-interval.svg +++ b/tests/testthat/_snaps/plot_ranges/plot-ranges-interval.svg @@ -25,42 +25,42 @@ - - - - - - - - - - - + - - - - - - - - - - + - - - - - - - - - - - + - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + @@ -82,54 +82,54 @@ - - - - - - - - - - - + - - - - - - - - - - + + - - - - - - - - - - - + - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + @@ -203,7 +203,7 @@ model -interval_score +wis 0 25 diff --git a/tests/testthat/_snaps/plot_score_table/plot-score-table.svg b/tests/testthat/_snaps/plot_score_table/plot-score-table.svg index 95f9fe247..25b15d296 100644 --- a/tests/testthat/_snaps/plot_score_table/plot-score-table.svg +++ b/tests/testthat/_snaps/plot_score_table/plot-score-table.svg @@ -25,62 +25,78 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - -10000 -9000 -50 -10000 -2000 -2000 -30 -3000 -5000 -2000 -20 -2000 -7000 -5000 -9 -6000 -0.002 -0.05 --0.02 --0.06 -0.2 -0.008 --0.02 --0.04 -20000 -10000 -80 -10000 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +9000 +10000 +10000 +50 +5000 +7000 +6000 +9 +2000 +5000 +2000 +20 +2000 +2000 +3000 +30 +0.008 +0.2 +-0.04 +-0.02 +0.6 +0.5 +0.4 +0.5 +0.9 +0.9 +0.8 +0.9 +0.05 +0.002 +-0.06 +-0.02 +10000 +20000 +10000 +80 @@ -93,20 +109,24 @@ - - - + + + + - - - -interval_score -dispersion -underprediction -overprediction -coverage_deviation -bias -ae_median + + + + +wis +overprediction +underprediction +dispersion +bias +coverage_50 +coverage_90 +coverage_deviation +ae_median plot_score_table diff --git a/tests/testthat/_snaps/plot_wis/plot-wis-flip.svg b/tests/testthat/_snaps/plot_wis/plot-wis-flip.svg index 758a3c147..c315cf06d 100644 --- a/tests/testthat/_snaps/plot_wis/plot-wis-flip.svg +++ b/tests/testthat/_snaps/plot_wis/plot-wis-flip.svg @@ -25,14 +25,14 @@ - + - + - + @@ -43,16 +43,16 @@ - + - + - + diff --git a/tests/testthat/_snaps/plot_wis/plot-wis-no-relative.svg b/tests/testthat/_snaps/plot_wis/plot-wis-no-relative.svg index 987072ca4..fea309214 100644 --- a/tests/testthat/_snaps/plot_wis/plot-wis-no-relative.svg +++ b/tests/testthat/_snaps/plot_wis/plot-wis-no-relative.svg @@ -25,14 +25,14 @@ - + - + - + @@ -43,16 +43,16 @@ - + - + - + diff --git a/tests/testthat/_snaps/plot_wis/plot-wis.svg b/tests/testthat/_snaps/plot_wis/plot-wis.svg index 5328b4779..a2bdf8653 100644 --- a/tests/testthat/_snaps/plot_wis/plot-wis.svg +++ b/tests/testthat/_snaps/plot_wis/plot-wis.svg @@ -25,14 +25,14 @@ - + - + - + @@ -43,16 +43,16 @@ - + - + - + diff --git a/tests/testthat/setup.R b/tests/testthat/setup.R index c157fb958..a236f299d 100644 --- a/tests/testthat/setup.R +++ b/tests/testthat/setup.R @@ -1,8 +1,13 @@ # load common required test packages library(ggplot2, quietly = TRUE) +library(data.table) suppressMessages(library(magrittr)) -# compute quantile scores +metrics_no_cov <- metrics_quantile[!grepl("coverage", names(metrics_quantile))] +metrics_no_cov_no_ae <- metrics_no_cov[!grepl("ae", names(metrics_no_cov))] + + +# compute scores scores_quantile <- suppressMessages(score(example_quantile)) scores_continuous <- suppressMessages(score(data = example_continuous)) scores_point <- suppressMessages(score(example_point)) diff --git a/tests/testthat/test-add_coverage.R b/tests/testthat/test-add_coverage.R index fab1e72a1..689b8640b 100644 --- a/tests/testthat/test-add_coverage.R +++ b/tests/testthat/test-add_coverage.R @@ -1,31 +1,13 @@ -ex_coverage <- scores_quantile[model == "EuroCOVIDhub-ensemble"] +ex_coverage <- example_quantile[model == "EuroCOVIDhub-ensemble"] test_that("add_coverage() works as expected", { - expect_error( - add_coverage(ex_coverage, by = c("model", "target_type"), range = c()) - ) - expect_error( - add_coverage(ex_coverage, by = c("model", "target_type")), NA - ) - cov <- add_coverage( - scores_quantile, by = c("model", "target_type"), range = c(10, 20) - ) - expect_equal( - grep("coverage_", colnames(cov), value = TRUE), - c("coverage_deviation", "coverage_10", "coverage_20") - ) -}) - + expect_no_condition(cov <- add_coverage(example_quantile)) -test_that("Order of `add_coverage()` and `summarise_scores()` doesn't matter", { - # Need to update test. Turns out the order does matter... - # see https://github.com/epiforecasts/scoringutils/issues/367 - pw1 <- add_coverage(ex_coverage, by = "model") - pw1_sum <- summarise_scores(pw1, by = "model") - - pw2 <- summarise_scores(ex_coverage, by = "model") - pw2 <- add_coverage(pw2) + required_names <- c( + "range", "interval_coverage", "interval_coverage_deviation", + "quantile_coverage", "quantile_coverage_deviation" + ) + expect_equal(colnames(cov), c(colnames(example_quantile), required_names)) - # expect_true(all(pw1_sum == pw2, na.rm = TRUE)) - # expect_true(all(names(attributes(pw2)) == names(attributes(pw1_sum)))) + expect_equal(nrow(cov), nrow(example_quantile)) }) diff --git a/tests/testthat/test-bias.R b/tests/testthat/test-bias.R deleted file mode 100644 index 42eefa34f..000000000 --- a/tests/testthat/test-bias.R +++ /dev/null @@ -1,311 +0,0 @@ -test_that("bias_sample() throws an error when missing observed", { - observed <- rpois(10, lambda = 1:10) - predicted <- replicate(50, rpois(n = 10, lambda = 1:10)) - - expect_error( - bias_sample(predicted = predicted), - 'argument "observed" is missing, with no default' - ) -}) - -test_that("bias_sample() throws an error when missing 'predicted'", { - observed <- rpois(10, lambda = 1:10) - predicted <- replicate(50, rpois(n = 10, lambda = 1:10)) - - expect_error( - bias_sample(observed = observed), - 'argument "predicted" is missing, with no default' - ) -}) - -test_that("bias_sample() works for integer observed and predicted", { - observed <- rpois(10, lambda = 1:10) - predicted <- replicate(10, rpois(10, lambda = 1:10)) - output <- bias_sample( - observed = observed, - predicted = predicted - ) - expect_equal( - length(output), - length(observed) - ) - expect_equal( - class(output), - "numeric" - ) -}) - -test_that("bias_sample() works for continuous observed values and predicted", { - observed <- rnorm(10) - predicted <- replicate(10, rnorm(10)) - output <- bias_sample( - observed = observed, - predicted = predicted - ) - expect_equal( - length(output), - length(observed) - ) - expect_equal( - class(output), - "numeric" - ) -}) - -test_that("bias_sample() works as expected", { - observed <- rpois(30, lambda = 1:30) - predicted <- replicate(200, rpois(n = 30, lambda = 1:30)) - expect_true(all(bias_sample(observed, predicted) == bias_sample(observed, predicted))) - - ## continuous forecasts - observed <- rnorm(30, mean = 1:30) - predicted <- replicate(200, rnorm(30, mean = 1:30)) - - scoringutils2 <- bias_sample(observed, predicted) - scoringutils <- bias_sample(observed, predicted) - - expect_equal(scoringutils, scoringutils2) -}) - - -test_that("bias_quantile() works as expected", { - predicted <- c(1, 2, 3) - quantiles <- c(0.1, 0.5, 0.9) - expect_equal( - bias_quantile(observed = 2, predicted, quantiles), - 0 - ) - predicted <- c(0, 1, 2) - quantiles <- c(0.1, 0.5, 0.9) - expect_equal( - bias_quantile(observed = 2, predicted, quantiles), - -0.8 - ) - - predicted <- c( - 705.500, 1127.000, 4006.250, 4341.500, 4709.000, 4821.996, - 5340.500, 5451.000, 5703.500, 6087.014, 6329.500, 6341.000, - 6352.500, 6594.986, 6978.500, 7231.000, 7341.500, 7860.004, - 7973.000, 8340.500, 8675.750, 11555.000, 11976.500 - ) - - quantile <- c(0.01, 0.025, seq(0.05, 0.95, 0.05), 0.975, 0.99) - - observed <- 8062 - expect_equal(bias_quantile(observed, predicted, quantile), -0.8) -}) - -test_that("bias_quantile handles matrix input", { - observed <- seq(10, 0, length.out = 4) - predicted <- matrix(1:12, ncol = 3) - quantiles <- c(0.1, 0.5, 0.9) - expect_equal( - bias_quantile(observed, predicted, quantiles), - c(-1.0, -0.8, 0.8, 1.0) - ) -}) - - -test_that("bias_quantile() handles vector that is too long", { - predicted <- c(NA, 1, 2, 3) - quantiles <- c(0.1, 0.5, 0.9) - - expect_error( - bias_quantile(observed = 2, predicted, quantiles), - "Assertion on 'quantile' failed: Must have length 4, but has length 3." - ) -}) - -test_that("bias_quantile() handles NA values", { - predicted <- c(NA, 1, 2) - quantiles <- c(0.1, 0.5, 0.9) - expect_equal( - bias_quantile(observed = 2, predicted, quantiles), - -0.8 - ) - predicted <- c(0, 1, 2) - quantiles <- c(0.1, 0.5, NA) - expect_equal( - bias_quantile(observed = 2, predicted, quantiles), - -1 - ) - expect_equal( - bias_quantile(observed = 2, predicted, quantiles, na.rm = FALSE), - NA_real_ - ) -}) - -test_that("bias_quantile() errors if no predictions", { - expect_error( - bias_quantile(observed = 2, numeric(0), numeric(0)), - "Assertion on 'quantile' failed: Must have length >= 1, but has length 0" - ) -}) - -test_that("bias_quantile() returns correct bias if value below the median", { - predicted <- c(1, 2, 4, 5) - quantiles <- c(0.1, 0.3, 0.7, 0.9) - suppressMessages( - expect_equal(bias_quantile(observed = 1, predicted, quantiles), 0.8) - ) -}) - -test_that("bias_quantile() returns correct bias if value above median", { - predicted <- c(1, 2, 4, 5) - quantiles <- c(0.1, 0.3, 0.7, 0.9) - suppressMessages( - expect_equal(bias_quantile(observed = 5, predicted, quantiles), -0.8) - ) -}) - -test_that("bias_quantile() returns correct bias if value at the median", { - predicted <- c(1, 2, 3, 4) - quantiles <- c(0.1, 0.3, 0.5, 0.7) - - expect_equal(bias_quantile(observed = 3, predicted, quantiles), 0) -}) - -test_that("bias_quantile() returns 1 if true value below min prediction", { - predicted <- c(2, 3, 4, 5) - quantiles <- c(0.1, 0.3, 0.7, 0.9) - - suppressMessages( - expect_equal(bias_quantile(observed = 1, predicted, quantiles), 1) - ) -}) - -test_that("bias_quantile() returns -1 if true value above max prediction", { - predicted <- c(1, 2, 3, 4) - quantiles <- c(0.1, 0.3, 0.5, 0.7) - - expect_equal(bias_quantile(observed = 6, predicted, quantiles), -1) -}) - -test_that("bias_quantile(): quantiles must be between 0 and 1", { - predicted <- 1:4 - - # Failing example - quantiles <- c(-0.1, 0.3, 0.5, 0.8) - expect_error( - bias_quantile(observed = 3, predicted, quantiles), - "Assertion on 'quantile' failed: Element 1 is not >= 0." - ) - - # Passing counter example - quantiles <- c(0.1, 0.3, 0.5, 0.8) - expect_silent(bias_quantile(observed = 3, predicted, quantiles)) -}) - -test_that("bias_quantile(): quantiles must be increasing", { - predicted <- 1:4 - - # Failing example - quantiles <- c(0.8, 0.3, 0.5, 0.9) - expect_error( - bias_quantile(observed = 3, predicted, quantiles), - "Predictions must not be decreasing with increasing quantile level" - ) - - # Passing counter example - quantiles <- c(0.3, 0.5, 0.8, 0.9) - expect_silent(bias_quantile(observed = 3, predicted, quantiles)) -}) - -test_that("bias_quantile(): predictions must be increasing", { - predicted <- c(1, 2, 4, 3) - quantiles <- c(0.1, 0.3, 0.5, 0.9) - - expect_error( - bias_quantile(observed = 3, predicted, quantiles), - "Predictions must not be decreasing with increasing quantile level" - ) - expect_silent(bias_quantile( observed = 3, 1:4, quantiles)) -}) - -test_that("bias_quantile(): quantiles must be unique", { - predicted <- 1:4 - - # Failing example - quantiles <- c(0.3, 0.3, 0.5, 0.8) - expect_error( - bias_quantile(observed = 3, predicted, quantiles), - "Assertion on 'quantile' failed: Contains duplicated values, position 2." - ) - - # Passing example - quantiles <- c(0.3, 0.5, 0.8, 0.9) - expect_silent(bias_quantile(observed = 3, predicted, quantiles)) -}) - -test_that("bias_sample() approx equals bias_quantile() for many samples", { - set.seed(123) - - # Generate true value - observed <- 3 - - # Generate many sample predictions - predicted <- sample(rnorm(1000, mean = observed, sd = 2), 1000) - - # Get sample based bias - bias_sample_result <- bias_sample( - observed, matrix(predicted, nrow = 1) - ) - - # Convert predictions to quantiles - quantiles <- seq(0, 1, length.out = 100) - quantile_preds <- quantile(predicted, probs = quantiles) - - # Get quantile based bias - bias_quantile_result <- suppressMessages( - bias_quantile(observed, quantile_preds, quantiles) - ) - - # Difference should be small - expect_equal(bias_quantile_result, bias_sample_result, tolerance = 0.1) -}) - -test_that("bias_quantile() and bias_range() give the same result", { - predicted <- sort(rnorm(23)) - lower <- rev(predicted[1:12]) - upper <- predicted[12:23] - - range <- c(0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 95, 98) - quantiles <- c(0.01, 0.025, seq(0.05, 0.95, 0.05), 0.975, 0.99) - observed <- rnorm(1) - - range_bias <- bias_range( - lower = lower, upper = upper, - range = range, observed = observed - ) - range_quantile <- bias_quantile( - observed = observed, - predicted = predicted, - quantile = quantiles - ) - expect_equal(range_bias, range_quantile) -}) - -test_that("bias_range() works with point forecasts", { - predicted <- 1 - observed <- 1 - range <- c(0) - - expect_equal(bias_range(predicted, predicted, range, observed), 0) -}) - -test_that("bias_range(): ranges must be between 0 and 100", { - lower <- 4:1 - upper <- 5:8 - - # Failing example - range <- c(-10, 0, 10, 20) - expect_error( - bias_range(lower, upper, range, observed = 3), - "range must be between 0 and 100" - ) - - # Passing counter example - range <- c(0, 10, 20, 30) - expect_silent(bias_range(lower, upper, range, observed = 3)) -}) - diff --git a/tests/testthat/test-convenience-functions.R b/tests/testthat/test-convenience-functions.R index ecfc86653..98d784cd4 100644 --- a/tests/testthat/test-convenience-functions.R +++ b/tests/testthat/test-convenience-functions.R @@ -38,46 +38,55 @@ test_that("function transform_forecasts works", { }) +# ============================================================================ # +# `set_forecast_unit()` +# ============================================================================ # test_that("function set_forecast_unit() works", { - # some columns in the example data have duplicated information. So we can remove # these and see whether the result stays the same. - - scores1 <- suppressMessages(score(example_quantile)) - scores1 <- scores1[order(location, target_end_date, target_type, horizon, model), ] + scores1 <- scores_quantile[order(location, target_end_date, target_type, horizon, model), ] ex2 <- set_forecast_unit( example_quantile, c("location", "target_end_date", "target_type", "horizon", "model") ) - scores2 <- suppressMessages(score(ex2)) + scores2 <- score(ex2) scores2 <- scores2[order(location, target_end_date, target_type, horizon, model), ] expect_equal(scores1$interval_score, scores2$interval_score) }) +test_that("set_forecast_unit() works on input that's not a data.table", { + df <- data.frame( + a = 1:2, + b = 2:3, + c = 3:4 + ) + expect_equal( + colnames(set_forecast_unit(df, c("a", "b"))), + c("a", "b") + ) + # apparently it also works on a matrix... good to know :) + expect_equal( + names(set_forecast_unit(as.matrix(df), "a")), + "a" + ) +}) -test_that("function set_forecast_unit() gives warning when column is not there", { +test_that("function set_forecast_unit() gives warning when column is not there", { expect_warning( set_forecast_unit( example_quantile, - c("location", "target_end_date", "target_type", "horizon", "model", "test") + c("location", "target_end_date", "target_type", "horizon", "model", "test1", "test2") ) ) }) - test_that("function get_forecast_unit() and set_forecast_unit() work together", { - fu_set <- c("location", "target_end_date", "target_type", "horizon", "model") - - ex <- set_forecast_unit( - example_binary, - fu_set - ) - + ex <- set_forecast_unit(example_binary, fu_set) fu_get <- get_forecast_unit(ex) expect_equal(fu_set, fu_get) }) diff --git a/tests/testthat/test-get_-functions.R b/tests/testthat/test-get_-functions.R index 217e954bd..0a499b243 100644 --- a/tests/testthat/test-get_-functions.R +++ b/tests/testthat/test-get_-functions.R @@ -70,3 +70,42 @@ test_that("get_type() handles `NA` values", { expect_equal(get_type(c(1, NA, 3.2)), "continuous") expect_error(get_type(NA), "Can't get type: all values of are NA") }) + + +# `get_duplicate_forecasts()` ================================================== +test_that("get_duplicate_forecasts() works as expected for quantile", { + expect_equal(nrow(get_duplicate_forecasts(example_quantile)), 0) + expect_equal( + nrow( + get_duplicate_forecasts(rbind(example_quantile, example_quantile[1000:1010]))), + 22 + ) +}) + +test_that("get_duplicate_forecasts() works as expected for sample", { + expect_equal(nrow(get_duplicate_forecasts(example_continuous)), 0) + expect_equal( + nrow( + get_duplicate_forecasts(rbind(example_continuous, example_continuous[1040:1050]))), + 22 + ) +}) + + +test_that("get_duplicate_forecasts() works as expected for binary", { + expect_equal(nrow(get_duplicate_forecasts(example_binary)), 0) + expect_equal( + nrow( + get_duplicate_forecasts(rbind(example_binary, example_binary[1000:1010]))), + 22 + ) +}) + +test_that("get_duplicate_forecasts() works as expected for point", { + expect_equal(nrow(get_duplicate_forecasts(example_binary)), 0) + expect_equal( + nrow( + get_duplicate_forecasts(rbind(example_point, example_point[1010:1020]))), + 22 + ) +}) diff --git a/tests/testthat/test-get_duplicate_forecasts.R b/tests/testthat/test-get_duplicate_forecasts.R deleted file mode 100644 index 5487ea0fb..000000000 --- a/tests/testthat/test-get_duplicate_forecasts.R +++ /dev/null @@ -1,37 +0,0 @@ -test_that("get_duplicate_forecasts() works as expected for quantile", { - expect_equal(nrow(get_duplicate_forecasts(example_quantile)), 0) - expect_equal( - nrow( - get_duplicate_forecasts(rbind(example_quantile, example_quantile[1000:1010]))), - 22 - ) -}) - -test_that("get_duplicate_forecasts() works as expected for sample", { - expect_equal(nrow(get_duplicate_forecasts(example_continuous)), 0) - expect_equal( - nrow( - get_duplicate_forecasts(rbind(example_continuous, example_continuous[1040:1050]))), - 22 - ) -}) - - -test_that("get_duplicate_forecasts() works as expected for binary", { - expect_equal(nrow(get_duplicate_forecasts(example_binary)), 0) - expect_equal( - nrow( - get_duplicate_forecasts(rbind(example_binary, example_binary[1000:1010]))), - 22 - ) -}) - -test_that("get_duplicate_forecasts() works as expected for point", { - expect_equal(nrow(get_duplicate_forecasts(example_binary)), 0) - expect_equal( - nrow( - get_duplicate_forecasts(rbind(example_point, example_point[1010:1020]))), - 22 - ) -}) - diff --git a/tests/testthat/test-lower-level-check-functions.R b/tests/testthat/test-metrics-binary.R similarity index 52% rename from tests/testthat/test-lower-level-check-functions.R rename to tests/testthat/test-metrics-binary.R index 8d73aa528..9c49e7050 100644 --- a/tests/testthat/test-lower-level-check-functions.R +++ b/tests/testthat/test-metrics-binary.R @@ -1,45 +1,13 @@ -test_that("Lower-level input check functions work", { - observed <- rpois(30, lambda = 1:30) - predicted <- replicate(20, rpois(n = 30, lambda = 1:30)) - expect_equal(length(crps_sample(observed, predicted)), 30) - - # should error when wrong prediction type is given - predicted2 <- rpois(30, lambda = 1) - expect_error(crps_sample(observed, predicted2), - "Assertion on 'predicted' failed: Must be of type 'matrix', not 'integer'", - fixed = TRUE - ) - - # predictions have wrong number of rows - predicted3 <- replicate(20, rpois(n = 31, lambda = 1)) - expect_error( - crps_sample(observed, predicted3), - "Assertion on 'predicted' failed: Must have exactly 30 rows, but has 31 rows.", - # "Mismatch: 'observed' has length `30`, but 'predicted' has `31` rows.", - fixed = TRUE - ) - - # error with missing argument - expect_error(crps_sample(predicted = predicted), - 'argument "observed" is missing, with no default', - fixed = TRUE - ) - - # checks work for binary forecasts - observed <- factor(sample(c(0, 1), size = 10, replace = TRUE)) - predicted <- runif(n = 10) - expect_equal(length(brier_score(observed, predicted)), 10) - - # predictions are not between 0 and 1 - predicted2 <- predicted + 2 - expect_error( - brier_score(observed, predicted2), - "Assertion on 'predicted' failed: Element 1 is not <= 1.", - fixed = TRUE - ) -}) - - +observed <- factor(rbinom(10, size = 1, prob = 0.5)) +predicted <- c(0.425, 0.55, 0.541, 0.52, 0.13, 0.469, 0.86, 0.22, 0.74, 0.9) +df <- data.table( + observed = observed, + predicted = predicted, + model = "m1", + id = 1:10 +) + +# test input handling test_that("function throws an error when missing observed or predicted", { observed <- sample(c(0, 1), size = 10, replace = TRUE) predicted <- replicate( @@ -83,17 +51,6 @@ test_that("function throws an error for wrong format of `observed`", { }) test_that("function throws an error for wrong format of predictions", { - observed <- factor(sample(c(0, 1), size = 10, replace = TRUE)) - predicted <- runif(10, min = 0, max = 3) - expect_error( - brier_score( - observed = observed, - predicted = predicted - ), - #"For a binary forecast, all predictions should be probabilities between 0 or 1." - "Assertion on 'predicted' failed: Element 1 is not <= 1." - ) - predicted <- runif(10, min = 0, max = 1) expect_error( brier_score( @@ -114,3 +71,59 @@ test_that("function throws an error for wrong format of predictions", { fixed = TRUE ) }) + +test_that("Input checking for binary forecasts works", { + # everything correct + expect_no_condition( + scoringutils:::assert_input_binary(observed, predicted) + ) + + # predicted > 1 + expect_error( + scoringutils:::assert_input_binary(observed, predicted + 1), + "Assertion on 'predicted' failed: Element 1 is not <= 1." + ) + + # predicted < 0 + expect_error( + scoringutils:::assert_input_binary(observed, predicted - 1), + "Assertion on 'predicted' failed: Element 1 is not >= 0." + ) + + # observed value not factor + expect_error( + scoringutils:::assert_input_binary(1:10, predicted), + "Assertion on 'observed' failed: Must be of type 'factor', not 'integer'." + ) + + # observed value has not 2 levels + expect_error( + scoringutils:::assert_input_binary(factor(1:10), predicted), + "Assertion on 'observed' failed: Must have exactly 2 levels." + ) + + # observed is a single number and does not have the same length as predicted + expect_error( + scoringutils:::assert_input_binary(factor(1), predicted), + "`observed` and `predicted` need to be of same length when scoring binary forecasts", + fixed = TRUE + ) + + # predicted is a matrix + expect_error( + scoringutils:::assert_input_binary(observed, matrix(predicted)), + "Assertion on 'predicted' failed: Must be of type 'atomic vector', not 'matrix'." + ) +}) + +test_that("Binary metrics work within and outside of `score()`", { + result <- score(df) + expect_equal( + brier_score(observed, predicted), + result$brier_score + ) + expect_equal( + logs_binary(observed, predicted), + result$log_score + ) +}) diff --git a/tests/testthat/test-absolute_error.R b/tests/testthat/test-metrics-point.R similarity index 82% rename from tests/testthat/test-absolute_error.R rename to tests/testthat/test-metrics-point.R index f61493b25..2f64226df 100644 --- a/tests/testthat/test-absolute_error.R +++ b/tests/testthat/test-metrics-point.R @@ -1,25 +1,20 @@ -test_that("absolute error (sample based) works", { - observed <- rnorm(30, mean = 1:30) - predicted_values <- rnorm(30, mean = 1:30) - - scoringutils <- ae_median_sample(observed, predicted_values) - - ae <- abs(observed - predicted_values) - expect_equal(ae, scoringutils) -}) - - -# covidHubUtils-tests +# covidHubUtils-tests on absolute error ======================================== +# test are adapted from the package +# covidHubUtils, https://github.com/reichlab/covidHubUtils/ +y <- c(1, -15, 22) +forecast_quantiles_matrix <- rbind( + c(-1, 0, 1, 2, 3), + c(-2, 1, 2, 2, 4), + c(-2, 0, 3, 3, 4) +) +forecast_quantile_probs <- c(0.1, 0.25, 0.5, 0.75, 0.9) + +target_end_dates <- as.Date("2020-01-01") + c(7, 14, 7) +horizons <- c("1", "2", "1") +locations <- c("01", "01", "02") +target_variables <- rep("inc death", length(y)) test_that("abs error is correct within score, point forecast only", { - # test is adapted from the package covidHubUtils, https://github.com/reichlab/covidHubUtils/ - y <- c(1, -15, 22) - - target_end_dates <- as.Date("2020-01-01") + c(7, 14, 7) - horizons <- c("1", "2", "1") - locations <- c("01", "01", "02") - target_variables <- rep("inc death", length(y)) - forecast_target_end_dates <- rep(target_end_dates, times = 1) forecast_horizons <- rep(horizons, times = 1) @@ -73,21 +68,9 @@ test_that("abs error is correct within score, point forecast only", { }) test_that("abs error is correct, point and median forecasts different", { - y <- c(1, -15, 22) - forecast_quantiles_matrix <- rbind( - c(-1, 0, 1, 2, 3), - c(-2, 1, 2, 2, 4), - c(-2, 0, 3, 3, 4) - ) - forecast_quantile_probs <- c(0.1, 0.25, 0.5, 0.75, 0.9) forecast_quantiles_matrix <- forecast_quantiles_matrix[, 3, drop = FALSE] forecast_quantile_probs <- forecast_quantile_probs[3] - target_end_dates <- as.Date("2020-01-01") + c(7, 14, 7) - horizons <- c("1", "2", "1") - locations <- c("01", "01", "02") - target_variables <- rep("inc death", length(y)) - forecast_target_end_dates <- rep(target_end_dates, times = 1 + ncol(forecast_quantiles_matrix)) forecast_horizons <- rep(horizons, times = 1 + ncol(forecast_quantiles_matrix)) @@ -145,21 +128,9 @@ test_that("abs error is correct, point and median forecasts different", { }) test_that("abs error is correct, point and median forecasts same", { - y <- c(1, -15, 22) - forecast_quantiles_matrix <- rbind( - c(-1, 0, 1, 2, 3), - c(-2, 1, 2, 2, 4), - c(-2, 0, 3, 3, 4) - ) - forecast_quantile_probs <- c(0.1, 0.25, 0.5, 0.75, 0.9) forecast_quantiles_matrix <- forecast_quantiles_matrix[, 3, drop = FALSE] forecast_quantile_probs <- forecast_quantile_probs[3] - target_end_dates <- as.Date("2020-01-01") + c(7, 14, 7) - horizons <- c("1", "2", "1") - locations <- c("01", "01", "02") - target_variables <- rep("inc death", length(y)) - forecast_target_end_dates <- rep(target_end_dates, times = 1 + ncol(forecast_quantiles_matrix)) forecast_horizons <- rep(horizons, times = 1 + ncol(forecast_quantiles_matrix)) @@ -196,7 +167,6 @@ test_that("abs error is correct, point and median forecasts same", { stringsAsFactors = FALSE ) - # bring in scoringutils format truth_scoringutils <- data.table::as.data.table(test_truth) fc_scoringutils <- data.table::as.data.table(test_forecasts) @@ -221,7 +191,5 @@ test_that("abs error is correct, point and median forecasts same", { ) expected <- abs(y - point_forecast) - - # expect_equal(actual$abs_error, expected) expect_equal(eval$ae_point, expected) }) diff --git a/tests/testthat/test-interval_score.R b/tests/testthat/test-metrics-quantile.R similarity index 51% rename from tests/testthat/test-interval_score.R rename to tests/testthat/test-metrics-quantile.R index 2a75c6fdb..8dd6d6a22 100644 --- a/tests/testthat/test-interval_score.R +++ b/tests/testthat/test-metrics-quantile.R @@ -1,12 +1,84 @@ -test_that("wis works, median only", { +observed <- c(1, -15, 22) +predicted <- rbind( + c(-1, 0, 1, 2, 3), + c(-2, 1, 2, 2, 4), + c(-2, 0, 3, 3, 4) +) +quantile <- c(0.1, 0.25, 0.5, 0.75, 0.9) + +# covidHubUtils test: +y <- c(1, -15, 22) +forecast_quantiles_matrix <- rbind( + c(-1, 0, 1, 2, 3), + c(-2, 1, 2, 2, 4), + c(-2, 0, 3, 3, 4) +) +forecast_quantile_probs <- c(0.1, 0.25, 0.5, 0.75, 0.9) + + +# ============================================================================ # +# Input handling =============================================================== +# ============================================================================ # +test_that("Input checking for quantile forecasts works", { + # everything correct + expect_no_condition( + scoringutils:::assert_input_quantile(observed, predicted, quantile) + ) + + # quantile > 1 + expect_error( + scoringutils:::assert_input_quantile(observed, predicted, quantile + 1), + "Assertion on 'quantile' failed: Element 1 is not <= 1." + ) + + # quantile < 0 + expect_error( + scoringutils:::assert_input_quantile(observed, predicted, quantile - 1), + "Assertion on 'quantile' failed: Element 1 is not >= 0." + ) + + # 10 observations, but only 3 forecasts + expect_error( + scoringutils:::assert_input_quantile(1:10, predicted, quantile), + "Assertion on 'predicted' failed: Must have exactly 10 rows, but has 3 rows." + ) + + # observed value is a factor + expect_error( + scoringutils:::assert_input_quantile(factor(1:10), predicted, quantile), + "Assertion on 'observed' failed: Must be of type 'numeric', not 'factor'." + ) + + # observed is a single number and does not have the same length as predicted + # There seems to be an issue with the error message: there is one \n to many + # such that the test fails when executed alone, but works when executed + # together with others. + expect_error( + scoringutils:::assert_input_quantile(1, predicted, quantile), + "Assertion failed. One of the following must apply:\n * check_numeric_vector(predicted): Must be of type 'atomic vector',\n * not 'matrix'\n * check_matrix(predicted): Must have exactly 1 rows, but has 3 rows", + fixed = TRUE + ) + + # predicted is a vector + expect_error( + scoringutils:::assert_input_quantile(observed, as.vector(predicted), quantile), + "Assertion on 'predicted' failed: Must be of type 'matrix', not 'double'." + ) +}) + + +# ============================================================================ # +# wis ========================================================================== +# ============================================================================ #a +test_that("wis works standalone, median only", { y <- c(1, -15, 22) lower <- upper <- predicted_quantile <- c(1, 2, 3) quantile_probs <- 0.5 actual <- interval_score(y, - lower = lower, upper = upper, - weigh = TRUE, - interval_range = 0 + lower = lower, upper = upper, + weigh = TRUE, + interval_range = 0 ) actual_wis <- wis( @@ -20,7 +92,7 @@ test_that("wis works, median only", { expect_identical(actual, expected) }) -test_that("WIS works within score for median forecast", { +test_that("`wis()` works within score for median forecast", { test_data <- data.frame( observed = c(1, -15, 22), predicted = 1:3, @@ -28,13 +100,14 @@ test_that("WIS works within score for median forecast", { model = "model1", date = 1:3 ) - eval <- scoringutils::score(test_data, - count_median_twice = TRUE + eval <- score( + test_data, + count_median_twice = TRUE, metrics = metrics_no_cov ) - expect_equal(eval$ae_median, eval$interval_score) + expect_equal(eval$ae_median, eval$wis) }) -test_that("wis works, 1 interval only", { +test_that("`wis()` equals `interval_score()`, 1 interval only", { y <- c(1, -15, 22) lower <- c(0, 1, 0) upper <- c(2, 2, 3) @@ -61,7 +134,7 @@ test_that("wis works, 1 interval only", { expect_identical(actual_wis, expected) }) -test_that("WIS works within score for one interval", { +test_that("wis() works within score for one interval", { test_data <- data.frame( observed = rep(c(1, -15, 22), times = 2), quantile = rep(c(0.25, 0.75), each = 3), @@ -70,9 +143,10 @@ test_that("WIS works within score for one interval", { date = rep(1:3, times = 2) ) - eval <- suppressMessages(scoringutils::score(test_data, - count_median_twice = TRUE - )) + eval <- score( + test_data, + count_median_twice = TRUE, metrics = list(wis = wis) + ) eval <- summarise_scores(eval, by = c("model", "date")) @@ -82,10 +156,10 @@ test_that("WIS works within score for one interval", { expected <- (upper - lower) * (alpha / 2) + c(0, 1 - (-15), 22 - 3) - expect_equal(expected, eval$interval_score) + expect_equal(expected, eval$wis) }) -test_that("wis works, 1 interval and median", { +test_that("`wis()` works 1 interval and median", { test_data <- data.frame( observed = rep(c(1, -15, 22), times = 3), quantile = rep(c(0.25, 0.5, 0.75), each = 3), @@ -94,8 +168,9 @@ test_that("wis works, 1 interval and median", { date = rep(1:3, times = 3) ) - eval <- scoringutils::score(test_data, - count_median_twice = TRUE + eval <- score( + test_data, + count_median_twice = TRUE, metrics = metrics_no_cov ) eval <- summarise_scores(eval, by = c("model", "date")) @@ -117,19 +192,10 @@ test_that("wis works, 1 interval and median", { count_median_twice = TRUE ) - expect_identical(eval$interval_score, expected) + expect_identical(eval$wis, expected) expect_identical(actual_wis, expected) }) -# covidHubUtils test: -y <- c(1, -15, 22) -forecast_quantiles_matrix <- rbind( - c(-1, 0, 1, 2, 3), - c(-2, 1, 2, 2, 4), - c(-2, 0, 3, 3, 4) -) -forecast_quantile_probs <- c(0.1, 0.25, 0.5, 0.75, 0.9) - test_that("wis works, 2 intervals and median", { test_data <- data.frame( observed = rep(c(1, -15, 22), times = 5), @@ -142,8 +208,9 @@ test_that("wis works, 2 intervals and median", { date = rep(1:3, times = 5) ) - eval <- scoringutils::score(test_data, - count_median_twice = TRUE + eval <- score( + test_data, + count_median_twice = TRUE, metrics = metrics_no_cov ) eval <- summarise_scores(eval, by = c("model", "date")) @@ -168,7 +235,7 @@ test_that("wis works, 2 intervals and median", { ) expect_equal( - as.numeric(eval$interval_score), + as.numeric(eval$wis), as.numeric(expected) ) expect_identical(actual_wis, expected) @@ -228,8 +295,9 @@ test_that("wis is correct, median only - test corresponds to covidHubUtils", { data_formatted <- merge(forecasts_formated, truth_formatted) - eval <- scoringutils::score(data_formatted, - count_median_twice = FALSE + eval <- score( + data_formatted, + count_median_twice = FALSE, metrics = metrics_no_cov ) expected <- abs(y - forecast_quantiles_matrix[, 1]) @@ -241,11 +309,10 @@ test_that("wis is correct, median only - test corresponds to covidHubUtils", { count_median_twice = FALSE ) - expect_equal(eval$interval_score, expected) + expect_equal(eval$wis, expected) expect_equal(actual_wis, expected) }) - test_that("wis is correct, 1 interval only - test corresponds to covidHubUtils", { forecast_quantiles_matrix <- forecast_quantiles_matrix[, c(1, 5), drop = FALSE] forecast_quantile_probs <- forecast_quantile_probs[c(1, 5)] @@ -299,15 +366,15 @@ test_that("wis is correct, 1 interval only - test corresponds to covidHubUtils", data_formatted <- merge(forecasts_formated, truth_formatted) - eval <- suppressMessages(scoringutils::score(data_formatted, - count_median_twice = FALSE + eval <- suppressMessages(score(data_formatted, + count_median_twice = FALSE, metrics = metrics_no_cov_no_ae )) eval <- summarise_scores(eval, - by = c( - "model", "location", "target_variable", - "target_end_date", "forecast_date", "horizon" - ) + by = c( + "model", "location", "target_variable", + "target_end_date", "forecast_date", "horizon" + ) ) alpha1 <- 0.2 @@ -321,11 +388,10 @@ test_that("wis is correct, 1 interval only - test corresponds to covidHubUtils", count_median_twice = FALSE ) - expect_equal(eval$interval_score, expected) + expect_equal(eval$wis, expected) expect_equal(actual_wis, expected) }) - test_that("wis is correct, 2 intervals and median - test corresponds to covidHubUtils", { target_end_dates <- as.Date("2020-01-01") + c(7, 14, 7) horizons <- c("1", "2", "1") @@ -376,15 +442,15 @@ test_that("wis is correct, 2 intervals and median - test corresponds to covidHub data_formatted <- merge(forecasts_formated, truth_formatted) - eval <- scoringutils::score(data_formatted, - count_median_twice = FALSE + eval <- score(data_formatted, + count_median_twice = FALSE, metrics = metrics_no_cov ) eval <- summarise_scores(eval, - by = c( - "model", "location", "target_variable", - "target_end_date", "forecast_date", "horizon" - ) + by = c( + "model", "location", "target_variable", + "target_end_date", "forecast_date", "horizon" + ) ) alpha1 <- 0.2 @@ -402,7 +468,7 @@ test_that("wis is correct, 2 intervals and median - test corresponds to covidHub count_median_twice = FALSE ) - expect_equal(eval$interval_score, expected) + expect_equal(eval$wis, expected) }) test_that("Quantlie score and interval score yield the same result, weigh = FALSE", { @@ -431,20 +497,24 @@ test_that("Quantlie score and interval score yield the same result, weigh = FALS ) qs_lower <- quantile_score(observed, - predicted = lower, - quantile = alpha / 2, - weigh = w + predicted = lower, + quantile = alpha / 2, + weigh = w ) qs_upper <- quantile_score(observed, - predicted = upper, - quantile = 1 - alpha / 2, - weigh = w + predicted = upper, + quantile = 1 - alpha / 2, + weigh = w ) expect_equal((qs_lower + qs_upper) / 2, is) expect_equal(wis, is) } }) + +# ============================================================================ # +# Quantile score ============================================================= # +# ============================================================================ # test_that("Quantlie score and interval score yield the same result, weigh = TRUE", { observed <- rnorm(10, mean = 1:10) alphas <- c(0.1, 0.5, 0.9) @@ -471,14 +541,14 @@ test_that("Quantlie score and interval score yield the same result, weigh = TRUE ) qs_lower <- quantile_score(observed, - predicted = lower, - quantile = alpha / 2, - weigh = w + predicted = lower, + quantile = alpha / 2, + weigh = w ) qs_upper <- quantile_score(observed, - predicted = upper, - quantile = 1 - alpha / 2, - weigh = w + predicted = upper, + quantile = 1 - alpha / 2, + weigh = w ) expect_equal((qs_lower + qs_upper) / 2, is) expect_equal(wis, is) @@ -495,3 +565,229 @@ test_that("wis works with separate results", { expect_equal(wis$wis, wis$dispersion + wis$overprediction + wis$underprediction) }) + +# ============================================================================ # +# overprediction, underprediction, dispersion ================================ # +# ============================================================================ # +test_that("wis is the sum of overprediction, underprediction, dispersion", { + wis <- wis( + observed = y, + predicted = forecast_quantiles_matrix, + quantile = forecast_quantile_probs + ) + + d <- dispersion(y, forecast_quantiles_matrix, forecast_quantile_probs) + o <- overprediction(y, forecast_quantiles_matrix, forecast_quantile_probs) + u <- underprediction(y, forecast_quantiles_matrix, forecast_quantile_probs) + + expect_equal(wis, d + o + u) +}) + + +# ============================================================================ # +# `interval_coverage_quantile` =============================================== # +# ============================================================================ # +test_that("interval_coverage_quantile works", { + expect_equal( + interval_coverage_quantile(observed, predicted, quantile, range = 50), + c(TRUE, FALSE, FALSE) + ) +}) + +test_that("interval_coverage_quantile rejects wrong inputs", { + expect_error( + interval_coverage_quantile(observed, predicted, quantile, range = c(50, 0)), + "Assertion on 'range' failed: Must have length 1." + ) +}) + + +# ============================================================================ # +# `interval_coverage_deviation_quantile` ===================================== # +# ============================================================================ # +test_that("interval_coverage_deviation_quantile works", { + existing_ranges <- unique(get_range_from_quantile(quantile)) + expect_equal(existing_ranges, c(80, 50, 0)) + + cov_50 <- interval_coverage_quantile(observed, predicted, quantile, range = c(50)) + cov_80 <- interval_coverage_quantile(observed, predicted, quantile, range = c(80)) + manual <- 0.5 * (cov_50 - 0.5) + 0.5 * (cov_80 - 0.8) + + expect_equal( + interval_coverage_deviation_quantile(observed, predicted, quantile), + manual + ) +}) + + +# ============================================================================ # +# `bias_quantile` ============================================================== +# ============================================================================ # +test_that("bias_quantile() works as expected", { + predicted <- c(1, 2, 3) + quantiles <- c(0.1, 0.5, 0.9) + expect_equal( + bias_quantile(observed = 2, predicted, quantiles), + 0 + ) + predicted <- c(0, 1, 2) + quantiles <- c(0.1, 0.5, 0.9) + expect_equal( + bias_quantile(observed = 2, predicted, quantiles), + -0.8 + ) + + predicted <- c( + 705.500, 1127.000, 4006.250, 4341.500, 4709.000, 4821.996, + 5340.500, 5451.000, 5703.500, 6087.014, 6329.500, 6341.000, + 6352.500, 6594.986, 6978.500, 7231.000, 7341.500, 7860.004, + 7973.000, 8340.500, 8675.750, 11555.000, 11976.500 + ) + + quantile <- c(0.01, 0.025, seq(0.05, 0.95, 0.05), 0.975, 0.99) + + observed <- 8062 + expect_equal(bias_quantile(observed, predicted, quantile), -0.8) +}) + +test_that("bias_quantile handles matrix input", { + observed <- seq(10, 0, length.out = 4) + predicted <- matrix(1:12, ncol = 3) + quantiles <- c(0.1, 0.5, 0.9) + expect_equal( + bias_quantile(observed, predicted, quantiles), + c(-1.0, -0.8, 0.8, 1.0) + ) +}) + + +test_that("bias_quantile() handles vector that is too long", { + predicted <- c(NA, 1, 2, 3) + quantiles <- c(0.1, 0.5, 0.9) + + expect_error( + bias_quantile(observed = 2, predicted, quantiles), + "Assertion on 'quantile' failed: Must have length 4, but has length 3." + ) +}) + +test_that("bias_quantile() handles NA values", { + predicted <- c(NA, 1, 2) + quantiles <- c(0.1, 0.5, 0.9) + expect_equal( + bias_quantile(observed = 2, predicted, quantiles), + -0.8 + ) + predicted <- c(0, 1, 2) + quantiles <- c(0.1, 0.5, NA) + expect_equal( + bias_quantile(observed = 2, predicted, quantiles), + -1 + ) + expect_equal( + bias_quantile(observed = 2, predicted, quantiles, na.rm = FALSE), + NA_real_ + ) +}) + +test_that("bias_quantile() errors if no predictions", { + expect_error( + bias_quantile(observed = 2, numeric(0), numeric(0)), + "Assertion on 'quantile' failed: Must have length >= 1, but has length 0" + ) +}) + +test_that("bias_quantile() returns correct bias if value below the median", { + predicted <- c(1, 2, 4, 5) + quantiles <- c(0.1, 0.3, 0.7, 0.9) + suppressMessages( + expect_equal(bias_quantile(observed = 1, predicted, quantiles), 0.8) + ) +}) + +test_that("bias_quantile() returns correct bias if value above median", { + predicted <- c(1, 2, 4, 5) + quantiles <- c(0.1, 0.3, 0.7, 0.9) + suppressMessages( + expect_equal(bias_quantile(observed = 5, predicted, quantiles), -0.8) + ) +}) + +test_that("bias_quantile() returns correct bias if value at the median", { + predicted <- c(1, 2, 3, 4) + quantiles <- c(0.1, 0.3, 0.5, 0.7) + + expect_equal(bias_quantile(observed = 3, predicted, quantiles), 0) +}) + +test_that("bias_quantile() returns 1 if true value below min prediction", { + predicted <- c(2, 3, 4, 5) + quantiles <- c(0.1, 0.3, 0.7, 0.9) + + suppressMessages( + expect_equal(bias_quantile(observed = 1, predicted, quantiles), 1) + ) +}) + +test_that("bias_quantile() returns -1 if true value above max prediction", { + predicted <- c(1, 2, 3, 4) + quantiles <- c(0.1, 0.3, 0.5, 0.7) + + expect_equal(bias_quantile(observed = 6, predicted, quantiles), -1) +}) + +test_that("bias_quantile(): quantiles must be between 0 and 1", { + predicted <- 1:4 + + # Failing example + quantiles <- c(-0.1, 0.3, 0.5, 0.8) + expect_error( + bias_quantile(observed = 3, predicted, quantiles), + "Assertion on 'quantile' failed: Element 1 is not >= 0." + ) + + # Passing counter example + quantiles <- c(0.1, 0.3, 0.5, 0.8) + expect_silent(bias_quantile(observed = 3, predicted, quantiles)) +}) + +test_that("bias_quantile(): quantiles must be increasing", { + predicted <- 1:4 + + # Failing example + quantiles <- c(0.8, 0.3, 0.5, 0.9) + expect_error( + bias_quantile(observed = 3, predicted, quantiles), + "Predictions must not be decreasing with increasing quantile level" + ) + + # Passing counter example + quantiles <- c(0.3, 0.5, 0.8, 0.9) + expect_silent(bias_quantile(observed = 3, predicted, quantiles)) +}) + +test_that("bias_quantile(): predictions must be increasing", { + predicted <- c(1, 2, 4, 3) + quantiles <- c(0.1, 0.3, 0.5, 0.9) + + expect_error( + bias_quantile(observed = 3, predicted, quantiles), + "Predictions must not be decreasing with increasing quantile level" + ) + expect_silent(bias_quantile( observed = 3, 1:4, quantiles)) +}) + +test_that("bias_quantile(): quantiles must be unique", { + predicted <- 1:4 + + # Failing example + quantiles <- c(0.3, 0.3, 0.5, 0.8) + expect_error( + bias_quantile(observed = 3, predicted, quantiles), + "Assertion on 'quantile' failed: Contains duplicated values, position 2." + ) + + # Passing example + quantiles <- c(0.3, 0.5, 0.8, 0.9) + expect_silent(bias_quantile(observed = 3, predicted, quantiles)) +}) diff --git a/tests/testthat/test-metrics-range.R b/tests/testthat/test-metrics-range.R new file mode 100644 index 000000000..bd0290e9f --- /dev/null +++ b/tests/testthat/test-metrics-range.R @@ -0,0 +1,45 @@ +test_that("bias_quantile() and bias_range() give the same result", { + predicted <- sort(rnorm(23)) + lower <- rev(predicted[1:12]) + upper <- predicted[12:23] + + range <- c(0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 95, 98) + quantiles <- c(0.01, 0.025, seq(0.05, 0.95, 0.05), 0.975, 0.99) + observed <- rnorm(1) + + range_bias <- bias_range( + lower = lower, upper = upper, + range = range, observed = observed + ) + range_quantile <- bias_quantile( + observed = observed, + predicted = predicted, + quantile = quantiles + ) + expect_equal(range_bias, range_quantile) +}) + +test_that("bias_range() works with point forecasts", { + predicted <- 1 + observed <- 1 + range <- c(0) + + expect_equal(bias_range(predicted, predicted, range, observed), 0) +}) + +test_that("bias_range(): ranges must be between 0 and 100", { + lower <- 4:1 + upper <- 5:8 + + # Failing example + range <- c(-10, 0, 10, 20) + expect_error( + bias_range(lower, upper, range, observed = 3), + "range must be between 0 and 100" + ) + + # Passing counter example + range <- c(0, 10, 20, 30) + expect_silent(bias_range(lower, upper, range, observed = 3)) +}) + diff --git a/tests/testthat/test-metrics-sample.R b/tests/testthat/test-metrics-sample.R new file mode 100644 index 000000000..ded7b52ae --- /dev/null +++ b/tests/testthat/test-metrics-sample.R @@ -0,0 +1,146 @@ +test_that("Input handling", { + observed <- rpois(30, lambda = 1:30) + predicted <- replicate(20, rpois(n = 30, lambda = 1:30)) + expect_equal(length(crps_sample(observed, predicted)), 30) + + # should error when wrong prediction type is given + predicted2 <- rpois(30, lambda = 1) + expect_error(crps_sample(observed, predicted2), + "Assertion on 'predicted' failed: Must be of type 'matrix', not 'integer'", + fixed = TRUE + ) + + # predictions have wrong number of rows + predicted3 <- replicate(20, rpois(n = 31, lambda = 1)) + expect_error( + crps_sample(observed, predicted3), + "Assertion on 'predicted' failed: Must have exactly 30 rows, but has 31 rows.", + # "Mismatch: 'observed' has length `30`, but 'predicted' has `31` rows.", + fixed = TRUE + ) + + # error with missing argument + expect_error(crps_sample(predicted = predicted), + 'argument "observed" is missing, with no default', + fixed = TRUE + ) +}) + + + +test_that("bias_sample() throws an error when missing observed", { + observed <- rpois(10, lambda = 1:10) + predicted <- replicate(50, rpois(n = 10, lambda = 1:10)) + + expect_error( + bias_sample(predicted = predicted), + 'argument "observed" is missing, with no default' + ) +}) + +test_that("bias_sample() throws an error when missing 'predicted'", { + observed <- rpois(10, lambda = 1:10) + predicted <- replicate(50, rpois(n = 10, lambda = 1:10)) + + expect_error( + bias_sample(observed = observed), + 'argument "predicted" is missing, with no default' + ) +}) + +test_that("bias_sample() works for integer observed and predicted", { + observed <- rpois(10, lambda = 1:10) + predicted <- replicate(10, rpois(10, lambda = 1:10)) + output <- bias_sample( + observed = observed, + predicted = predicted + ) + expect_equal( + length(output), + length(observed) + ) + expect_equal( + class(output), + "numeric" + ) +}) + +test_that("bias_sample() works for continuous observed values and predicted", { + observed <- rnorm(10) + predicted <- replicate(10, rnorm(10)) + output <- bias_sample( + observed = observed, + predicted = predicted + ) + expect_equal( + length(output), + length(observed) + ) + expect_equal( + class(output), + "numeric" + ) +}) + +test_that("bias_sample() works as expected", { + observed <- rpois(30, lambda = 1:30) + predicted <- replicate(200, rpois(n = 30, lambda = 1:30)) + expect_true(all(bias_sample(observed, predicted) == bias_sample(observed, predicted))) + + ## continuous forecasts + observed <- rnorm(30, mean = 1:30) + predicted <- replicate(200, rnorm(30, mean = 1:30)) + + scoringutils2 <- bias_sample(observed, predicted) + scoringutils <- bias_sample(observed, predicted) + + expect_equal(scoringutils, scoringutils2) +}) + + +test_that("bias_sample() approx equals bias_quantile() for many samples", { + set.seed(123) + + # Generate true value + observed <- 3 + + # Generate many sample predictions + predicted <- sample(rnorm(1000, mean = observed, sd = 2), 1000) + + # Get sample based bias + bias_sample_result <- bias_sample( + observed, matrix(predicted, nrow = 1) + ) + + # Convert predictions to quantiles + quantiles <- seq(0, 1, length.out = 100) + quantile_preds <- quantile(predicted, probs = quantiles) + + # Get quantile based bias + bias_quantile_result <- suppressMessages( + bias_quantile(observed, quantile_preds, quantiles) + ) + + # Difference should be small + expect_equal(bias_quantile_result, bias_sample_result, tolerance = 0.1) +}) + + +# `ae_median_sample` =========================================================== +test_that("ae_median_sample works", { + observed <- rnorm(30, mean = 1:30) + predicted_values <- rnorm(30, mean = 1:30) + scoringutils <- ae_median_sample(observed, matrix(predicted_values)) + ae <- abs(observed - predicted_values) + expect_equal(ae, scoringutils) +}) + +# `mad_sample()` =============================================================== +test_that("function throws an error when missing 'predicted'", { + predicted <- replicate(50, rpois(n = 10, lambda = 1:10)) + + expect_error( + mad_sample() + ) +}) + diff --git a/tests/testthat/test-pairwise_comparison.R b/tests/testthat/test-pairwise_comparison.R index 3d0120adb..74275a06a 100644 --- a/tests/testthat/test-pairwise_comparison.R +++ b/tests/testthat/test-pairwise_comparison.R @@ -53,7 +53,7 @@ test_that("pairwise_comparison() works", { ) # evaluate the toy forecasts, once with and once without a baseline model specified - eval <- suppressMessages(score(data_formatted)) + eval <- score(data_formatted) # check with relative skills eval_without_rel_skill <- summarise_scores( @@ -85,7 +85,7 @@ test_that("pairwise_comparison() works", { # prepare scores for the code Johannes Bracher wrote scores_johannes <- data.table::copy(eval_without_baseline) # doesn't matter which one data.table::setnames(scores_johannes, - old = c("location", "target_end_date", "interval_score"), + old = c("location", "target_end_date", "wis"), new = c("unit", "timezero", "wis") ) @@ -238,7 +238,7 @@ test_that("pairwise_comparison() works", { model = rep(c("model1", "model2", "model3"), each = 10), date = as.Date("2020-01-01") + rep(1:5, each = 2), location = c(1, 2), - interval_score = (abs(rnorm(30))), + wis = (abs(rnorm(30))), ae_median = (abs(rnorm(30))) ) diff --git a/tests/testthat/test-plot_heatmap.R b/tests/testthat/test-plot_heatmap.R index 9118ff21f..d6453bd45 100644 --- a/tests/testthat/test-plot_heatmap.R +++ b/tests/testthat/test-plot_heatmap.R @@ -1,9 +1,7 @@ library(ggplot2, quietly = TRUE) test_that("plot_heatmap() works as expected", { - scores <- suppressMessages( - summarise_scores(scores_quantile, by = c("model", "target_type", "range")) - ) + scores <- summarise_scores(scores_quantile, by = c("model", "target_type")) p <- plot_heatmap(scores, x = "target_type", metric = "bias") expect_s3_class(p, "ggplot") skip_on_cran() diff --git a/tests/testthat/test-plot_interval_coverage.R b/tests/testthat/test-plot_interval_coverage.R index 04f203b03..0e885219f 100644 --- a/tests/testthat/test-plot_interval_coverage.R +++ b/tests/testthat/test-plot_interval_coverage.R @@ -1,11 +1,8 @@ -library(ggplot2, quietly = TRUE) - test_that("plot_interval_coverage() works as expected", { - scores <- suppressMessages( - summarise_scores(scores_quantile, by = c("model", "range")) - ) - p <- plot_interval_coverage(scores) + coverage <- add_coverage(example_quantile) |> + summarise_scores(by = c("model", "range")) + p <- plot_interval_coverage(coverage) expect_s3_class(p, "ggplot") skip_on_cran() - vdiffr::expect_doppelganger("plot_interval_coverage", p) + suppressWarnings(vdiffr::expect_doppelganger("plot_interval_coverage", p)) }) diff --git a/tests/testthat/test-plot_quantile_coverage.R b/tests/testthat/test-plot_quantile_coverage.R index 6c3593c04..060b9be26 100644 --- a/tests/testthat/test-plot_quantile_coverage.R +++ b/tests/testthat/test-plot_quantile_coverage.R @@ -1,9 +1,9 @@ test_that("plot_quantile_coverage() works as expected", { - scores <- suppressMessages( - summarise_scores(scores_quantile, by = c("model", "quantile")) - ) - p <- plot_quantile_coverage(scores) + coverage <- add_coverage(example_quantile) |> + summarise_scores(by = c("model", "quantile")) + + p <- plot_quantile_coverage(coverage) expect_s3_class(p, "ggplot") skip_on_cran() - vdiffr::expect_doppelganger("plot_quantile_coverage", p) + suppressWarnings(vdiffr::expect_doppelganger("plot_quantile_coverage", p)) }) diff --git a/tests/testthat/test-plot_ranges.R b/tests/testthat/test-plot_ranges.R index e9ae5575b..b4dec124e 100644 --- a/tests/testthat/test-plot_ranges.R +++ b/tests/testthat/test-plot_ranges.R @@ -1,13 +1,18 @@ -sum_scores <- suppressMessages( - summarise_scores(scores_quantile, by = c("model", "target_type", "range")) -) +m <- modifyList(metrics_no_cov_no_ae, list("bias" = NULL)) + +sum_scores <- copy(example_quantile) %>% + .[, interval_range := scoringutils:::get_range_from_quantile(quantile)] |> + score(metrics = m) |> + summarise_scores(by = c("model", "target_type", "interval_range")) + +sum_scores[, range := interval_range] test_that("plot_ranges() works as expected with interval score", { p <- plot_ranges(sum_scores, x = "model") + - facet_wrap(~target_type, scales = "free") + facet_wrap(~target_type, scales = "free") expect_s3_class(p, "ggplot") skip_on_cran() - vdiffr::expect_doppelganger("plot_ranges_interval", p) + suppressWarnings(vdiffr::expect_doppelganger("plot_ranges_interval", p)) }) test_that("plot_ranges() works as expected with dispersion", { @@ -15,5 +20,5 @@ test_that("plot_ranges() works as expected with dispersion", { facet_wrap(~target_type) expect_s3_class(p, "ggplot") skip_on_cran() - vdiffr::expect_doppelganger("plot_ranges_dispersion", p) + suppressWarnings(vdiffr::expect_doppelganger("plot_ranges_dispersion", p)) }) diff --git a/tests/testthat/test-plot_score_table.R b/tests/testthat/test-plot_score_table.R index 8336de7a9..5ffc0b029 100644 --- a/tests/testthat/test-plot_score_table.R +++ b/tests/testthat/test-plot_score_table.R @@ -1,7 +1,6 @@ test_that("plot_score_table() works as expected", { p <- suppressMessages( scores_quantile %>% - add_coverage(by = c("model")) %>% summarise_scores(by = c("model")) %>% summarise_scores(by = c("model"), fun = signif, digits = 1) %>% plot_score_table() diff --git a/tests/testthat/test-score.R b/tests/testthat/test-score.R index 9b4040723..a67575984 100644 --- a/tests/testthat/test-score.R +++ b/tests/testthat/test-score.R @@ -168,17 +168,13 @@ test_that("score.scoringutils_point() errors with only NA values", { test_that("score_quantile correctly handles separate results = FALSE", { df <- example_quantile[model == "EuroCOVIDhub-ensemble" & target_type == "Cases" & location == "DE"] - eval <- suppressMessages( - score( - df[!is.na(predicted)], - separate_results = FALSE - ) - ) + eval <- score(df[!is.na(predicted)], separate_results = FALSE) expect_equal( nrow(eval) > 1, TRUE ) + expect_true(all(names(metrics_quantile) %in% colnames(eval))) }) @@ -191,10 +187,12 @@ test_that("score() quantile produces desired metrics", { quantile = rep(c(0.1, 0.9), times = 10) ) - out <- suppressMessages(score(data = data)) + out <- suppressWarnings(suppressMessages( + score(data = data, metrics = metrics_no_cov)) + ) metric_names <- c( "dispersion", "underprediction", "overprediction", - "bias", "ae_median", "coverage_deviation" + "bias", "ae_median" ) expect_true(all(metric_names %in% colnames(out))) @@ -227,27 +225,16 @@ test_that("all quantile and range formats yield the same result", { expect_equal(sort(eval1$ae_median), sort(ae)) }) -test_that("function produces output even if only some metrics are chosen", { - example <- scoringutils::example_quantile - - eval <- suppressMessages(score(example, metrics = "coverage")) - - expect_equal( - nrow(eval) > 1, - TRUE - ) -}) - test_that("WIS is the same with other metrics omitted or included", { eval <- suppressMessages(score(example_quantile, - metrics = "interval_score" + metrics = list("wis" = wis) )) eval2 <- scores_quantile expect_equal( - sum(eval$interval_score), - sum(eval2$interval_score) + sum(eval$wis), + sum(eval2$wis) ) }) diff --git a/tests/testthat/test-sharpness.R b/tests/testthat/test-sharpness.R deleted file mode 100644 index 12dcc4c9f..000000000 --- a/tests/testthat/test-sharpness.R +++ /dev/null @@ -1,7 +0,0 @@ -test_that("function throws an error when missing 'predicted'", { - predicted <- replicate(50, rpois(n = 10, lambda = 1:10)) - - expect_error( - mad_sample() - ) -}) diff --git a/tests/testthat/test-summarise_scores.R b/tests/testthat/test-summarise_scores.R index dc3de70e3..dbd2cde4b 100644 --- a/tests/testthat/test-summarise_scores.R +++ b/tests/testthat/test-summarise_scores.R @@ -1,6 +1,4 @@ test_that("summarise_scores() works without any arguments", { - expect_true("quantile" %in% names(scores_quantile)) - summarised_scores <- summarise_scores(scores_quantile) expect_false("quantile" %in% names(summarised_scores)) diff --git a/tests/testthat/test-utils_data_handling.R b/tests/testthat/test-utils_data_handling.R index e627485ab..4521c1f73 100644 --- a/tests/testthat/test-utils_data_handling.R +++ b/tests/testthat/test-utils_data_handling.R @@ -22,7 +22,7 @@ test_that("range_long_to_quantile works", { -test_that("quantile_to_interval works", { +test_that("quantile_to_interval.data.frame() works", { quantile <- data.frame( date = as.Date("2020-01-01") + 1:10, model = "model1", @@ -30,7 +30,6 @@ test_that("quantile_to_interval works", { predicted = c(2:11, 4:13), quantile = rep(c(0.25, 0.75), each = 10) ) - long <- data.frame( date = as.Date("2020-01-01") + 1:10, model = "model1", @@ -39,19 +38,38 @@ test_that("quantile_to_interval works", { range = 50, boundary = rep(c("lower", "upper"), each = 10) ) - long2 <- as.data.frame(quantile_to_interval( quantile, keep_quantile_col = FALSE )) - data.table::setcolorder(long2, names(long)) - # for some reason this is needed to pass the unit tests on gh actions long2$boundary <- as.character(long2$boundary) long$boundary <- as.character(long$boundary) - expect_equal(long, as.data.frame(long2)) + + # check that it handles NA values + setDT(quantile) + quantile[c(1, 3, 11, 13), c("observed", "predicted", "quantile") := NA] + # in this instance, a problem appears because there is an NA value both + # for the upper and lower bound. + expect_message( + quantile_to_interval( + quantile, + keep_quantile_col = FALSE, + format = "wide" + ), + "Aggregate function missing, defaulting to 'length'" + ) + quantile <- quantile[-c(1, 3), ] + wide2 <- scoringutils:::quantile_to_interval( + quantile, + keep_quantile_col = FALSE, + format = "wide" + ) + expect_equal(nrow(wide2), 10) + expect_true(!("NA") %in% colnames(wide2)) + expect_equal(sum(wide2$lower, na.rm = TRUE), 59) }) diff --git a/vignettes/scoringutils.Rmd b/vignettes/scoringutils.Rmd index 6b55c25b5..4f6989834 100644 --- a/vignettes/scoringutils.Rmd +++ b/vignettes/scoringutils.Rmd @@ -223,7 +223,6 @@ For quantile-based forecasts we are often interested in specific coverage-levels ```{r} score(example_quantile) %>% - add_coverage(ranges = c(50, 90), by = c("model", "target_type")) %>% summarise_scores(by = c("model", "target_type")) %>% summarise_scores(fun = signif, digits = 2) ``` @@ -304,20 +303,20 @@ example_quantile[quantile %in% seq(0.1, 0.9, 0.1), ] %>% facet_grid(model ~ target_type) ``` -Another way to look at calibration are interval coverage and quantile coverage. Interval coverage is the percentage of true values that fall inside a given central prediction interval. Quantile coverage is the percentage of observed values that fall below a given quantile level. + -In order to plot interval coverage, you need to include "range" in the `by` argument to `summarise_scores()`. The green area on the plot marks conservative behaviour, i.e. your empirical coverage is greater than it nominally need be (e.g. 55% of true values covered by all 50% central prediction intervals.) + -```{r} +```{r, eval=FALSE, include=FALSE} example_quantile %>% score() %>% summarise_scores(by = c("model", "range")) %>% plot_interval_coverage() ``` -To visualise quantile coverage, you need to include "quantile" in `by`. Again, the green area corresponds to conservative forecasts, where central prediction intervals would cover more than needed. + -```{r} +```{r, eval=FALSE, include=FALSE} example_quantile %>% score() %>% summarise_scores(by = c("model", "quantile")) %>% @@ -377,11 +376,11 @@ example_quantile %>% plot_correlation() ``` -### Scores by interval ranges + -If you would like to see how different forecast interval ranges contribute to average scores, you can visualise scores by interval range: + -```{r} +```{r, eval = FALSE, include = FALSE} example_quantile %>% score() %>% summarise_scores(by = c("model", "range", "target_type")) %>% @@ -400,8 +399,7 @@ example_integer %>% sample_to_quantile( quantiles = c(0.01, 0.025, seq(0.05, 0.95, 0.05), 0.975, 0.99) ) %>% - score() %>% - add_coverage(by = c("model", "target_type")) + score() ``` ## Available metrics