diff --git a/NAMESPACE b/NAMESPACE
index fab2e17d3..d3b6ffe22 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -8,7 +8,6 @@ S3method(score,default)
S3method(score,scoringutils_binary)
S3method(score,scoringutils_point)
S3method(score,scoringutils_quantile)
-S3method(score,scoringutils_quantile_new)
S3method(score,scoringutils_sample)
S3method(validate,default)
S3method(validate,scoringutils_binary)
@@ -32,6 +31,7 @@ export(crps_sample)
export(dispersion)
export(dss_sample)
export(get_duplicate_forecasts)
+export(interval_coverage_deviation_quantile)
export(interval_coverage_quantile)
export(interval_coverage_sample)
export(interval_score)
@@ -82,6 +82,7 @@ importFrom(checkmate,assert_data_frame)
importFrom(checkmate,assert_data_table)
importFrom(checkmate,assert_factor)
importFrom(checkmate,assert_list)
+importFrom(checkmate,assert_logical)
importFrom(checkmate,assert_number)
importFrom(checkmate,assert_numeric)
importFrom(checkmate,check_atomic_vector)
@@ -108,6 +109,7 @@ importFrom(data.table,nafill)
importFrom(data.table,rbindlist)
importFrom(data.table,setDT)
importFrom(data.table,setattr)
+importFrom(data.table,setcolorder)
importFrom(data.table,setnames)
importFrom(ggdist,geom_lineribbon)
importFrom(ggplot2,.data)
diff --git a/NEWS.md b/NEWS.md
index 322b2b25d..a00ab7f95 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -22,6 +22,7 @@ The update introduces a lot of breaking changes. If you want to keep using the o
- `quantile`: numeric, a vector with quantile-levels. Can alternatively be a matrix of the same shape as `predicted`.
- `check_forecasts()` was replaced by a new function `validate()`. `validate()` validates the input and in that sense fulfills the purpose of `check_forecasts()`. It has different methods: `validate.default()` assigns the input a class based on their forecast type. Other methods validate the input specifically for the various forecast types.
- The functionality for computing pairwise comparisons was now split from `summarise_scores()`. Instead of doing pairwise comparisons as part of summarising scores, a new function, `add_pairwise_comparison()`, was introduced that takes summarised scores as an input and adds pairwise comparisons to it.
+- `add_coverage()` was reworked completely. Its new purpose is now to add coverage information to the raw forecast data (essentially fulfilling some of the functionality that was previously covered by `score_quantile()`)
- The function `find_duplicates()` was renamed to `get_duplicate_forecasts()`
- Changes to `avail_forecasts()` and `plot_avail_forecasts()`:
- The function `avail_forecasts()` was renamed to `available_forecasts()` for consistency with `available_metrics()`. The old function, `avail_forecasts()` is still available as an alias, but will be removed in the future.
diff --git a/R/add_coverage.R b/R/add_coverage.R
new file mode 100644
index 000000000..684556026
--- /dev/null
+++ b/R/add_coverage.R
@@ -0,0 +1,83 @@
+#' @title Add Coverage Values to Quantile-Based Forecasts
+#'
+#' @description Adds interval coverage of central prediction intervals,
+#' quantile coverage for predictive quantiles, as well as the deviation between
+#' desired and actual coverage to a data.table. Forecasts should be in a
+#' quantile format (following the input requirements of `score()`).
+#'
+#' **Interval coverage**
+#'
+#' Coverage for a given interval range is defined as the proportion of
+#' observations that fall within the corresponding central prediction intervals.
+#' Central prediction intervals are symmetric around the median and are formed
+#' by two quantiles that denote the lower and upper bound. For example, the 50%
+#' central prediction interval is the interval between the 0.25 and 0.75
+#' quantiles of the predictive distribution.
+#'
+#' The function `add_coverage()` computes the coverage per central prediction
+#' interval, so the coverage will always be either `TRUE` (observed value falls
+#' within the interval) or `FALSE` (observed value falls outside the interval).
+#' You can summarise the coverage values to get the proportion of observations
+#' that fall within the central prediction intervals.
+#'
+#' **Quantile coverage**
+#'
+#' Quantile coverage for a given quantile is defined as the proportion of
+#' observed values that are smaller than the corresponding predictive quantile.
+#' For example, the 0.5 quantile coverage is the proportion of observed values
+#' that are smaller than the 0.5 quantile of the predictive distribution.
+#'
+#' **Coverage deviation**
+#'
+#' The coverage deviation is the difference between the desired coverage and the
+#' actual coverage. For example, if the desired coverage is 90% and the actual
+#' coverage is 80%, the coverage deviation is -0.1.
+#'
+#' @inheritParams score
+#' @return a data.table with the input and columns "range",
+#' "interval_coverage", "interval_coverage_deviation", "quantile_coverage",
+#' "quantile_coverage_deviation" added.
+#' @importFrom data.table setcolorder
+#' @examples
+#' library(magrittr) # pipe operator
+#' example_quantile %>%
+#' add_coverage()
+#'
+#' @keywords scoring
+#' @export
+add_coverage <- function(data) {
+ stored_attributes <- get_scoringutils_attributes(data)
+ data <- validate(data)
+ forecast_unit <- get_forecast_unit(data)
+ data_cols <- colnames(data) # store so we can reset column order later
+
+ # what happens if quantiles are not symmetric around the median?
+ # should things error? Also write tests for that.
+ interval_data <- quantile_to_interval(data, format = "wide")
+ interval_data[, interval_coverage := ifelse(
+ observed <= upper & observed >= lower,
+ TRUE,
+ FALSE)
+ ][, c("lower", "upper", "observed") := NULL]
+
+ data[, range := get_range_from_quantile(quantile)]
+
+ data <- merge(interval_data, data, by = unique(c(forecast_unit, "range")))
+ data[, interval_coverage_deviation := interval_coverage - range / 100]
+ data[, quantile_coverage := observed <= predicted]
+ data[, quantile_coverage_deviation := quantile_coverage - quantile]
+
+ # reset column order
+ new_metrics <- c("interval_coverage", "interval_coverage_deviation",
+ "quantile_coverage", "quantile_coverage_deviation")
+ setcolorder(data, unique(c(data_cols, "range", new_metrics)))
+
+ # add coverage "metrics" to list of stored metrics
+ # this makes it possible to use `summarise_scores()` later on
+ stored_attributes[["metric_names"]] <- c(
+ stored_attributes[["metric_names"]],
+ new_metrics
+ )
+ data <- assign_attributes(data, stored_attributes)
+ return(data[])
+}
diff --git a/R/check-input-helpers.R b/R/check-input-helpers.R
index cfaa24b2c..95869f02b 100644
--- a/R/check-input-helpers.R
+++ b/R/check-input-helpers.R
@@ -297,12 +297,22 @@ check_columns_present <- function(data, columns) {
}
assert_character(columns, min.len = 1)
colnames <- colnames(data)
+ missing <- list()
for (x in columns){
if (!(x %in% colnames)) {
- msg <- paste0("Column '", x, "' not found in data")
- return(msg)
+ missing[[x]] <- x
}
}
+ missing <- unlist(missing)
+ if (length(missing) > 1) {
+ msg <- paste0(
+ "Columns '", paste(missing, collapse = "', '"), "' not found in data"
+ )
+ return(msg)
+ } else if (length(missing) == 1) {
+ msg <- paste0("Column '", missing, "' not found in data")
+ return(msg)
+ }
return(TRUE)
}
diff --git a/R/convenience-functions.R b/R/convenience-functions.R
index 870e4ac3d..31f3ab5ab 100644
--- a/R/convenience-functions.R
+++ b/R/convenience-functions.R
@@ -235,21 +235,13 @@ log_shift <- function(x, offset = 0, base = exp(1)) {
#' example_quantile,
#' c("location", "target_end_date", "target_type", "horizon", "model")
#' )
-
set_forecast_unit <- function(data, forecast_unit) {
-
- datacols <- colnames(data)
- missing <- forecast_unit[!(forecast_unit %in% datacols)]
-
- if (length(missing) > 0) {
- warning(
- "Column(s) '",
- missing,
- "' are not columns of the data and will be ignored."
- )
- forecast_unit <- intersect(forecast_unit, datacols)
+ data <- ensure_data.table(data)
+ missing <- check_columns_present(data, forecast_unit)
+ if (!is.logical(missing)) {
+ warning(missing)
+ forecast_unit <- intersect(forecast_unit, colnames(data))
}
-
keep_cols <- c(get_protected_columns(data), forecast_unit)
out <- unique(data[, .SD, .SDcols = keep_cols])[]
return(out)
diff --git a/R/data.R b/R/data.R
index 48548ffed..c57e044d8 100644
--- a/R/data.R
+++ b/R/data.R
@@ -19,7 +19,7 @@
#' \item{model}{name of the model that generated the forecasts}
#' \item{horizon}{forecast horizon in weeks}
#' }
-#' @source \url{https://github.com/covid19-forecast-hub-europe/covid19-forecast-hub-europe/commit/a42867b1ea152c57e25b04f9faa26cfd4bfd8fa6/} # nolint
+#' @source \url{https://github.com/covid19-forecast-hub-europe/covid19-forecast-hub-europe/commit/a42867b1ea152c57e25b04f9faa26cfd4bfd8fa6/}
"example_quantile"
@@ -44,7 +44,7 @@
#' \item{model}{name of the model that generated the forecasts}
#' \item{horizon}{forecast horizon in weeks}
#' }
-#' @source \url{https://github.com/covid19-forecast-hub-europe/covid19-forecast-hub-europe/commit/a42867b1ea152c57e25b04f9faa26cfd4bfd8fa6/} # nolint
+#' @source \url{https://github.com/covid19-forecast-hub-europe/covid19-forecast-hub-europe/commit/a42867b1ea152c57e25b04f9faa26cfd4bfd8fa6/}
"example_point"
@@ -69,7 +69,7 @@
#' \item{predicted}{predicted value}
#' \item{sample_id}{id for the corresponding sample}
#' }
-#' @source \url{https://github.com/covid19-forecast-hub-europe/covid19-forecast-hub-europe/commit/a42867b1ea152c57e25b04f9faa26cfd4bfd8fa6/} # nolint
+#' @source \url{https://github.com/covid19-forecast-hub-europe/covid19-forecast-hub-europe/commit/a42867b1ea152c57e25b04f9faa26cfd4bfd8fa6/}
"example_continuous"
@@ -124,7 +124,7 @@
#' \item{horizon}{forecast horizon in weeks}
#' \item{predicted}{predicted value}
#' }
-#' @source \url{https://github.com/covid19-forecast-hub-europe/covid19-forecast-hub-europe/commit/a42867b1ea152c57e25b04f9faa26cfd4bfd8fa6/} # nolint
+#' @source \url{https://github.com/covid19-forecast-hub-europe/covid19-forecast-hub-europe/commit/a42867b1ea152c57e25b04f9faa26cfd4bfd8fa6/}
"example_binary"
@@ -147,7 +147,7 @@
#' \item{model}{name of the model that generated the forecasts}
#' \item{horizon}{forecast horizon in weeks}
#' }
-#' @source \url{https://github.com/covid19-forecast-hub-europe/covid19-forecast-hub-europe/commit/a42867b1ea152c57e25b04f9faa26cfd4bfd8fa6/} # nolint
+#' @source \url{https://github.com/covid19-forecast-hub-europe/covid19-forecast-hub-europe/commit/a42867b1ea152c57e25b04f9faa26cfd4bfd8fa6/}
"example_quantile_forecasts_only"
@@ -167,7 +167,7 @@
#' \item{observed}{observed values}
#' \item{location_name}{name of the country for which a prediction was made}
#' }
-#' @source \url{https://github.com/covid19-forecast-hub-europe/covid19-forecast-hub-europe/commit/a42867b1ea152c57e25b04f9faa26cfd4bfd8fa6/} # nolint
+#' @source \url{https://github.com/covid19-forecast-hub-europe/covid19-forecast-hub-europe/commit/a42867b1ea152c57e25b04f9faa26cfd4bfd8fa6/}
"example_truth_only"
#' Summary information for selected metrics
@@ -216,8 +216,13 @@
#'
#' A named list with functions:
#' - "wis" = [wis()]
+#' - "overprediction" = [overprediction()]
+#' - "underprediction" = [underprediction()]
+#' - "dispersion" = [dispersion()]
#' - "bias" = [bias_quantile()]
-#' - "coverage_50" = \(...) {run_safely(..., range = 50, fun = interval_coverage_quantile)} #nolint
-#' - "coverage_90" = \(...) {run_safely(..., range = 90, fun = interval_coverage_quantile)} #nolint
+#' - "coverage_50" = \(...) {run_safely(..., range = 50, fun = [interval_coverage_quantile][interval_coverage_quantile()])}
+#' - "coverage_90" = \(...) {run_safely(..., range = 90, fun = [interval_coverage_quantile][interval_coverage_quantile()])}
+#' - "coverage_deviation" = [interval_coverage_deviation_quantile()]
+#' - "ae_median" = [ae_median_quantile()]
#' @keywords info
"metrics_quantile"
diff --git a/R/get_-functions.R b/R/get_-functions.R
index 22aaa47a9..b55e1ef4f 100644
--- a/R/get_-functions.R
+++ b/R/get_-functions.R
@@ -193,6 +193,8 @@ get_protected_columns <- function(data = NULL) {
protected_columns <- c(
"predicted", "observed", "sample_id", "quantile", "upper", "lower",
"pit_value", "range", "boundary", "relative_skill", "scaled_rel_skill",
+ "interval_coverage", "interval_coverage_deviation",
+ "quantile_coverage", "quantile_coverage_deviation",
available_metrics(),
grep("coverage_", names(data), fixed = TRUE, value = TRUE)
)
diff --git a/R/metrics-quantile.R b/R/metrics-quantile.R
index f3598690e..985b00002 100644
--- a/R/metrics-quantile.R
+++ b/R/metrics-quantile.R
@@ -80,14 +80,21 @@
#' `overprediction`, `underprediction`, and `dispersion.`
#'
#' @inheritParams interval_score
-#' @param predicted vector of size n with the predicted values
+#' @param observed numeric vector of size n with the observed values
+#' @param predicted numeric nxN matrix of predictive
+#' quantiles, n (number of rows) being the number of forecasts (corresponding
+#' to the number of observed values) and N
+#' (number of columns) the number of quantiles per forecast.
+#' If `observed` is just a single number, then predicted can just be a
+#' vector of size N.
#' @param quantile vector with quantile levels of size N
#' @param count_median_twice if TRUE, count the median twice in the score
#' @param na.rm if TRUE, ignore NA values when computing the score
#' @importFrom stats weighted.mean
+#' @importFrom checkmate assert_logical
#' @return
-#' `wis()`: a numeric vector with WIS values (one per observation), or a list
-#' with separate entries if `separate_results` is `TRUE`.
+#' `wis()`: a numeric vector with WIS values of size n (one per observation),
+#' or a list with separate entries if `separate_results` is `TRUE`.
#' @export
wis <- function(observed,
predicted,
@@ -99,6 +106,11 @@ wis <- function(observed,
assert_input_quantile(observed, predicted, quantile)
reformatted <- quantile_to_interval(observed, predicted, quantile)
+ assert_logical(separate_results, len = 1)
+ assert_logical(weigh, len = 1)
+ assert_logical(count_median_twice, len = 1)
+ assert_logical(na.rm, len = 1)
+
if (separate_results) {
cols <- c("wis", "dispersion", "underprediction", "overprediction")
} else {
@@ -205,7 +217,7 @@ interval_coverage_quantile <- function(observed, predicted, quantile, range = 50
if (!all(necessary_quantiles %in% quantile)) {
warning(
"To compute the coverage for a range of ", range, "%, the quantiles ",
- necessary_quantiles, " are required. Returnting `NA`.")
+ necessary_quantiles, " are required. Returning `NA`.")
return(NA)
}
r <- range
@@ -218,6 +230,90 @@ interval_coverage_quantile <- function(observed, predicted, quantile, range = 50
}
+#' @title Interval Coverage Deviation (For Quantile-Based Forecasts)
+#' @description Check the agreement between desired and actual interval coverage
+#' of a forecast.
+#'
+#' The function is similar to [interval_coverage_quantile()],
+#' but looks at all provided prediction intervals instead of only one. It
+#' compares nominal coverage (i.e. the desired coverage) with the actual
+#' observed coverage.
+#'
+#' A central symmetric prediction interval is defined by a lower and an
+#' upper bound formed by a pair of predictive quantiles. For example, a 50%
+#' prediction interval is formed by the 0.25 and 0.75 quantiles of the
+#' predictive distribution. Ideally, a forecaster should aim to cover about
+#' 50% of all observed values with their 50% prediction intervals, 90% of all
+#' observed values with their 90% prediction intervals, and so on.
+#'
+#' For every prediction interval, the deviation is computed as the difference
+#' between the observed coverage and the nominal coverage.
+#' For a single observed value and a single prediction interval,
+#' coverage is always either 0 or 1. This is not the case for a single observed
+#' value and multiple prediction intervals, but it still doesn't make that much
+#' sense to compare nominal (desired) coverage and actual coverage for a single
+#' observation. In that sense coverage deviation only really starts to make
+#' sense as a metric when averaged across multiple observations.
+#'
+#' Positive values of coverage deviation are an indication for underconfidence,
+#' i.e. the forecaster could likely have issued a narrower forecast. Negative
+#' values are an indication for overconfidence, i.e. the forecasts were too
+#' narrow.
+#'
+#' \deqn{
+#' \textrm{coverage deviation} =
+#' \mathbf{1}(\textrm{observed value falls within interval}) -
+#' \textrm{nominal coverage}
+#' }{
+#' coverage deviation =
+#' 1(observed value falls within interval) - nominal coverage
+#' }
+#' The coverage deviation is then averaged across all prediction intervals.
+#' The median is ignored when computing coverage deviation.
+#' @inheritParams wis
+#' @return A numeric vector of length n with the coverage deviation for each
+#' forecast (comprising one or multiple prediction intervals).
+#' @export
+#' @examples
+#' observed <- c(1, -15, 22)
+#' predicted <- rbind(
+#' c(-1, 0, 1, 2, 3),
+#' c(-2, 1, 2, 2, 4),
+#' c(-2, 0, 3, 3, 4)
+#' )
+#' quantile <- c(0.1, 0.25, 0.5, 0.75, 0.9)
+#' interval_coverage_deviation_quantile(observed, predicted, quantile)
+interval_coverage_deviation_quantile <- function(observed, predicted, quantile) {
+ assert_input_quantile(observed, predicted, quantile)
+
+ # transform available quantiles into central interval ranges
+ available_ranges <- unique(get_range_from_quantile(quantile))
+
+ # check if all necessary quantiles are available
+ necessary_quantiles <- unique(c(
+ (100 - available_ranges) / 2,
+ 100 - (100 - available_ranges) / 2) / 100
+ )
+ if (!all(necessary_quantiles %in% quantile)) {
+ missing <- necessary_quantiles[!necessary_quantiles %in% quantile]
+ warning(
+ "To compute coverage deviation, all quantiles must form central ",
+ "symmetric prediction intervals. Missing quantiles: ",
+ toString(missing), ". Returning `NA`.")
+ return(NA)
+ }
+
+ reformatted <- quantile_to_interval(observed, predicted, quantile)[range != 0]
+ reformatted[, coverage := ifelse(
+ observed >= lower & observed <= upper, TRUE, FALSE
+ )]
+ reformatted[, coverage_deviation := coverage - range / 100]
+ out <- reformatted[, .(coverage_deviation = mean(coverage_deviation)),
+ by = c("forecast_id")]
+ return(out$coverage_deviation)
+}
+
+
#' @title Determines Bias of Quantile Forecasts
#'
#' @description
@@ -364,6 +460,45 @@ bias_quantile_single_vector <- function(observed, predicted, quantile, na.rm) {
}
+#' @title Absolute Error of the Median (Quantile-based Version)
+#' @description
+#' Compute the absolute error of the median calculated as
+#' \deqn{
+#' \textrm{abs}(\textrm{observed} - \textrm{median prediction})
+#' }{
+#' abs(observed - median_prediction)
+#' }
+#' The median prediction is the predicted value for which quantile == 0.5,
+#' the function therefore requires 0.5 to be among the quantile levels in
+#' `quantile`.
+#' @inheritParams wis
+#' @return numeric vector of length n with the absolute error of the median
+#' @seealso [ae_median_sample()], [abs_error()]
+#' @importFrom stats median
+#' @examples
+#' observed <- rnorm(30, mean = 1:30)
+#' predicted_values <- matrix(rnorm(30, mean = 1:30))
+#' ae_median_quantile(observed, predicted_values, quantile = 0.5)
+#' @export
+#' @keywords metric
+ae_median_quantile <- function(observed, predicted, quantile) {
+ assert_input_quantile(observed, predicted, quantile)
+ if (!any(quantile == 0.5)) {
+ warning(
+ "in order to compute the absolute error of the median, `0.5` must be ",
+ "among the quantiles given. Returning `NA`."
+ )
+ return(NA_real_)
+ }
+ if (is.null(dim(predicted))) {
+ predicted <- matrix(predicted, nrow = 1)
+ }
+ predicted <- predicted[, quantile == 0.5]
+ abs_error_median <- abs(observed - predicted)
+ return(abs_error_median)
+}
+
+
################################################################################
# Metrics with a one-to-one relationship between input and score
################################################################################
@@ -529,51 +664,3 @@ wis_one_to_one <- function(observed,
}
}
}
-
-
-#' @title Absolute Error of the Median (Quantile-based Version)
-#'
-#' @description
-#' Absolute error of the median calculated as
-#'
-#' \deqn{
-#' \textrm{abs}(\textrm{observed} - \textrm{prediction})
-#' }{
-#' abs(observed - median_prediction)
-#' }
-#'
-#' The function was created for internal use within [score()], but can also
-#' used as a standalone function.
-#'
-#' @param predicted numeric vector with predictions, corresponding to the
-#' quantiles in a second vector, `quantiles`.
-#' @param quantiles numeric vector that denotes the quantile for the values
-#' in `predicted`. Only those predictions where `quantiles == 0.5` will
-#' be kept. If `quantiles` is `NULL`, then all `predicted` and
-#' `observed` will be used (this is then the same as [abs_error()])
-#' @return vector with the scoring values
-#' @seealso [ae_median_sample()], [abs_error()]
-#' @importFrom stats median
-#' @inheritParams ae_median_sample
-#' @examples
-#' observed <- rnorm(30, mean = 1:30)
-#' predicted_values <- rnorm(30, mean = 1:30)
-#' ae_median_quantile(observed, predicted_values, quantiles = 0.5)
-#' @export
-#' @keywords metric
-
-ae_median_quantile <- function(observed, predicted, quantiles = NULL) {
- if (!is.null(quantiles)) {
- if (!any(quantiles == 0.5) && !anyNA(quantiles)) {
- return(NA_real_)
- warning(
- "in order to compute the absolute error of the median, `0.5` must be ",
- "among the quantiles given. Maybe you want to use `abs_error()`?"
- )
- }
- observed <- observed[quantiles == 0.5]
- predicted <- predicted[quantiles == 0.5]
- }
- abs_error_median <- abs(observed - predicted)
- return(abs_error_median)
-}
diff --git a/R/metrics-sample.R b/R/metrics-sample.R
index 73026bfcf..803fb4c4e 100644
--- a/R/metrics-sample.R
+++ b/R/metrics-sample.R
@@ -96,18 +96,17 @@ bias_sample <- function(observed, predicted) {
#' @importFrom stats median
#' @examples
#' observed <- rnorm(30, mean = 1:30)
-#' predicted_values <- rnorm(30, mean = 1:30)
+#' predicted_values <- matrix(rnorm(30, mean = 1:30))
#' ae_median_sample(observed, predicted_values)
#' @export
#' @keywords metric
ae_median_sample <- function(observed, predicted) {
+ assert_input_sample(observed, predicted)
median_predictions <- apply(
- as.matrix(predicted), MARGIN = 1, FUN = median # this is rowwise
+ as.matrix(predicted), MARGIN = 1, FUN = median # this is row-wise
)
-
ae_median <- abs(observed - median_predictions)
-
return(ae_median)
}
@@ -118,11 +117,11 @@ ae_median_sample <- function(observed, predicted) {
#' Squared error of the mean calculated as
#'
#' \deqn{
-#' \textrm{mean}(\textrm{observed} - \textrm{prediction})^2
+#' (\textrm{observed} - \textrm{mean prediction})^2
#' }{
-#' mean(observed - mean_prediction)^2
+#' (observed - mean prediction)^2
#' }
-#'
+#' The mean prediction is calculated as the mean of the predictive samples.
#' @param observed A vector with observed values of size n
#' @param predicted nxN matrix of predictive samples, n (number of rows) being
#' the number of data points and N (number of columns) the number of Monte
@@ -131,12 +130,13 @@ ae_median_sample <- function(observed, predicted) {
#' @seealso [squared_error()]
#' @examples
#' observed <- rnorm(30, mean = 1:30)
-#' predicted_values <- rnorm(30, mean = 1:30)
+#' predicted_values <- matrix(rnorm(30, mean = 1:30))
#' se_mean_sample(observed, predicted_values)
#' @export
#' @keywords metric
se_mean_sample <- function(observed, predicted) {
+ assert_input_sample(observed, predicted)
mean_predictions <- rowMeans(as.matrix(predicted))
se_mean <- (observed - mean_predictions)^2
diff --git a/R/pairwise-comparisons.R b/R/pairwise-comparisons.R
index f23316254..eab561adb 100644
--- a/R/pairwise-comparisons.R
+++ b/R/pairwise-comparisons.R
@@ -28,7 +28,7 @@
#'
#' @param scores A data.table of scores as produced by [score()].
#' @param metric A character vector of length one with the metric to do the
-#' comparison on. The default is "auto", meaning that either "interval_score",
+#' comparison on. The default is "auto", meaning that either "wis",
#' "crps", or "brier_score" will be selected where available.
#' @param by character vector with names of columns present in the input
#' data.frame. `by` determines how pairwise comparisons will be computed.
@@ -366,8 +366,8 @@ compare_two_models <- function(scores,
#' @keywords internal
infer_rel_skill_metric <- function(scores) {
- if ("interval_score" %in% colnames(scores)) {
- rel_skill_metric <- "interval_score"
+ if ("wis" %in% colnames(scores)) {
+ rel_skill_metric <- "wis"
} else if ("crps" %in% colnames(scores)) {
rel_skill_metric <- "crps"
} else if ("brier_score" %in% colnames(scores)) {
diff --git a/R/pit.R b/R/pit.R
index 2e00e4b90..ee5e09c7c 100644
--- a/R/pit.R
+++ b/R/pit.R
@@ -189,18 +189,10 @@ pit <- function(data,
data <- remove_na_observed_predicted(data)
forecast_type <- get_forecast_type(data)
- # if prediction type is quantile, simply extract coverage values from
- # score and returned a list with named vectors
if (forecast_type == "quantile") {
- coverage <-
- score(data, metrics = "quantile_coverage")
-
- coverage <- summarise_scores(coverage,
- by = unique(c(by, "quantile"))
- )
- # remove all existing attributes and class
- coverage <- remove_scoringutils_class(coverage)
-
+ data[, quantile_coverage := (observed <= predicted)]
+ coverage <- data[, .(quantile_coverage = mean(quantile_coverage)),
+ by = c(unique(c(by, "quantile")))]
coverage <- coverage[order(quantile),
.(
quantile = c(quantile, 1),
@@ -208,7 +200,6 @@ pit <- function(data,
),
by = c(get_forecast_unit(coverage))
]
-
return(coverage[])
}
diff --git a/R/plot.R b/R/plot.R
index f90be0599..84ae33eb1 100644
--- a/R/plot.R
+++ b/R/plot.R
@@ -221,7 +221,7 @@ plot_wis <- function(scores,
#' produced by [score()] or [summarise_scores()]. Note that "range" must be included
#' in the `by` argument when running [summarise_scores()]
#' @param y The variable from the scores you want to show on the y-Axis.
-#' This could be something like "interval_score" (the default) or "dispersion"
+#' This could be something like "wis" (the default) or "dispersion"
#' @param x The variable from the scores you want to show on the x-Axis.
#' Usually this will be "model"
#' @param colour Character vector of length one used to determine a variable
@@ -233,18 +233,18 @@ plot_wis <- function(scores,
#' @export
#' @examples
#' library(ggplot2)
-#' scores <- score(example_quantile)
-#' scores <- summarise_scores(scores, by = c("model", "target_type", "range"))
+#' # scores <- score(example_quantile)
+#' # scores <- summarise_scores(scores, by = c("model", "target_type", "range"))
#'
-#' plot_ranges(scores, x = "model") +
-#' facet_wrap(~target_type, scales = "free")
+#' # plot_ranges(scores, x = "model") +
+#' # facet_wrap(~target_type, scales = "free")
#'
#' # visualise dispersion instead of interval score
-#' plot_ranges(scores, y = "dispersion", x = "model") +
-#' facet_wrap(~target_type)
+#' # plot_ranges(scores, y = "dispersion", x = "model") +
+#' # facet_wrap(~target_type)
plot_ranges <- function(scores,
- y = "interval_score",
+ y = "wis",
x = "model",
colour = "range") {
plot <- ggplot(
@@ -296,7 +296,7 @@ plot_ranges <- function(scores,
#' @export
#' @examples
#' scores <- score(example_quantile)
-#' scores <- summarise_scores(scores, by = c("model", "target_type", "range"))
+#' scores <- summarise_scores(scores, by = c("model", "target_type"))
#'
#' plot_heatmap(scores, x = "target_type", metric = "bias")
@@ -582,10 +582,10 @@ make_na <- make_NA
#' @importFrom data.table dcast
#' @export
#' @examples
-#' data.table::setDTthreads(1) # only needed to avoid issues on CRAN
-#' scores <- score(example_quantile)
-#' scores <- summarise_scores(scores, by = c("model", "range"))
-#' plot_interval_coverage(scores)
+#' # data.table::setDTthreads(1) # only needed to avoid issues on CRAN
+#' # scores <- score(example_quantile)
+#' # scores <- summarise_scores(scores, by = c("model", "range"))
+#' # plot_interval_coverage(scores)
plot_interval_coverage <- function(scores,
colour = "model") {
@@ -613,7 +613,7 @@ plot_interval_coverage <- function(scores,
colour = "grey",
linetype = "dashed"
) +
- geom_line(aes(y = coverage * 100)) +
+ geom_line(aes(y = interval_coverage * 100)) +
theme_scoringutils() +
ylab("% Obs inside interval") +
xlab("Nominal interval coverage") +
@@ -638,9 +638,9 @@ plot_interval_coverage <- function(scores,
#' @importFrom data.table dcast
#' @export
#' @examples
-#' scores <- score(example_quantile)
-#' scores <- summarise_scores(scores, by = c("model", "quantile"))
-#' plot_quantile_coverage(scores)
+#' # scores <- score(example_quantile)
+#' # scores <- summarise_scores(scores, by = c("model", "quantile"))
+#' # plot_quantile_coverage(scores)
plot_quantile_coverage <- function(scores,
colour = "model") {
diff --git a/R/score.R b/R/score.R
index 3d1dc7f07..29230e6e0 100644
--- a/R/score.R
+++ b/R/score.R
@@ -152,18 +152,10 @@ score.scoringutils_binary <- function(data, metrics = metrics_binary, ...) {
data <- remove_na_observed_predicted(data)
metrics <- validate_metrics(metrics)
- # Extract the arguments passed in ...
- args <- list(...)
- lapply(seq_along(metrics), function(i, ...) {
- metric_name <- names(metrics[i])
- fun <- metrics[[i]]
- matching_args <- filter_function_args(fun, args)
-
- data[, (metric_name) := do.call(
- fun, c(list(observed, predicted), matching_args)
- )]
- return()
- }, ...)
+ data <- apply_metrics(
+ data, metrics,
+ data$observed, data$predicted, ...
+ )
setattr(data, "metric_names", names(metrics))
@@ -180,18 +172,10 @@ score.scoringutils_point <- function(data, metrics = metrics_point, ...) {
data <- remove_na_observed_predicted(data)
metrics <- validate_metrics(metrics)
- # Extract the arguments passed in ...
- args <- list(...)
- lapply(seq_along(metrics), function(i, ...) {
- metric_name <- names(metrics[i])
- fun <- metrics[[i]]
- matching_args <- filter_function_args(fun, args)
-
- data[, (metric_name) := do.call(
- fun, c(list(observed, predicted), matching_args)
- )]
- return()
- }, ...)
+ data <- apply_metrics(
+ data, metrics,
+ data$observed, data$predicted, ...
+ )
setattr(data, "metric_names", names(metrics))
@@ -206,101 +190,67 @@ score.scoringutils_sample <- function(data, metrics = metrics_sample, ...) {
forecast_unit <- attr(data, "forecast_unit")
metrics <- validate_metrics(metrics)
- # Extract the arguments passed in ...
- args <- list(...)
- lapply(seq_along(metrics), function(i, ...) {
- metric_name <- names(metrics[i])
- fun <- metrics[[i]]
- matching_args <- filter_function_args(fun, args)
+ # transpose the forecasts that belong to the same forecast unit
+ d_transposed <- data[, .(predicted = list(predicted),
+ observed = unique(observed),
+ scoringutils_N = length(list(sample_id))),
+ by = forecast_unit]
- data[, (metric_name) := do.call(
- fun, c(list(unique(observed), t(predicted)), matching_args)
- ), by = forecast_unit]
- return()
- },
- ...)
+ # split according to number of samples and do calculations for different
+ # sample lengths separately
+ d_split <- split(d_transposed, d_transposed$scoringutils_N)
- data <- data[
- , lapply(.SD, unique),
- .SDcols = colnames(data) %like% paste(names(metrics), collapse = "|"),
- by = forecast_unit
- ]
+ split_result <- lapply(d_split, function(data) {
+ # create a matrix
+ observed <- data$observed
+ predicted <- do.call(rbind, data$predicted)
+ data[, c("observed", "predicted", "scoringutils_N") := NULL]
+ data <- apply_metrics(
+ data, metrics,
+ observed, predicted, ...
+ )
+ return(data)
+ })
+ data <- rbindlist(split_result)
setattr(data, "metric_names", names(metrics))
return(data[])
}
+#' @importFrom data.table `:=` as.data.table rbindlist %like%
#' @rdname score
#' @export
-score.scoringutils_quantile <- function(data, metrics = NULL, ...) {
- data <- validate(data)
- data <- remove_na_observed_predicted(data)
- forecast_unit <- attr(data, "forecast_unit")
-
- if (is.null(metrics)) {
- metrics <- available_metrics()
- }
- metrics <- metrics[metrics %in% available_metrics()]
- scores <- score_quantile(
- data = data,
- forecast_unit = forecast_unit,
- metrics = metrics,
- ...
- )
-
- setattr(scores, "metric_names", metrics[metrics %in% colnames(scores)])
- # manual hack to make sure that the correct attributes are there.
- setattr(scores, "forecast_unit", forecast_unit)
- setattr(scores, "forecast_type", "quantile")
- scores <- new_scoringutils(scores, "scoringutils_quantile")
-
- return(scores[])
-}
-
-
-#' @rdname score
-#' @export
-score.scoringutils_quantile_new <- function(data, metrics = metrics_quantile, ...) {
+score.scoringutils_quantile <- function(data, metrics = metrics_quantile, ...) {
data <- validate(data)
data <- remove_na_observed_predicted(data)
forecast_unit <- attr(data, "forecast_unit")
metrics <- validate_metrics(metrics)
- # Extract the arguments passed in ...
- args <- list(...)
-
# transpose the forecasts that belong to the same forecast unit
# make sure the quantiles and predictions are ordered in the same way
d_transposed <- data[, .(predicted = list(predicted[order(quantile)]),
observed = unique(observed),
quantile = list(quantile[order(quantile)]),
- N = length(quantile)), by = forecast_unit]
+ scoringutils_quantile = toString(quantile[order(quantile)])),
+ by = forecast_unit]
# split according to quantile lengths and do calculations for different
# quantile lengths separately. The function `wis()` assumes that all
# forecasts have the same quantiles
- d_split <- split(d_transposed, d_transposed$N)
+ d_split <- split(d_transposed, d_transposed$scoringutils_quantile)
split_result <- lapply(d_split, function(data) {
# create a matrix out of the list of predicted values and quantiles
observed <- data$observed
predicted <- do.call(rbind, data$predicted)
quantile <- unlist(unique(data$quantile))
- data[, c("observed", "predicted", "quantile", "N") := NULL]
-
- # for each metric, compute score
- lapply(seq_along(metrics), function(i, ...) {
- metric_name <- names(metrics[i])
- fun <- metrics[[i]]
- matching_args <- filter_function_args(fun, args)
+ data[, c("observed", "predicted", "quantile", "scoringutils_quantile") := NULL]
- data[, eval(metric_name) := do.call(
- fun, c(list(observed), list(predicted), list(quantile), matching_args)
- )]
- return()
- },
- ...)
+ data <- apply_metrics(
+ data, metrics,
+ observed, predicted, quantile, ...
+ )
return(data)
})
@@ -309,3 +259,18 @@ score.scoringutils_quantile_new <- function(data, metrics = metrics_quantile, ..
return(data[])
}
+
+apply_metrics <- function(data, metrics, ...) {
+ expr <- expression(
+ data[, (metric_name) := do.call(run_safely, list(..., fun = fun))]
+ )
+ lapply(seq_along(metrics), function(i, data, ...) {
+ metric_name <- names(metrics[i])
+ fun <- metrics[[i]]
+ eval(expr)
+ }, data, ...)
+ return(data)
+}
+
+
+
diff --git a/R/score_quantile.R b/R/score_quantile.R
deleted file mode 100644
index f2da97f04..000000000
--- a/R/score_quantile.R
+++ /dev/null
@@ -1,167 +0,0 @@
-#' @title Evaluate forecasts in a Quantile-Based Format
-#'
-#' @inheritParams score
-#' @inheritParams interval_score
-#' @param count_median_twice logical that controls whether or not to count the
-#' median twice when summarising (default is \code{FALSE}). Counting the
-#' median twice would conceptually treat it as a 0\% prediction interval, where
-#' the median is the lower as well as the upper bound. The alternative is to
-#' treat the median as a single quantile forecast instead of an interval. The
-#' interval score would then be better understood as an average of quantile
-#' scores.
-#' @param forecast_unit A character vector with the column names that define
-#' the unit of a single forecast, i.e. a forecast was made for a combination
-#' of the values in `forecast_unit`
-#'
-#' @return A data.table with appropriate scores. For more information see
-#' [score()]
-#'
-#' @importFrom data.table ':=' as.data.table rbindlist %like%
-#'
-#' @author Nikos Bosse \email{nikosbosse@@gmail.com}
-#' @inherit score references
-#' @keywords internal
-
-score_quantile <- function(data,
- forecast_unit,
- metrics,
- weigh = TRUE,
- count_median_twice = FALSE,
- separate_results = TRUE) {
-
- data <- remove_na_observed_predicted(data)
-
- # make sure to have both quantile as well as range format --------------------
- range_data <- quantile_to_interval(
- data,
- keep_quantile_col = FALSE
- )
- # adds the range column to the quantile data set
- quantile_data <- range_long_to_quantile(
- range_data,
- keep_range_col = TRUE
- )
-
- # to deal with point forecasts in a quantile format. This in effect adds
- # a third column next to lower and upper after pivoting
- range_data[is.na(range), boundary := "point"]
-
- range_data <- data.table::dcast(range_data, ... ~ boundary,
- value.var = "predicted"
- )
-
- # if we only score point forecasts, it may be true that there are no columns
- # upper and lower in the data.frame. If so, these need to be added
- if (!all(c("upper", "lower") %in% colnames(range_data))) {
- range_data[, c("upper", "lower") := NA]
- }
-
- # set up results data.table that will then be modified throughout ------------
- res <- data.table::copy(range_data)
-
- # calculate scores on range format -------------------------------------------
- if ("interval_score" %in% metrics) {
- # compute separate results if desired
- if (separate_results) {
- outcols <- c(
- "interval_score", "dispersion",
- "underprediction", "overprediction"
- )
- } else {
- outcols <- "interval_score"
- }
- res <- res[, eval(outcols) := do.call(
- scoringutils::interval_score,
- list(observed, lower,
- upper, range,
- weigh,
- separate_results = separate_results
- )
- )]
- }
-
- # compute coverage for every single observation
- if ("coverage" %in% metrics) {
- res[, coverage := ifelse(observed <= upper & observed >= lower, 1, 0)] # nolint
- res[, coverage_deviation := coverage - range / 100]
- }
-
- # compute bias
- if ("bias" %in% metrics) {
- res[, bias := bias_range(
- range = range, lower = lower, upper = upper,
- observed = unique(observed)
- ),
- by = forecast_unit
- ]
- }
-
- # compute absolute and squared error for point forecasts
- # these are marked by an NA in range, and a numeric value for point
- compute_point <- any(
- c("se_point, se_mean, ae_point", "ae_median", "absolute_error") %in% metrics
- )
- if (compute_point && "point" %in% colnames(res)) {
- res[
- is.na(range) & is.numeric(point),
- `:=`(ae_point = abs_error(predicted = point, observed),
- se_point = squared_error(predicted = point, observed))
- ]
- }
-
- # calculate scores on quantile format ----------------------------------------
- # compute absolute error of the median
- if ("ae_median" %in% metrics) {
- quantile_data[, ae_median := ae_median_quantile(
- observed,
- predicted,
- quantile
- ),
- by = forecast_unit
- ]
- }
-
- # compute quantile coverage based on quantile version
- if ("quantile_coverage" %in% metrics) {
- quantile_data[, quantile_coverage := (observed <= predicted)]
- }
-
- # merge metrics computed on quantile data (i.e. ae_median, quantile_coverage) back
- # into metrics computed on range data. One important side effect of this is
- # that it controls whether we count the median twice for the interval score
- # (row is then duplicated) or only once. However, merge only needs to happen
- # if we computed either the interval score or the ae_median or quantile coverage
- if (any(c("ae_median", "interval_score", "quantile_coverage") %in% metrics)) {
- # delete unnecessary columns before merging back
- keep_cols <- unique(c(
- forecast_unit, "quantile", "ae_median", "quantile_coverage",
- "boundary", "range"
- ))
- delete_cols <- names(quantile_data)[!(names(quantile_data) %in% keep_cols)]
- quantile_data[, eval(delete_cols) := NULL]
-
- # duplicate median column before merging if median is to be counted twice
- # if this is false, then the res will have one entry for every quantile,
- # which translates to two rows for every interval, but only one for the median
- if (count_median_twice) {
- median <- quantile_data[quantile == 0.5, ][, boundary := "upper"]
- quantile_data <- data.table::rbindlist(list(quantile_data, median))
- }
-
- # merge back with other metrics
- merge_cols <- setdiff(keep_cols, c(
- "ae_median", "quantile_coverage", "quantile",
- "boundary"
- ))
- # specify all.x = TRUE as the point forecasts got deleted when
- # going from range to quantile above
- res <- merge(res, quantile_data, by = merge_cols, all.x = TRUE)
- }
-
- # delete internal columns before returning result
- res <- delete_columns(
- res, c("upper", "lower", "boundary", "point", "observed")
- )
-
- return(res[])
-}
diff --git a/R/summarise_scores.R b/R/summarise_scores.R
index e62bac789..032026402 100644
--- a/R/summarise_scores.R
+++ b/R/summarise_scores.R
@@ -306,78 +306,3 @@ check_summary_params <- function(scores,
}
return(relative_skill)
}
-
-
-
-#' @title Add coverage of central prediction intervals
-#'
-#' @description Adds a column with the coverage of central prediction intervals
-#' to unsummarised scores as produced by [score()]
-#'
-#' The coverage values that are added are computed according to the values
-#' specified in `by`. If, for example, `by = "model"`, then there will be one
-#' coverage value for every model and [add_coverage()] will compute the coverage
-#' for every model across the values present in all other columns which define
-#' the unit of a single forecast.
-#'
-#' @inheritParams summarise_scores
-#' @param by character vector with column names to add the coverage for.
-#' @param ranges numeric vector of the ranges of the central prediction intervals
-#' for which coverage values shall be added.
-#' @return a data.table with unsummarised scores with columns added for the
-#' coverage of the central prediction intervals. While the overall data.table
-#' is still unsummarised, note that for the coverage columns some level of
-#' summary is present according to the value specified in `by`.
-#' @examples
-#' library(magrittr) # pipe operator
-#' score(example_quantile) %>%
-#' add_coverage(by = c("model", "target_type")) %>%
-#' summarise_scores(by = c("model", "target_type")) %>%
-#' summarise_scores(fun = signif, digits = 2)
-#' @export
-#' @keywords scoring
-
-add_coverage <- function(scores,
- by = NULL,
- ranges = c(50, 90)) {
-
- stored_attributes <- get_scoringutils_attributes(scores)
- if (!is.null(attr(scores, "unsummarised_scores"))) {
- scores <- attr(scores, "unsummarised_scores")
- }
-
- if (is.null(by) && !is.null(stored_attributes[["scoringutils_by"]])) {
- by <- stored_attributes[["scoringutils_by"]]
- } else if (is.null(by)) {
- # Need to check this again.
- # (mentioned in https://github.com/epiforecasts/scoringutils/issues/346)
- by <- get_forecast_unit(scores)
- }
-
- summarised_scores <- summarise_scores(
- scores,
- by = c(by, "range")
- )[range %in% ranges]
-
-
- # create cast formula
- cast_formula <-
- paste(
- paste(by, collapse = "+"),
- "~",
- "paste0('coverage_', range)"
- )
-
- coverages <- dcast(
- summarised_scores,
- value.var = "coverage",
- formula = cast_formula
- )
-
- scores_with_coverage <- merge(scores, coverages, by = by)
- scores_with_coverage <- assign_attributes(
- scores_with_coverage, stored_attributes
- )
-
- return(scores_with_coverage[])
-}
diff --git a/R/utils.R b/R/utils.R
index 3de1a9eb8..53a2d800e 100644
--- a/R/utils.R
+++ b/R/utils.R
@@ -5,7 +5,8 @@
#' @keywords info
available_metrics <- function() {
- return(unique(scoringutils::metrics$Name))
+ return(unique(c(scoringutils::metrics$Name,
+ "wis", "coverage_50", "coverage_90")))
}
@@ -251,3 +252,21 @@ run_safely <- function(..., fun) {
return(result)
}
+
+#' Ensure That an Object is a Data Table
+#' @description This function ensures that an object is a data table.
+#' If the object is not a data table, it is converted to one. If the object
+#' is a data table, a copy of the object is returned.
+#' @param data An object to ensure is a data table
+#' @return A data table
+#' @keywords internal
+#' @importFrom data.table copy is.data.table as.data.table
+ensure_data.table <- function(data) {
+ if (!is.data.table(data)) {
+ data <- as.data.table(data)
+ } else {
+ data <- copy(data)
+ }
+ return(data)
+}
+
diff --git a/R/utils_data_handling.R b/R/utils_data_handling.R
index cb9470e87..1b1302dbf 100644
--- a/R/utils_data_handling.R
+++ b/R/utils_data_handling.R
@@ -101,12 +101,7 @@ merge_pred_and_obs <- function(forecasts, observations,
sample_to_quantile <- function(data,
quantiles = c(0.05, 0.25, 0.5, 0.75, 0.95),
type = 7) {
- if (!is.data.table(data)) {
- data <- data.table::as.data.table(data)
- } else {
- data <- copy(data)
- }
-
+ data <- ensure_data.table(data)
reserved_columns <- c("predicted", "sample_id")
by <- setdiff(colnames(data), reserved_columns)
@@ -208,19 +203,10 @@ quantile_to_interval.data.frame <- function(dt,
format = "long",
keep_quantile_col = FALSE,
...) {
- if (!is.data.table(dt)) {
- dt <- data.table::as.data.table(dt)
- } else {
- # use copy to avoid
- dt <- copy(dt)
- }
+ dt <- ensure_data.table(dt)
dt[, boundary := ifelse(quantile <= 0.5, "lower", "upper")]
- dt[, range := ifelse(
- boundary == "lower",
- round((1 - 2 * quantile) * 100, 10),
- round((2 * quantile - 1) * 100, 10)
- )]
+ dt[, range := get_range_from_quantile(quantile)]
# add median quantile
median <- dt[quantile == 0.5, ]
@@ -234,6 +220,10 @@ quantile_to_interval.data.frame <- function(dt,
if (format == "wide") {
delete_columns(dt, "quantile")
dt <- dcast(dt, ... ~ boundary, value.var = "predicted")
+ # if there are NA values in `predicted`, this introduces a column "NA"
+ if ("NA" %in% colnames(dt) && all(is.na(dt[["NA"]]))) {
+ dt[, "NA" := NULL]
+ }
}
return(dt[])
}
@@ -311,3 +301,25 @@ sample_to_range_long <- function(data,
return(data[])
}
+
+#' Get Range Belonging to a Quantile
+#' @description Every quantile can be thought of either as the lower or the
+#' upper bound of a symmetric central prediction interval. This helper function
+#' returns the range of the central prediction interval to which the quantile
+#' belongs.
+#'
+#' Due to numeric instability that sometimes occurred in the past, ranges are
+#' rounded to 10 decimal places. This is not a problem for the vast majority of
+#' use cases, but it is something to be aware of.
+#' @param quantile a numeric vector of quantile levels of size N
+#' @return a numeric vector of interval ranges of size N
+#' @keywords internal
+get_range_from_quantile <- function(quantile) {
+ boundary <- ifelse(quantile <= 0.5, "lower", "upper")
+ range <- ifelse(
+ boundary == "lower",
+ round((1 - 2 * quantile) * 100, digits = 10),
+ round((2 * quantile - 1) * 100, digits = 10)
+ )
+ return(range)
+}
diff --git a/R/z_globalVariables.R b/R/z_globalVariables.R
index b81fb3452..89cfc2eab 100644
--- a/R/z_globalVariables.R
+++ b/R/z_globalVariables.R
@@ -30,9 +30,12 @@ globalVariables(c(
"identifCol",
"Interval_Score",
"interval_range",
+ "interval_coverage",
+ "interval_coverage_deviation",
"overprediction",
"underprediction",
"quantile_coverage",
+ "quantile_coverage_deviation",
"LogS",
"log_score",
"lower",
@@ -59,6 +62,7 @@ globalVariables(c(
"rel_to_baseline",
"relative_skill",
"rn",
+ "sample_id",
"scoringutils_InternalDuplicateCheck",
"scoringutils_InternalNumCheck",
"se_mean",
@@ -72,6 +76,7 @@ globalVariables(c(
"var_of_interest",
"variable",
"weight",
+ "wis",
"wis_component_name",
"x",
"y",
diff --git a/README.Rmd b/README.Rmd
index 77584d145..0c4c41223 100644
--- a/README.Rmd
+++ b/README.Rmd
@@ -91,8 +91,8 @@ Forecasts can be easily and quickly scored using the `score()` function. `score(
example_quantile %>%
set_forecast_unit(c("location", "target_end_date", "target_type", "horizon", "model")) %>%
validate() %>%
+ add_coverage() %>%
score() %>%
- add_coverage(ranges = c(50, 90), by = c("model", "target_type")) %>%
summarise_scores(
by = c("model", "target_type")
) %>%
diff --git a/README.md b/README.md
index 2b0e1d1f9..0a5f93881 100644
--- a/README.md
+++ b/README.md
@@ -129,8 +129,8 @@ details. Finally we summarise these scores by model and target type.
example_quantile %>%
set_forecast_unit(c("location", "target_end_date", "target_type", "horizon", "model")) %>%
validate() %>%
+ add_coverage() %>%
score() %>%
- add_coverage(ranges = c(50, 90), by = c("model", "target_type")) %>%
summarise_scores(
by = c("model", "target_type")
) %>%
@@ -144,15 +144,15 @@ example_quantile %>%
kable()
```
-| model | target_type | interval_score | dispersion | underprediction | overprediction | coverage_deviation | bias | ae_median | coverage_50 | coverage_90 | relative_skill | scaled_rel_skill |
-|:----------------------|:------------|---------------:|-----------:|----------------:|---------------:|-------------------:|--------:|----------:|------------:|------------:|---------------:|-----------------:|
-| EuroCOVIDhub-baseline | Cases | 28000 | 4100 | 10000.0 | 14000.0 | -0.110 | 0.0980 | 38000 | 0.33 | 0.82 | 1.30 | 1.6 |
-| EuroCOVIDhub-baseline | Deaths | 160 | 91 | 2.1 | 66.0 | 0.120 | 0.3400 | 230 | 0.66 | 1.00 | 2.30 | 3.8 |
-| EuroCOVIDhub-ensemble | Cases | 18000 | 3700 | 4200.0 | 10000.0 | -0.098 | -0.0560 | 24000 | 0.39 | 0.80 | 0.82 | 1.0 |
-| EuroCOVIDhub-ensemble | Deaths | 41 | 30 | 4.1 | 7.1 | 0.200 | 0.0730 | 53 | 0.88 | 1.00 | 0.60 | 1.0 |
-| UMass-MechBayes | Deaths | 53 | 27 | 17.0 | 9.0 | -0.023 | -0.0220 | 78 | 0.46 | 0.88 | 0.75 | 1.3 |
-| epiforecasts-EpiNow2 | Cases | 21000 | 5700 | 3300.0 | 12000.0 | -0.067 | -0.0790 | 28000 | 0.47 | 0.79 | 0.95 | 1.2 |
-| epiforecasts-EpiNow2 | Deaths | 67 | 32 | 16.0 | 19.0 | -0.043 | -0.0051 | 100 | 0.42 | 0.91 | 0.98 | 1.6 |
+| model | target_type | wis | overprediction | underprediction | dispersion | bias | coverage_50 | coverage_90 | coverage_deviation | ae_median | relative_skill | scaled_rel_skill |
+|:----------------------|:------------|------:|---------------:|----------------:|-----------:|--------:|------------:|------------:|-------------------:|----------:|---------------:|-----------------:|
+| EuroCOVIDhub-baseline | Cases | 28000 | 14000.0 | 10000.0 | 4100 | 0.0980 | 0.33 | 0.82 | -0.120 | 38000 | 1.30 | 1.6 |
+| EuroCOVIDhub-baseline | Deaths | 160 | 66.0 | 2.1 | 91 | 0.3400 | 0.66 | 1.00 | 0.120 | 230 | 2.30 | 3.8 |
+| EuroCOVIDhub-ensemble | Cases | 18000 | 10000.0 | 4200.0 | 3700 | -0.0560 | 0.39 | 0.80 | -0.100 | 24000 | 0.82 | 1.0 |
+| EuroCOVIDhub-ensemble | Deaths | 41 | 7.1 | 4.1 | 30 | 0.0730 | 0.88 | 1.00 | 0.200 | 53 | 0.60 | 1.0 |
+| UMass-MechBayes | Deaths | 53 | 9.0 | 17.0 | 27 | -0.0220 | 0.46 | 0.88 | -0.025 | 78 | 0.75 | 1.3 |
+| epiforecasts-EpiNow2 | Cases | 21000 | 12000.0 | 3300.0 | 5700 | -0.0790 | 0.47 | 0.79 | -0.070 | 28000 | 0.95 | 1.2 |
+| epiforecasts-EpiNow2 | Deaths | 67 | 19.0 | 16.0 | 32 | -0.0051 | 0.42 | 0.91 | -0.045 | 100 | 0.98 | 1.6 |
`scoringutils` contains additional functionality to transform forecasts,
to summarise scores at different levels, to visualise them, and to
@@ -174,20 +174,27 @@ example_quantile %>%
score %>%
summarise_scores(by = c("model", "target_type", "scale")) %>%
head()
-#> model target_type scale interval_score dispersion
-#> 1: EuroCOVIDhub-baseline Cases log 1.169972e+00 0.4373146
-#> 2: EuroCOVIDhub-baseline Cases natural 2.209046e+04 4102.5009443
-#> 3: EuroCOVIDhub-ensemble Cases log 5.500974e-01 0.1011850
-#> 4: EuroCOVIDhub-ensemble Cases natural 1.155071e+04 3663.5245788
-#> 5: epiforecasts-EpiNow2 Cases log 6.005778e-01 0.1066329
-#> 6: epiforecasts-EpiNow2 Cases natural 1.443844e+04 5664.3779484
-#> underprediction overprediction coverage_deviation bias ae_median
-#> 1: 3.521964e-01 0.3804607 -0.10940217 0.09726562 1.185905e+00
-#> 2: 1.028497e+04 7702.9836957 -0.10940217 0.09726562 3.208048e+04
-#> 3: 1.356563e-01 0.3132561 -0.09785326 -0.05640625 7.410484e-01
-#> 4: 4.237177e+03 3650.0047554 -0.09785326 -0.05640625 1.770795e+04
-#> 5: 1.858699e-01 0.3080750 -0.06660326 -0.07890625 7.656591e-01
-#> 6: 3.260356e+03 5513.7058424 -0.06660326 -0.07890625 2.153070e+04
+#> model target_type scale wis overprediction
+#> 1: EuroCOVIDhub-ensemble Cases natural 11550.70664 3650.004755
+#> 2: EuroCOVIDhub-baseline Cases natural 22090.45747 7702.983696
+#> 3: epiforecasts-EpiNow2 Cases natural 14438.43943 5513.705842
+#> 4: EuroCOVIDhub-ensemble Deaths natural 41.42249 7.138247
+#> 5: EuroCOVIDhub-baseline Deaths natural 159.40387 65.899117
+#> 6: UMass-MechBayes Deaths natural 52.65195 8.978601
+#> underprediction dispersion bias coverage_50 coverage_90
+#> 1: 4237.177310 3663.52458 -0.05640625 0.3906250 0.8046875
+#> 2: 10284.972826 4102.50094 0.09726562 0.3281250 0.8203125
+#> 3: 3260.355639 5664.37795 -0.07890625 0.4687500 0.7890625
+#> 4: 4.103261 30.18099 0.07265625 0.8750000 1.0000000
+#> 5: 2.098505 91.40625 0.33906250 0.6640625 1.0000000
+#> 6: 16.800951 26.87239 -0.02234375 0.4609375 0.8750000
+#> coverage_deviation ae_median
+#> 1: -0.10230114 17707.95312
+#> 2: -0.11437500 32080.48438
+#> 3: -0.06963068 21530.69531
+#> 4: 0.20380682 53.13281
+#> 5: 0.12142045 233.25781
+#> 6: -0.02488636 78.47656
```
## Citation
diff --git a/data/metrics_quantile.rda b/data/metrics_quantile.rda
index 2f2607933..b5598113d 100644
Binary files a/data/metrics_quantile.rda and b/data/metrics_quantile.rda differ
diff --git a/inst/create-list-available-forecasts.R b/inst/create-list-available-forecasts.R
index 21a2c8682..07105a7f4 100644
--- a/inst/create-list-available-forecasts.R
+++ b/inst/create-list-available-forecasts.R
@@ -28,7 +28,9 @@ metrics_quantile <- list(
"underprediction" = underprediction,
"dispersion" = dispersion,
"bias" = bias_quantile,
- "coverage_50" = \(...) {run_safely(..., range = 50, fun = interval_coverage_quantile)},
- "coverage_90" = \(...) {run_safely(..., range = 90, fun = interval_coverage_quantile)}
+ "coverage_50" = interval_coverage_quantile,
+ "coverage_90" = \(...) {run_safely(..., range = 90, fun = interval_coverage_quantile)},
+ "coverage_deviation" = interval_coverage_deviation_quantile,
+ "ae_median" = ae_median_quantile
)
usethis::use_data(metrics_quantile, overwrite = TRUE)
diff --git a/man/abs_error.Rd b/man/abs_error.Rd
index 099937bfd..197703193 100644
--- a/man/abs_error.Rd
+++ b/man/abs_error.Rd
@@ -7,10 +7,14 @@
abs_error(observed, predicted)
}
\arguments{
-\item{observed}{A vector with observed values of size n}
+\item{observed}{numeric vector of size n with the observed values}
-\item{predicted}{numeric vector with predictions, corresponding to the
-quantiles in a second vector, \code{quantiles}.}
+\item{predicted}{numeric nxN matrix of predictive
+quantiles, n (number of rows) being the number of forecasts (corresponding
+to the number of observed values) and N
+(number of columns) the number of quantiles per forecast.
+If \code{observed} is just a single number, then predicted can just be a
+vector of size N.}
}
\value{
vector with the absolute error
diff --git a/man/add_coverage.Rd b/man/add_coverage.Rd
index 33990a3bc..d6c82d467 100644
--- a/man/add_coverage.Rd
+++ b/man/add_coverage.Rd
@@ -1,40 +1,56 @@
% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/summarise_scores.R
+% Please edit documentation in R/add_coverage.R
\name{add_coverage}
\alias{add_coverage}
-\title{Add coverage of central prediction intervals}
+\title{Add Coverage Values to Quantile-Based Forecasts}
\usage{
-add_coverage(scores, by = NULL, ranges = c(50, 90))
+add_coverage(data)
}
\arguments{
-\item{scores}{A data.table of scores as produced by \code{\link[=score]{score()}}.}
-
-\item{by}{character vector with column names to add the coverage for.}
-
-\item{ranges}{numeric vector of the ranges of the central prediction intervals
-for which coverage values shall be added.}
+\item{data}{A data.frame or data.table with predicted and observed values.}
}
\value{
-a data.table with unsummarised scores with columns added for the
-coverage of the central prediction intervals. While the overall data.table
-is still unsummarised, note that for the coverage columns some level of
-summary is present according to the value specified in \code{by}.
+a data.table with the input and columns "interval_coverage",
+"interval_coverage_deviation", "quantile_coverage",
+"quantile_coverage_deviation" added.
}
\description{
-Adds a column with the coverage of central prediction intervals
-to unsummarised scores as produced by \code{\link[=score]{score()}}
+Adds interval coverage of central prediction intervals,
+quantile coverage for predictive quantiles, as well as the deviation between
+desired and actual coverage to a data.table. Forecasts should be in a
+quantile format (following the input requirements of \code{score()}).
+
+\strong{Interval coverage}
+
+Coverage for a given interval range is defined as the proportion of
+observations that fall within the corresponding central prediction intervals.
+Central prediction intervals are symmetric around the median and formed
+by two quantiles that denote the lower and upper bound. For example, the 50\%
+central prediction interval is the interval between the 0.25 and 0.75
+quantiles of the predictive distribution.
+
+The function \code{add_coverage()} computes the coverage per central prediction
+interval, so the coverage will always be either \code{TRUE} (observed value falls
+within the interval) or \code{FALSE} (observed value falls outside the interval).
+You can summarise the coverage values to get the proportion of observations
+that fall within the central prediction intervals.
+
+\strong{Quantile coverage}
+
+Quantile coverage for a given quantile is defined as the proportion of
+observed values that are smaller than the corresponding predictive quantile.
+For example, the 0.5 quantile coverage is the proportion of observed values
+that are smaller than the 0.5 quantile of the predictive distribution.
+
+\strong{Coverage deviation}
-The coverage values that are added are computed according to the values
-specified in \code{by}. If, for example, \code{by = "model"}, then there will be one
-coverage value for every model and \code{\link[=add_coverage]{add_coverage()}} will compute the coverage
-for every model across the values present in all other columns which define
-the unit of a single forecast.
+The coverage deviation is the difference between the desired coverage and the
+actual coverage. For example, if the desired coverage is 90\% and the actual
+coverage is 80\%, the coverage deviation is -0.1.
}
\examples{
library(magrittr) # pipe operator
-score(example_quantile) \%>\%
- add_coverage(by = c("model", "target_type")) \%>\%
- summarise_scores(by = c("model", "target_type")) \%>\%
- summarise_scores(fun = signif, digits = 2)
+example_quantile \%>\%
+ add_coverage()
}
\keyword{scoring}
diff --git a/man/ae_median_quantile.Rd b/man/ae_median_quantile.Rd
index 841850235..d96965c7c 100644
--- a/man/ae_median_quantile.Rd
+++ b/man/ae_median_quantile.Rd
@@ -4,38 +4,38 @@
\alias{ae_median_quantile}
\title{Absolute Error of the Median (Quantile-based Version)}
\usage{
-ae_median_quantile(observed, predicted, quantiles = NULL)
+ae_median_quantile(observed, predicted, quantile)
}
\arguments{
-\item{observed}{A vector with observed values of size n}
+\item{observed}{numeric vector of size n with the observed values}
-\item{predicted}{numeric vector with predictions, corresponding to the
-quantiles in a second vector, \code{quantiles}.}
+\item{predicted}{numeric nxN matrix of predictive
+quantiles, n (number of rows) being the number of forecasts (corresponding
+to the number of observed values) and N
+(number of columns) the number of quantiles per forecast.
+If \code{observed} is just a single number, then predicted can just be a
+vector of size N.}
-\item{quantiles}{numeric vector that denotes the quantile for the values
-in \code{predicted}. Only those predictions where \code{quantiles == 0.5} will
-be kept. If \code{quantiles} is \code{NULL}, then all \code{predicted} and
-\code{observed} will be used (this is then the same as \code{\link[=abs_error]{abs_error()}})}
+\item{quantile}{vector with quantile levels of size N}
}
\value{
-vector with the scoring values
+numeric vector of length N with the absolute error of the median
}
\description{
-Absolute error of the median calculated as
-
+Compute the absolute error of the median calculated as
\deqn{
- \textrm{abs}(\textrm{observed} - \textrm{prediction})
+ \textrm{abs}(\textrm{observed} - \textrm{median prediction})
}{
abs(observed - median_prediction)
}
-
-The function was created for internal use within \code{\link[=score]{score()}}, but can also
-used as a standalone function.
+The median prediction is the predicted value for which quantile == 0.5,
+the function therefore requires 0.5 to be among the quantile levels in
+\code{quantile}.
}
\examples{
observed <- rnorm(30, mean = 1:30)
-predicted_values <- rnorm(30, mean = 1:30)
-ae_median_quantile(observed, predicted_values, quantiles = 0.5)
+predicted_values <- matrix(rnorm(30, mean = 1:30))
+ae_median_quantile(observed, predicted_values, quantile = 0.5)
}
\seealso{
\code{\link[=ae_median_sample]{ae_median_sample()}}, \code{\link[=abs_error]{abs_error()}}
diff --git a/man/ae_median_sample.Rd b/man/ae_median_sample.Rd
index d446b3300..1e420fcc4 100644
--- a/man/ae_median_sample.Rd
+++ b/man/ae_median_sample.Rd
@@ -27,7 +27,7 @@ Absolute error of the median calculated as
}
\examples{
observed <- rnorm(30, mean = 1:30)
-predicted_values <- rnorm(30, mean = 1:30)
+predicted_values <- matrix(rnorm(30, mean = 1:30))
ae_median_sample(observed, predicted_values)
}
\seealso{
diff --git a/man/compare_two_models.Rd b/man/compare_two_models.Rd
index 39780292e..1d4686a1c 100644
--- a/man/compare_two_models.Rd
+++ b/man/compare_two_models.Rd
@@ -22,7 +22,7 @@ compare_two_models(
\item{name_model2}{character, name of the model to compare against}
\item{metric}{A character vector of length one with the metric to do the
-comparison on. The default is "auto", meaning that either "interval_score",
+comparison on. The default is "auto", meaning that either "wis",
"crps", or "brier_score" will be selected where available.}
\item{one_sided}{Boolean, default is \code{FALSE}, whether two conduct a one-sided
diff --git a/man/ensure_data.table.Rd b/man/ensure_data.table.Rd
new file mode 100644
index 000000000..6d2457ee5
--- /dev/null
+++ b/man/ensure_data.table.Rd
@@ -0,0 +1,20 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/utils.R
+\name{ensure_data.table}
+\alias{ensure_data.table}
+\title{Ensure That an Object is a Data Table}
+\usage{
+ensure_data.table(data)
+}
+\arguments{
+\item{data}{An object to ensure is a data table}
+}
+\value{
+A data table
+}
+\description{
+This function ensures that an object is a data table.
+If the object is not a data table, it is converted to one. If the object
+is a data table, a copy of the object is returned.
+}
+\keyword{internal}
diff --git a/man/example_binary.Rd b/man/example_binary.Rd
index e7042d6b2..47797b8cd 100644
--- a/man/example_binary.Rd
+++ b/man/example_binary.Rd
@@ -19,7 +19,7 @@ A data frame with 346 rows and 10 columns:
}
}
\source{
-\url{https://github.com/covid19-forecast-hub-europe/covid19-forecast-hub-europe/commit/a42867b1ea152c57e25b04f9faa26cfd4bfd8fa6/} # nolint
+\url{https://github.com/covid19-forecast-hub-europe/covid19-forecast-hub-europe/commit/a42867b1ea152c57e25b04f9faa26cfd4bfd8fa6/}
}
\usage{
example_binary
diff --git a/man/example_continuous.Rd b/man/example_continuous.Rd
index d1fba390e..354ebc5d6 100644
--- a/man/example_continuous.Rd
+++ b/man/example_continuous.Rd
@@ -20,7 +20,7 @@ A data frame with 13,429 rows and 10 columns:
}
}
\source{
-\url{https://github.com/covid19-forecast-hub-europe/covid19-forecast-hub-europe/commit/a42867b1ea152c57e25b04f9faa26cfd4bfd8fa6/} # nolint
+\url{https://github.com/covid19-forecast-hub-europe/covid19-forecast-hub-europe/commit/a42867b1ea152c57e25b04f9faa26cfd4bfd8fa6/}
}
\usage{
example_continuous
diff --git a/man/example_point.Rd b/man/example_point.Rd
index 62af0e44f..1eb734b76 100644
--- a/man/example_point.Rd
+++ b/man/example_point.Rd
@@ -19,7 +19,7 @@ A data frame with
}
}
\source{
-\url{https://github.com/covid19-forecast-hub-europe/covid19-forecast-hub-europe/commit/a42867b1ea152c57e25b04f9faa26cfd4bfd8fa6/} # nolint
+\url{https://github.com/covid19-forecast-hub-europe/covid19-forecast-hub-europe/commit/a42867b1ea152c57e25b04f9faa26cfd4bfd8fa6/}
}
\usage{
example_point
diff --git a/man/example_quantile.Rd b/man/example_quantile.Rd
index 00250e6d0..2582907e9 100644
--- a/man/example_quantile.Rd
+++ b/man/example_quantile.Rd
@@ -20,7 +20,7 @@ A data frame with
}
}
\source{
-\url{https://github.com/covid19-forecast-hub-europe/covid19-forecast-hub-europe/commit/a42867b1ea152c57e25b04f9faa26cfd4bfd8fa6/} # nolint
+\url{https://github.com/covid19-forecast-hub-europe/covid19-forecast-hub-europe/commit/a42867b1ea152c57e25b04f9faa26cfd4bfd8fa6/}
}
\usage{
example_quantile
diff --git a/man/example_quantile_forecasts_only.Rd b/man/example_quantile_forecasts_only.Rd
index 3fcaf2722..d789ed1e0 100644
--- a/man/example_quantile_forecasts_only.Rd
+++ b/man/example_quantile_forecasts_only.Rd
@@ -18,7 +18,7 @@ A data frame with 7,581 rows and 9 columns:
}
}
\source{
-\url{https://github.com/covid19-forecast-hub-europe/covid19-forecast-hub-europe/commit/a42867b1ea152c57e25b04f9faa26cfd4bfd8fa6/} # nolint
+\url{https://github.com/covid19-forecast-hub-europe/covid19-forecast-hub-europe/commit/a42867b1ea152c57e25b04f9faa26cfd4bfd8fa6/}
}
\usage{
example_quantile_forecasts_only
diff --git a/man/example_truth_only.Rd b/man/example_truth_only.Rd
index 46453ba97..f8ae05afa 100644
--- a/man/example_truth_only.Rd
+++ b/man/example_truth_only.Rd
@@ -15,7 +15,7 @@ A data frame with 140 rows and 5 columns:
}
}
\source{
-\url{https://github.com/covid19-forecast-hub-europe/covid19-forecast-hub-europe/commit/a42867b1ea152c57e25b04f9faa26cfd4bfd8fa6/} # nolint
+\url{https://github.com/covid19-forecast-hub-europe/covid19-forecast-hub-europe/commit/a42867b1ea152c57e25b04f9faa26cfd4bfd8fa6/}
}
\usage{
example_truth_only
diff --git a/man/get_range_from_quantile.Rd b/man/get_range_from_quantile.Rd
new file mode 100644
index 000000000..eca15af36
--- /dev/null
+++ b/man/get_range_from_quantile.Rd
@@ -0,0 +1,25 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/utils_data_handling.R
+\name{get_range_from_quantile}
+\alias{get_range_from_quantile}
+\title{Get Range Belonging to a Quantile}
+\usage{
+get_range_from_quantile(quantile)
+}
+\arguments{
+\item{quantile}{a numeric vector of quantile levels of size N}
+}
+\value{
+a numeric vector of interval ranges of size N
+}
+\description{
+Every quantile can be thought of either as the lower or the
+upper bound of a symmetric central prediction interval. This helper function
+returns the range of the central prediction interval to which the quantile
+belongs.
+
+Due to numeric instability that sometimes occurred in the past, ranges are
+rounded to 10 decimal places. This is not a problem for the vast majority of
+use cases, but it is something to be aware of.
+}
+\keyword{internal}
diff --git a/man/interval_coverage.Rd b/man/interval_coverage.Rd
index 8fbfc67d1..74256cc77 100644
--- a/man/interval_coverage.Rd
+++ b/man/interval_coverage.Rd
@@ -11,9 +11,14 @@ interval_coverage_quantile(observed, predicted, quantile, range = 50)
interval_coverage_sample(observed, predicted, range = 50)
}
\arguments{
-\item{observed}{A vector with observed values of size n}
+\item{observed}{numeric vector of size n with the observed values}
-\item{predicted}{vector of size n with the predicted values}
+\item{predicted}{numeric nxN matrix of predictive
+quantiles, n (number of rows) being the number of forecasts (corresponding
+to the number of observed values) and N
+(number of columns) the number of quantiles per forecast.
+If \code{observed} is just a single number, then predicted can just be a
+vector of size N.}
\item{quantile}{vector with quantile levels of size N}
diff --git a/man/interval_coverage_deviation_quantile.Rd b/man/interval_coverage_deviation_quantile.Rd
new file mode 100644
index 000000000..9a6029ec7
--- /dev/null
+++ b/man/interval_coverage_deviation_quantile.Rd
@@ -0,0 +1,75 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/metrics-quantile.R
+\name{interval_coverage_deviation_quantile}
+\alias{interval_coverage_deviation_quantile}
+\title{Interval Coverage Deviation (For Quantile-Based Forecasts)}
+\usage{
+interval_coverage_deviation_quantile(observed, predicted, quantile)
+}
+\arguments{
+\item{observed}{numeric vector of size n with the observed values}
+
+\item{predicted}{numeric nxN matrix of predictive
+quantiles, n (number of rows) being the number of forecasts (corresponding
+to the number of observed values) and N
+(number of columns) the number of quantiles per forecast.
+If \code{observed} is just a single number, then predicted can just be a
+vector of size N.}
+
+\item{quantile}{vector with quantile levels of size N}
+}
+\value{
+A numeric vector of length n with the coverage deviation for each
+forecast (comprising one or multiple prediction intervals).
+}
+\description{
+Check the agreement between desired and actual interval coverage
+of a forecast.
+
+The function is similar to \code{\link[=interval_coverage_quantile]{interval_coverage_quantile()}},
+but looks at all provided prediction intervals instead of only one. It
+compares nominal coverage (i.e. the desired coverage) with the actual
+observed coverage.
+
+A central symmetric prediction interval is defined by a lower and an
+upper bound formed by a pair of predictive quantiles. For example, a 50\%
+prediction interval is formed by the 0.25 and 0.75 quantiles of the
+predictive distribution. Ideally, a forecaster should aim to cover about
+50\% of all observed values with their 50\% prediction intervals, 90\% of all
+observed values with their 90\% prediction intervals, and so on.
+
+For every prediction interval, the deviation is computed as the difference
+between the observed coverage and the nominal coverage.
+For a single observed value and a single prediction interval,
+coverage is always either 0 or 1. This is not the case for a single observed
+value and multiple prediction intervals, but it still doesn't make that much
+sense to compare nominal (desired) coverage and actual coverage for a single
+observation. In that sense coverage deviation only really starts to make
+sense as a metric when averaged across multiple observations.
+
+Positive values of coverage deviation are an indication for underconfidence,
+i.e. the forecaster could likely have issued a narrower forecast. Negative
+values are an indication for overconfidence, i.e. the forecasts were too
+narrow.
+
+\deqn{
+\textrm{coverage deviation} =
+\mathbf{1}(\textrm{observed value falls within interval}) -
+\textrm{nominal coverage}
+}{
+coverage deviation =
+1(observed value falls within interval) - nominal coverage
+}
+The coverage deviation is then averaged across all prediction intervals.
+The median is ignored when computing coverage deviation.
+}
+\examples{
+observed <- c(1, -15, 22)
+predicted <- rbind(
+ c(-1, 0, 1, 2, 3),
+ c(-2, 1, 2, 2, 4),
+ c(-2, 0, 3, 3, 4)
+)
+quantile <- c(0.1, 0.25, 0.5, 0.75, 0.9)
+interval_coverage_deviation_quantile(observed, predicted, quantile)
+}
diff --git a/man/metrics_quantile.Rd b/man/metrics_quantile.Rd
index 9fda39f03..ea444ee7e 100644
--- a/man/metrics_quantile.Rd
+++ b/man/metrics_quantile.Rd
@@ -5,7 +5,7 @@
\alias{metrics_quantile}
\title{Default metrics for quantile-based forecasts.}
\format{
-An object of class \code{list} of length 7.
+An object of class \code{list} of length 9.
}
\usage{
metrics_quantile
@@ -14,9 +14,14 @@ metrics_quantile
A named list with functions:
\itemize{
\item "wis" = \code{\link[=wis]{wis()}}
+\item "overprediction" = \code{\link[=overprediction]{overprediction()}}
+\item "underprediction" = \code{\link[=underprediction]{underprediction()}}
+\item "dispersion" = \code{\link[=dispersion]{dispersion()}}
\item "bias" = \code{\link[=bias_quantile]{bias_quantile()}}
-\item "coverage_50" = \(...) {run_safely(..., range = 50, fun = interval_coverage_quantile)} #nolint
-\item "coverage_90" = \(...) {run_safely(..., range = 90, fun = interval_coverage_quantile)} #nolint
+\item "coverage_50" = \(...) {run_safely(..., range = 50, fun = \link[=interval_coverage_quantile]{interval_coverage_quantile})}
+\item "coverage_90" = \(...) {run_safely(..., range = 90, fun = \link[=interval_coverage_quantile]{interval_coverage_quantile})}
+\item "coverage_deviation" = \code{\link[=interval_coverage_deviation_quantile]{interval_coverage_deviation_quantile()}}
+\item "ae_median" = \code{\link[=ae_median_quantile]{ae_median_quantile()}}
}
}
\keyword{info}
diff --git a/man/pairwise_comparison.Rd b/man/pairwise_comparison.Rd
index 9288e77fb..d30be1197 100644
--- a/man/pairwise_comparison.Rd
+++ b/man/pairwise_comparison.Rd
@@ -25,7 +25,7 @@ splitting) and the pairwise comparisons will be computed separately for the
split data.frames.}
\item{metric}{A character vector of length one with the metric to do the
-comparison on. The default is "auto", meaning that either "interval_score",
+comparison on. The default is "auto", meaning that either "wis",
"crps", or "brier_score" will be selected where available.}
\item{baseline}{character vector of length one that denotes the baseline
diff --git a/man/pairwise_comparison_one_group.Rd b/man/pairwise_comparison_one_group.Rd
index a7d902f15..df59b1472 100644
--- a/man/pairwise_comparison_one_group.Rd
+++ b/man/pairwise_comparison_one_group.Rd
@@ -10,7 +10,7 @@ pairwise_comparison_one_group(scores, metric, baseline, by, ...)
\item{scores}{A data.table of scores as produced by \code{\link[=score]{score()}}.}
\item{metric}{A character vector of length one with the metric to do the
-comparison on. The default is "auto", meaning that either "interval_score",
+comparison on. The default is "auto", meaning that either "wis",
"crps", or "brier_score" will be selected where available.}
\item{baseline}{character vector of length one that denotes the baseline
diff --git a/man/plot_heatmap.Rd b/man/plot_heatmap.Rd
index 8b1aac549..837ef4243 100644
--- a/man/plot_heatmap.Rd
+++ b/man/plot_heatmap.Rd
@@ -29,7 +29,7 @@ different locations.
}
\examples{
scores <- score(example_quantile)
-scores <- summarise_scores(scores, by = c("model", "target_type", "range"))
+scores <- summarise_scores(scores, by = c("model", "target_type"))
plot_heatmap(scores, x = "target_type", metric = "bias")
}
diff --git a/man/plot_interval_coverage.Rd b/man/plot_interval_coverage.Rd
index 9c7da16fe..6c4c3e985 100644
--- a/man/plot_interval_coverage.Rd
+++ b/man/plot_interval_coverage.Rd
@@ -21,8 +21,8 @@ ggplot object with a plot of interval coverage
Plot interval coverage
}
\examples{
-data.table::setDTthreads(1) # only needed to avoid issues on CRAN
-scores <- score(example_quantile)
-scores <- summarise_scores(scores, by = c("model", "range"))
-plot_interval_coverage(scores)
+# data.table::setDTthreads(1) # only needed to avoid issues on CRAN
+# scores <- score(example_quantile)
+# scores <- summarise_scores(scores, by = c("model", "range"))
+# plot_interval_coverage(scores)
}
diff --git a/man/plot_quantile_coverage.Rd b/man/plot_quantile_coverage.Rd
index 2e6ef489e..c479fb5e3 100644
--- a/man/plot_quantile_coverage.Rd
+++ b/man/plot_quantile_coverage.Rd
@@ -21,7 +21,7 @@ ggplot object with a plot of interval coverage
Plot quantile coverage
}
\examples{
-scores <- score(example_quantile)
-scores <- summarise_scores(scores, by = c("model", "quantile"))
-plot_quantile_coverage(scores)
+# scores <- score(example_quantile)
+# scores <- summarise_scores(scores, by = c("model", "quantile"))
+# plot_quantile_coverage(scores)
}
diff --git a/man/plot_ranges.Rd b/man/plot_ranges.Rd
index a4a999ff2..27922b3c0 100644
--- a/man/plot_ranges.Rd
+++ b/man/plot_ranges.Rd
@@ -4,7 +4,7 @@
\alias{plot_ranges}
\title{Plot Metrics by Range of the Prediction Interval}
\usage{
-plot_ranges(scores, y = "interval_score", x = "model", colour = "range")
+plot_ranges(scores, y = "wis", x = "model", colour = "range")
}
\arguments{
\item{scores}{A data.frame of scores based on quantile forecasts as
@@ -12,7 +12,7 @@ produced by \code{\link[=score]{score()}} or \code{\link[=summarise_scores]{summ
in the \code{by} argument when running \code{\link[=summarise_scores]{summarise_scores()}}}
\item{y}{The variable from the scores you want to show on the y-Axis.
-This could be something like "interval_score" (the default) or "dispersion"}
+This could be something like "wis" (the default) or "dispersion"}
\item{x}{The variable from the scores you want to show on the x-Axis.
Usually this will be "model"}
@@ -31,13 +31,13 @@ sharpness / dispersion changes by range.
}
\examples{
library(ggplot2)
-scores <- score(example_quantile)
-scores <- summarise_scores(scores, by = c("model", "target_type", "range"))
+# scores <- score(example_quantile)
+# scores <- summarise_scores(scores, by = c("model", "target_type", "range"))
-plot_ranges(scores, x = "model") +
- facet_wrap(~target_type, scales = "free")
+# plot_ranges(scores, x = "model") +
+# facet_wrap(~target_type, scales = "free")
# visualise dispersion instead of interval score
-plot_ranges(scores, y = "dispersion", x = "model") +
- facet_wrap(~target_type)
+# plot_ranges(scores, y = "dispersion", x = "model") +
+# facet_wrap(~target_type)
}
diff --git a/man/score.Rd b/man/score.Rd
index 5cf7a1b14..c26fbb24c 100644
--- a/man/score.Rd
+++ b/man/score.Rd
@@ -7,7 +7,6 @@
\alias{score.scoringutils_point}
\alias{score.scoringutils_sample}
\alias{score.scoringutils_quantile}
-\alias{score.scoringutils_quantile_new}
\title{Evaluate forecasts in a data.frame format}
\usage{
score(data, ...)
@@ -20,9 +19,7 @@ score(data, ...)
\method{score}{scoringutils_sample}(data, metrics = metrics_sample, ...)
-\method{score}{scoringutils_quantile}(data, metrics = NULL, ...)
-
-\method{score}{scoringutils_quantile_new}(data, metrics = metrics_quantile, ...)
+\method{score}{scoringutils_quantile}(data, metrics = metrics_quantile, ...)
}
\arguments{
\item{data}{A data.frame or data.table with predicted and observed values.}
diff --git a/man/score_quantile.Rd b/man/score_quantile.Rd
deleted file mode 100644
index 5f51f94ec..000000000
--- a/man/score_quantile.Rd
+++ /dev/null
@@ -1,62 +0,0 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/score_quantile.R
-\name{score_quantile}
-\alias{score_quantile}
-\title{Evaluate forecasts in a Quantile-Based Format}
-\usage{
-score_quantile(
- data,
- forecast_unit,
- metrics,
- weigh = TRUE,
- count_median_twice = FALSE,
- separate_results = TRUE
-)
-}
-\arguments{
-\item{data}{A data.frame or data.table with predicted and observed values.}
-
-\item{forecast_unit}{A character vector with the column names that define
-the unit of a single forecast, i.e. a forecast was made for a combination
-of the values in \code{forecast_unit}}
-
-\item{metrics}{A named list of scoring functions. Names will be used as
-column names in the output. See \code{\link[=metrics_point]{metrics_point()}}, \code{\link[=metrics_binary]{metrics_binary()}},
-\code{metrics_quantile()}, and \code{\link[=metrics_sample]{metrics_sample()}} for more information on the
-default metrics used.}
-
-\item{weigh}{if TRUE, weigh the score by alpha / 2, so it can be averaged
-into an interval score that, in the limit, corresponds to CRPS. Alpha is the
-decimal value that represents how much is outside a central prediction
-interval (e.g. for a 90 percent central prediction interval, alpha is 0.1)
-Default: \code{TRUE}.}
-
-\item{count_median_twice}{logical that controls whether or not to count the
-median twice when summarising (default is \code{FALSE}). Counting the
-median twice would conceptually treat it as a 0\\% prediction interval, where
-the median is the lower as well as the upper bound. The alternative is to
-treat the median as a single quantile forecast instead of an interval. The
-interval score would then be better understood as an average of quantile
-scores.}
-
-\item{separate_results}{if \code{TRUE} (default is \code{FALSE}), then the separate
-parts of the interval score (dispersion penalty, penalties for over- and
-under-prediction get returned as separate elements of a list). If you want a
-\code{data.frame} instead, simply call \code{\link[=as.data.frame]{as.data.frame()}} on the output.}
-}
-\value{
-A data.table with appropriate scores. For more information see
-\code{\link[=score]{score()}}
-}
-\description{
-Evaluate forecasts in a Quantile-Based Format
-}
-\references{
-Bosse NI, Gruson H, Cori A, van Leeuwen E, Funk S, Abbott S
-(2022) Evaluating Forecasts with scoringutils in R.
-\doi{10.48550/arXiv.2205.07090}
-}
-\author{
-Nikos Bosse \email{nikosbosse@gmail.com}
-}
-\keyword{internal}
diff --git a/man/se_mean_sample.Rd b/man/se_mean_sample.Rd
index d7c7d332f..c2101d4df 100644
--- a/man/se_mean_sample.Rd
+++ b/man/se_mean_sample.Rd
@@ -20,14 +20,15 @@ vector with the scoring values
Squared error of the mean calculated as
\deqn{
- \textrm{mean}(\textrm{observed} - \textrm{prediction})^2
+ \textrm{mean}(\textrm{observed} - \textrm{mean prediction})^2
}{
- mean(observed - mean_prediction)^2
+ mean(observed - mean prediction)^2
}
+The mean prediction is calculated as the mean of the predictive samples.
}
\examples{
observed <- rnorm(30, mean = 1:30)
-predicted_values <- rnorm(30, mean = 1:30)
+predicted_values <- matrix(rnorm(30, mean = 1:30))
se_mean_sample(observed, predicted_values)
}
\seealso{
diff --git a/man/wis.Rd b/man/wis.Rd
index a76b3c35f..057e9d04a 100644
--- a/man/wis.Rd
+++ b/man/wis.Rd
@@ -24,9 +24,14 @@ overprediction(observed, predicted, quantile)
underprediction(observed, predicted, quantile)
}
\arguments{
-\item{observed}{A vector with observed values of size n}
+\item{observed}{numeric vector of size n with the observed values}
-\item{predicted}{vector of size n with the predicted values}
+\item{predicted}{numeric nxN matrix of predictive
+quantiles, n (number of rows) being the number of forecasts (corresponding
+to the number of observed values) and N
+(number of columns) the number of quantiles per forecast.
+If \code{observed} is just a single number, then predicted can just be a
+vector of size N.}
\item{quantile}{vector with quantile levels of size N}
@@ -46,8 +51,8 @@ Default: \code{TRUE}.}
\item{na.rm}{if TRUE, ignore NA values when computing the score}
}
\value{
-\code{wis()}: a numeric vector with WIS values (one per observation), or a list
-with separate entries if \code{separate_results} is \code{TRUE}.
+\code{wis()}: a numeric vector with WIS values of size n (one per observation),
+or a list with separate entries if \code{separate_results} is \code{TRUE}.
\code{dispersion()}: a numeric vector with dispersion values (one per observation)
diff --git a/tests/testthat/_snaps/plot_correlation/plot-correlation.svg b/tests/testthat/_snaps/plot_correlation/plot-correlation.svg
index f56619980..a33e8a207 100644
--- a/tests/testthat/_snaps/plot_correlation/plot-correlation.svg
+++ b/tests/testthat/_snaps/plot_correlation/plot-correlation.svg
@@ -25,104 +25,146 @@
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-1
-0.46
-1
-0.28
-0.15
-1
-0.94
-0.32
--0.03
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+1
+0.94
+1
+0.28
+-0.03
+1
+0.46
+0.32
+0.15
+1
+0.11
+0.22
+-0.35
+0.11
1
--0.34
--0.12
--0.33
--0.25
-1
-0.11
-0.11
--0.35
-0.22
-0.06
-1
-0.99
-0.54
-0.34
-0.9
--0.38
-0.1
-1
+-0.21
+-0.15
+-0.21
+-0.09
+0.01
+1
+-0.41
+-0.32
+-0.36
+-0.09
+0.1
+0.37
+1
+-0.34
+-0.25
+-0.33
+-0.12
+0.06
+0.85
+0.64
+1
+0.99
+0.9
+0.34
+0.54
+0.1
+-0.25
+-0.41
+-0.38
+1
-interval_score
-dispersion
-underprediction
-overprediction
-coverage_deviation
-bias
-ae_median
-
-
-
+wis
+overprediction
+underprediction
+dispersion
+bias
+coverage_50
+coverage_90
+coverage_deviation
+ae_median
+
+
+
+
-
-
-
+
+
+
+
-
-
-
+
+
+
+
-
-
-
-ae_median
-bias
-coverage_deviation
-overprediction
-underprediction
-dispersion
-interval_score
-
-0.0
-0.5
+
+
+
+
+ae_median
+coverage_deviation
+coverage_90
+coverage_50
+bias
+dispersion
+underprediction
+overprediction
+wis
+
+0.0
+0.5
1.0
Correlation
-
-
+
+
-
-
+
+
plot__correlation
diff --git a/tests/testthat/_snaps/plot_heatmap/plot-heatmap.svg b/tests/testthat/_snaps/plot_heatmap/plot-heatmap.svg
index 528aefb09..8c4222d9a 100644
--- a/tests/testthat/_snaps/plot_heatmap/plot-heatmap.svg
+++ b/tests/testthat/_snaps/plot_heatmap/plot-heatmap.svg
@@ -25,174 +25,20 @@
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
-
-
-
-
-
-
-
-
-
-
-
-0.1
-0.1
-0.1
-0.1
-0.1
-0.1
-0.1
-0.1
-0.1
-0.1
-0.1
-0.1
--0.06
--0.06
--0.06
--0.06
--0.06
--0.06
--0.06
--0.06
--0.06
-0.06
--0.06
--0.06
--0.08
--0.08
--0.08
--0.08
--0.08
--0.08
--0.08
--0.08
--0.08
--0.08
--0.08
+0.1
-0.08
-0.34
-0.34
-0.34
-0.34
-0.34
-0.34
-0.34
-0.34
-0.34
-0.34
-0.34
-0.34
-0.07
-0.07
-0.07
-0.07
0.07
-0.07
-0.07
-0.07
-0.07
-0.07
-0.07
-0.07
--0.02
--0.02
--0.02
--0.02
--0.02
--0.02
--0.02
--0.02
--0.02
--0.02
--0.02
+0.34
-0.02
-0.01
--0.01
--0.01
--0.01
--0.01
--0.01
--0.01
--0.01
--0.01
--0.01
--0.01
--0.01
diff --git a/tests/testthat/_snaps/plot_interval_coverage/plot-interval-coverage.svg b/tests/testthat/_snaps/plot_interval_coverage/plot-interval-coverage.svg
index 548878c34..91848b1dd 100644
--- a/tests/testthat/_snaps/plot_interval_coverage/plot-interval-coverage.svg
+++ b/tests/testthat/_snaps/plot_interval_coverage/plot-interval-coverage.svg
@@ -57,15 +57,17 @@
100
Nominal interval coverage
% Obs inside interval
-model
-
-
-
-
-EuroCOVIDhub-baseline
-EuroCOVIDhub-ensemble
-UMass-MechBayes
-epiforecasts-EpiNow2
+model
+
+
+
+
+
+EuroCOVIDhub-baseline
+EuroCOVIDhub-ensemble
+UMass-MechBayes
+epiforecasts-EpiNow2
+NA
plot_interval_coverage
diff --git a/tests/testthat/_snaps/plot_pairwise_comparison/plot-pairwise-comparison-pval.svg b/tests/testthat/_snaps/plot_pairwise_comparison/plot-pairwise-comparison-pval.svg
index 22e2408dc..458487011 100644
--- a/tests/testthat/_snaps/plot_pairwise_comparison/plot-pairwise-comparison-pval.svg
+++ b/tests/testthat/_snaps/plot_pairwise_comparison/plot-pairwise-comparison-pval.svg
@@ -28,8 +28,8 @@
-
+
@@ -37,8 +37,8 @@
< 0.001
< 0.001
1
-0.298
< 0.001
+0.298
1
0.298
< 0.001
@@ -56,9 +56,9 @@
+
-
@@ -72,9 +72,9 @@
< 0.001
< 0.001
1
+< 0.001
< 0.001
< 0.001
-< 0.001
1
0.007
< 0.001
diff --git a/tests/testthat/_snaps/plot_pairwise_comparison/plot-pairwise-comparison.svg b/tests/testthat/_snaps/plot_pairwise_comparison/plot-pairwise-comparison.svg
index 1ff397cfe..3a7599da4 100644
--- a/tests/testthat/_snaps/plot_pairwise_comparison/plot-pairwise-comparison.svg
+++ b/tests/testthat/_snaps/plot_pairwise_comparison/plot-pairwise-comparison.svg
@@ -28,8 +28,8 @@
-
+
@@ -37,8 +37,8 @@
1.37
1.59
1
-0.86
0.63
+0.86
1
1.16
0.73
@@ -56,9 +56,9 @@
+
-
@@ -72,9 +72,9 @@
3.03
3.85
1
+0.26
0.62
0.79
-0.26
1
0.74
1.27
diff --git a/tests/testthat/_snaps/plot_quantile_coverage/plot-quantile-coverage.svg b/tests/testthat/_snaps/plot_quantile_coverage/plot-quantile-coverage.svg
index bf686eedb..76808cc67 100644
--- a/tests/testthat/_snaps/plot_quantile_coverage/plot-quantile-coverage.svg
+++ b/tests/testthat/_snaps/plot_quantile_coverage/plot-quantile-coverage.svg
@@ -57,15 +57,17 @@
1.00
Quantile
% Obs below quantile
-model
-
-
-
-
-EuroCOVIDhub-baseline
-EuroCOVIDhub-ensemble
-UMass-MechBayes
-epiforecasts-EpiNow2
+model
+
+
+
+
+
+EuroCOVIDhub-baseline
+EuroCOVIDhub-ensemble
+UMass-MechBayes
+epiforecasts-EpiNow2
+NA
plot_quantile_coverage
diff --git a/tests/testthat/_snaps/plot_ranges/plot-ranges-dispersion.svg b/tests/testthat/_snaps/plot_ranges/plot-ranges-dispersion.svg
index 812e1600f..4ad667f8a 100644
--- a/tests/testthat/_snaps/plot_ranges/plot-ranges-dispersion.svg
+++ b/tests/testthat/_snaps/plot_ranges/plot-ranges-dispersion.svg
@@ -25,42 +25,42 @@
-
-
-
-
-
-
-
-
-
-
-
+
-
-
-
-
-
-
-
-
-
-
+
-
-
-
-
-
-
-
-
-
-
-
+
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
@@ -82,54 +82,54 @@
-
-
-
-
-
-
-
-
-
-
-
+
-
-
-
-
-
-
-
-
-
-
+
+
-
-
-
-
-
-
-
-
-
-
-
+
-
-
-
-
-
-
-
-
-
-
-
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/tests/testthat/_snaps/plot_ranges/plot-ranges-interval.svg b/tests/testthat/_snaps/plot_ranges/plot-ranges-interval.svg
index 15c9ee3e3..98a9a883c 100644
--- a/tests/testthat/_snaps/plot_ranges/plot-ranges-interval.svg
+++ b/tests/testthat/_snaps/plot_ranges/plot-ranges-interval.svg
@@ -25,42 +25,42 @@
-
-
-
-
-
-
-
-
-
-
-
+
-
-
-
-
-
-
-
-
-
-
+
-
-
-
-
-
-
-
-
-
-
-
+
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
@@ -82,54 +82,54 @@
-
-
-
-
-
-
-
-
-
-
-
+
-
-
-
-
-
-
-
-
-
-
+
+
-
-
-
-
-
-
-
-
-
-
-
+
-
-
-
-
-
-
-
-
-
-
-
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
@@ -203,7 +203,7 @@
model
-interval_score
+wis
0
25
diff --git a/tests/testthat/_snaps/plot_score_table/plot-score-table.svg b/tests/testthat/_snaps/plot_score_table/plot-score-table.svg
index 95f9fe247..25b15d296 100644
--- a/tests/testthat/_snaps/plot_score_table/plot-score-table.svg
+++ b/tests/testthat/_snaps/plot_score_table/plot-score-table.svg
@@ -25,62 +25,78 @@
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-10000
-9000
-50
-10000
-2000
-2000
-30
-3000
-5000
-2000
-20
-2000
-7000
-5000
-9
-6000
-0.002
-0.05
--0.02
--0.06
-0.2
-0.008
--0.02
--0.04
-20000
-10000
-80
-10000
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+9000
+10000
+10000
+50
+5000
+7000
+6000
+9
+2000
+5000
+2000
+20
+2000
+2000
+3000
+30
+0.008
+0.2
+-0.04
+-0.02
+0.6
+0.5
+0.4
+0.5
+0.9
+0.9
+0.8
+0.9
+0.05
+0.002
+-0.06
+-0.02
+10000
+20000
+10000
+80
@@ -93,20 +109,24 @@
-
-
-
+
+
+
+
-
-
-
-interval_score
-dispersion
-underprediction
-overprediction
-coverage_deviation
-bias
-ae_median
+
+
+
+
+wis
+overprediction
+underprediction
+dispersion
+bias
+coverage_50
+coverage_90
+coverage_deviation
+ae_median
plot_score_table
diff --git a/tests/testthat/_snaps/plot_wis/plot-wis-flip.svg b/tests/testthat/_snaps/plot_wis/plot-wis-flip.svg
index 758a3c147..c315cf06d 100644
--- a/tests/testthat/_snaps/plot_wis/plot-wis-flip.svg
+++ b/tests/testthat/_snaps/plot_wis/plot-wis-flip.svg
@@ -25,14 +25,14 @@
-
+
-
+
-
+
@@ -43,16 +43,16 @@
-
+
-
+
-
+
diff --git a/tests/testthat/_snaps/plot_wis/plot-wis-no-relative.svg b/tests/testthat/_snaps/plot_wis/plot-wis-no-relative.svg
index 987072ca4..fea309214 100644
--- a/tests/testthat/_snaps/plot_wis/plot-wis-no-relative.svg
+++ b/tests/testthat/_snaps/plot_wis/plot-wis-no-relative.svg
@@ -25,14 +25,14 @@
-
+
-
+
-
+
@@ -43,16 +43,16 @@
-
+
-
+
-
+
diff --git a/tests/testthat/_snaps/plot_wis/plot-wis.svg b/tests/testthat/_snaps/plot_wis/plot-wis.svg
index 5328b4779..a2bdf8653 100644
--- a/tests/testthat/_snaps/plot_wis/plot-wis.svg
+++ b/tests/testthat/_snaps/plot_wis/plot-wis.svg
@@ -25,14 +25,14 @@
-
+
-
+
-
+
@@ -43,16 +43,16 @@
-
+
-
+
-
+
diff --git a/tests/testthat/setup.R b/tests/testthat/setup.R
index c157fb958..a236f299d 100644
--- a/tests/testthat/setup.R
+++ b/tests/testthat/setup.R
@@ -1,8 +1,13 @@
# load common required test packages
library(ggplot2, quietly = TRUE)
+library(data.table)
suppressMessages(library(magrittr))
-# compute quantile scores
+metrics_no_cov <- metrics_quantile[!grepl("coverage", names(metrics_quantile))]
+metrics_no_cov_no_ae <- metrics_no_cov[!grepl("ae", names(metrics_no_cov))]
+
+
+# compute scores
scores_quantile <- suppressMessages(score(example_quantile))
scores_continuous <- suppressMessages(score(data = example_continuous))
scores_point <- suppressMessages(score(example_point))
diff --git a/tests/testthat/test-add_coverage.R b/tests/testthat/test-add_coverage.R
index fab1e72a1..689b8640b 100644
--- a/tests/testthat/test-add_coverage.R
+++ b/tests/testthat/test-add_coverage.R
@@ -1,31 +1,13 @@
-ex_coverage <- scores_quantile[model == "EuroCOVIDhub-ensemble"]
+ex_coverage <- example_quantile[model == "EuroCOVIDhub-ensemble"]
test_that("add_coverage() works as expected", {
- expect_error(
- add_coverage(ex_coverage, by = c("model", "target_type"), range = c())
- )
- expect_error(
- add_coverage(ex_coverage, by = c("model", "target_type")), NA
- )
- cov <- add_coverage(
- scores_quantile, by = c("model", "target_type"), range = c(10, 20)
- )
- expect_equal(
- grep("coverage_", colnames(cov), value = TRUE),
- c("coverage_deviation", "coverage_10", "coverage_20")
- )
-})
-
+ expect_no_condition(cov <- add_coverage(example_quantile))
-test_that("Order of `add_coverage()` and `summarise_scores()` doesn't matter", {
- # Need to update test. Turns out the order does matter...
- # see https://github.com/epiforecasts/scoringutils/issues/367
- pw1 <- add_coverage(ex_coverage, by = "model")
- pw1_sum <- summarise_scores(pw1, by = "model")
-
- pw2 <- summarise_scores(ex_coverage, by = "model")
- pw2 <- add_coverage(pw2)
+ required_names <- c(
+ "range", "interval_coverage", "interval_coverage_deviation",
+ "quantile_coverage", "quantile_coverage_deviation"
+ )
+ expect_equal(colnames(cov), c(colnames(example_quantile), required_names))
- # expect_true(all(pw1_sum == pw2, na.rm = TRUE))
- # expect_true(all(names(attributes(pw2)) == names(attributes(pw1_sum))))
+ expect_equal(nrow(cov), nrow(example_quantile))
})
diff --git a/tests/testthat/test-bias.R b/tests/testthat/test-bias.R
deleted file mode 100644
index 42eefa34f..000000000
--- a/tests/testthat/test-bias.R
+++ /dev/null
@@ -1,311 +0,0 @@
-test_that("bias_sample() throws an error when missing observed", {
- observed <- rpois(10, lambda = 1:10)
- predicted <- replicate(50, rpois(n = 10, lambda = 1:10))
-
- expect_error(
- bias_sample(predicted = predicted),
- 'argument "observed" is missing, with no default'
- )
-})
-
-test_that("bias_sample() throws an error when missing 'predicted'", {
- observed <- rpois(10, lambda = 1:10)
- predicted <- replicate(50, rpois(n = 10, lambda = 1:10))
-
- expect_error(
- bias_sample(observed = observed),
- 'argument "predicted" is missing, with no default'
- )
-})
-
-test_that("bias_sample() works for integer observed and predicted", {
- observed <- rpois(10, lambda = 1:10)
- predicted <- replicate(10, rpois(10, lambda = 1:10))
- output <- bias_sample(
- observed = observed,
- predicted = predicted
- )
- expect_equal(
- length(output),
- length(observed)
- )
- expect_equal(
- class(output),
- "numeric"
- )
-})
-
-test_that("bias_sample() works for continuous observed values and predicted", {
- observed <- rnorm(10)
- predicted <- replicate(10, rnorm(10))
- output <- bias_sample(
- observed = observed,
- predicted = predicted
- )
- expect_equal(
- length(output),
- length(observed)
- )
- expect_equal(
- class(output),
- "numeric"
- )
-})
-
-test_that("bias_sample() works as expected", {
- observed <- rpois(30, lambda = 1:30)
- predicted <- replicate(200, rpois(n = 30, lambda = 1:30))
- expect_true(all(bias_sample(observed, predicted) == bias_sample(observed, predicted)))
-
- ## continuous forecasts
- observed <- rnorm(30, mean = 1:30)
- predicted <- replicate(200, rnorm(30, mean = 1:30))
-
- scoringutils2 <- bias_sample(observed, predicted)
- scoringutils <- bias_sample(observed, predicted)
-
- expect_equal(scoringutils, scoringutils2)
-})
-
-
-test_that("bias_quantile() works as expected", {
- predicted <- c(1, 2, 3)
- quantiles <- c(0.1, 0.5, 0.9)
- expect_equal(
- bias_quantile(observed = 2, predicted, quantiles),
- 0
- )
- predicted <- c(0, 1, 2)
- quantiles <- c(0.1, 0.5, 0.9)
- expect_equal(
- bias_quantile(observed = 2, predicted, quantiles),
- -0.8
- )
-
- predicted <- c(
- 705.500, 1127.000, 4006.250, 4341.500, 4709.000, 4821.996,
- 5340.500, 5451.000, 5703.500, 6087.014, 6329.500, 6341.000,
- 6352.500, 6594.986, 6978.500, 7231.000, 7341.500, 7860.004,
- 7973.000, 8340.500, 8675.750, 11555.000, 11976.500
- )
-
- quantile <- c(0.01, 0.025, seq(0.05, 0.95, 0.05), 0.975, 0.99)
-
- observed <- 8062
- expect_equal(bias_quantile(observed, predicted, quantile), -0.8)
-})
-
-test_that("bias_quantile handles matrix input", {
- observed <- seq(10, 0, length.out = 4)
- predicted <- matrix(1:12, ncol = 3)
- quantiles <- c(0.1, 0.5, 0.9)
- expect_equal(
- bias_quantile(observed, predicted, quantiles),
- c(-1.0, -0.8, 0.8, 1.0)
- )
-})
-
-
-test_that("bias_quantile() handles vector that is too long", {
- predicted <- c(NA, 1, 2, 3)
- quantiles <- c(0.1, 0.5, 0.9)
-
- expect_error(
- bias_quantile(observed = 2, predicted, quantiles),
- "Assertion on 'quantile' failed: Must have length 4, but has length 3."
- )
-})
-
-test_that("bias_quantile() handles NA values", {
- predicted <- c(NA, 1, 2)
- quantiles <- c(0.1, 0.5, 0.9)
- expect_equal(
- bias_quantile(observed = 2, predicted, quantiles),
- -0.8
- )
- predicted <- c(0, 1, 2)
- quantiles <- c(0.1, 0.5, NA)
- expect_equal(
- bias_quantile(observed = 2, predicted, quantiles),
- -1
- )
- expect_equal(
- bias_quantile(observed = 2, predicted, quantiles, na.rm = FALSE),
- NA_real_
- )
-})
-
-test_that("bias_quantile() errors if no predictions", {
- expect_error(
- bias_quantile(observed = 2, numeric(0), numeric(0)),
- "Assertion on 'quantile' failed: Must have length >= 1, but has length 0"
- )
-})
-
-test_that("bias_quantile() returns correct bias if value below the median", {
- predicted <- c(1, 2, 4, 5)
- quantiles <- c(0.1, 0.3, 0.7, 0.9)
- suppressMessages(
- expect_equal(bias_quantile(observed = 1, predicted, quantiles), 0.8)
- )
-})
-
-test_that("bias_quantile() returns correct bias if value above median", {
- predicted <- c(1, 2, 4, 5)
- quantiles <- c(0.1, 0.3, 0.7, 0.9)
- suppressMessages(
- expect_equal(bias_quantile(observed = 5, predicted, quantiles), -0.8)
- )
-})
-
-test_that("bias_quantile() returns correct bias if value at the median", {
- predicted <- c(1, 2, 3, 4)
- quantiles <- c(0.1, 0.3, 0.5, 0.7)
-
- expect_equal(bias_quantile(observed = 3, predicted, quantiles), 0)
-})
-
-test_that("bias_quantile() returns 1 if true value below min prediction", {
- predicted <- c(2, 3, 4, 5)
- quantiles <- c(0.1, 0.3, 0.7, 0.9)
-
- suppressMessages(
- expect_equal(bias_quantile(observed = 1, predicted, quantiles), 1)
- )
-})
-
-test_that("bias_quantile() returns -1 if true value above max prediction", {
- predicted <- c(1, 2, 3, 4)
- quantiles <- c(0.1, 0.3, 0.5, 0.7)
-
- expect_equal(bias_quantile(observed = 6, predicted, quantiles), -1)
-})
-
-test_that("bias_quantile(): quantiles must be between 0 and 1", {
- predicted <- 1:4
-
- # Failing example
- quantiles <- c(-0.1, 0.3, 0.5, 0.8)
- expect_error(
- bias_quantile(observed = 3, predicted, quantiles),
- "Assertion on 'quantile' failed: Element 1 is not >= 0."
- )
-
- # Passing counter example
- quantiles <- c(0.1, 0.3, 0.5, 0.8)
- expect_silent(bias_quantile(observed = 3, predicted, quantiles))
-})
-
-test_that("bias_quantile(): quantiles must be increasing", {
- predicted <- 1:4
-
- # Failing example
- quantiles <- c(0.8, 0.3, 0.5, 0.9)
- expect_error(
- bias_quantile(observed = 3, predicted, quantiles),
- "Predictions must not be decreasing with increasing quantile level"
- )
-
- # Passing counter example
- quantiles <- c(0.3, 0.5, 0.8, 0.9)
- expect_silent(bias_quantile(observed = 3, predicted, quantiles))
-})
-
-test_that("bias_quantile(): predictions must be increasing", {
- predicted <- c(1, 2, 4, 3)
- quantiles <- c(0.1, 0.3, 0.5, 0.9)
-
- expect_error(
- bias_quantile(observed = 3, predicted, quantiles),
- "Predictions must not be decreasing with increasing quantile level"
- )
- expect_silent(bias_quantile( observed = 3, 1:4, quantiles))
-})
-
-test_that("bias_quantile(): quantiles must be unique", {
- predicted <- 1:4
-
- # Failing example
- quantiles <- c(0.3, 0.3, 0.5, 0.8)
- expect_error(
- bias_quantile(observed = 3, predicted, quantiles),
- "Assertion on 'quantile' failed: Contains duplicated values, position 2."
- )
-
- # Passing example
- quantiles <- c(0.3, 0.5, 0.8, 0.9)
- expect_silent(bias_quantile(observed = 3, predicted, quantiles))
-})
-
-test_that("bias_sample() approx equals bias_quantile() for many samples", {
- set.seed(123)
-
- # Generate true value
- observed <- 3
-
- # Generate many sample predictions
- predicted <- sample(rnorm(1000, mean = observed, sd = 2), 1000)
-
- # Get sample based bias
- bias_sample_result <- bias_sample(
- observed, matrix(predicted, nrow = 1)
- )
-
- # Convert predictions to quantiles
- quantiles <- seq(0, 1, length.out = 100)
- quantile_preds <- quantile(predicted, probs = quantiles)
-
- # Get quantile based bias
- bias_quantile_result <- suppressMessages(
- bias_quantile(observed, quantile_preds, quantiles)
- )
-
- # Difference should be small
- expect_equal(bias_quantile_result, bias_sample_result, tolerance = 0.1)
-})
-
-test_that("bias_quantile() and bias_range() give the same result", {
- predicted <- sort(rnorm(23))
- lower <- rev(predicted[1:12])
- upper <- predicted[12:23]
-
- range <- c(0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 95, 98)
- quantiles <- c(0.01, 0.025, seq(0.05, 0.95, 0.05), 0.975, 0.99)
- observed <- rnorm(1)
-
- range_bias <- bias_range(
- lower = lower, upper = upper,
- range = range, observed = observed
- )
- range_quantile <- bias_quantile(
- observed = observed,
- predicted = predicted,
- quantile = quantiles
- )
- expect_equal(range_bias, range_quantile)
-})
-
-test_that("bias_range() works with point forecasts", {
- predicted <- 1
- observed <- 1
- range <- c(0)
-
- expect_equal(bias_range(predicted, predicted, range, observed), 0)
-})
-
-test_that("bias_range(): ranges must be between 0 and 100", {
- lower <- 4:1
- upper <- 5:8
-
- # Failing example
- range <- c(-10, 0, 10, 20)
- expect_error(
- bias_range(lower, upper, range, observed = 3),
- "range must be between 0 and 100"
- )
-
- # Passing counter example
- range <- c(0, 10, 20, 30)
- expect_silent(bias_range(lower, upper, range, observed = 3))
-})
-
diff --git a/tests/testthat/test-convenience-functions.R b/tests/testthat/test-convenience-functions.R
index ecfc86653..98d784cd4 100644
--- a/tests/testthat/test-convenience-functions.R
+++ b/tests/testthat/test-convenience-functions.R
@@ -38,46 +38,55 @@ test_that("function transform_forecasts works", {
})
+# ============================================================================ #
+# `set_forecast_unit()`
+# ============================================================================ #
test_that("function set_forecast_unit() works", {
-
# some columns in the example data have duplicated information. So we can remove
# these and see whether the result stays the same.
-
- scores1 <- suppressMessages(score(example_quantile))
- scores1 <- scores1[order(location, target_end_date, target_type, horizon, model), ]
+ scores1 <- scores_quantile[order(location, target_end_date, target_type, horizon, model), ]
ex2 <- set_forecast_unit(
example_quantile,
c("location", "target_end_date", "target_type", "horizon", "model")
)
- scores2 <- suppressMessages(score(ex2))
+ scores2 <- score(ex2)
scores2 <- scores2[order(location, target_end_date, target_type, horizon, model), ]
expect_equal(scores1$interval_score, scores2$interval_score)
})
+test_that("set_forecast_unit() works on input that's not a data.table", {
+ df <- data.frame(
+ a = 1:2,
+ b = 2:3,
+ c = 3:4
+ )
+ expect_equal(
+ colnames(set_forecast_unit(df, c("a", "b"))),
+ c("a", "b")
+ )
+ # apparently it also works on a matrix... good to know :)
+ expect_equal(
+ names(set_forecast_unit(as.matrix(df), "a")),
+ "a"
+ )
+})
-test_that("function set_forecast_unit() gives warning when column is not there", {
+test_that("function set_forecast_unit() gives warning when column is not there", {
expect_warning(
set_forecast_unit(
example_quantile,
- c("location", "target_end_date", "target_type", "horizon", "model", "test")
+ c("location", "target_end_date", "target_type", "horizon", "model", "test1", "test2")
)
)
})
-
test_that("function get_forecast_unit() and set_forecast_unit() work together", {
-
fu_set <- c("location", "target_end_date", "target_type", "horizon", "model")
-
- ex <- set_forecast_unit(
- example_binary,
- fu_set
- )
-
+ ex <- set_forecast_unit(example_binary, fu_set)
fu_get <- get_forecast_unit(ex)
expect_equal(fu_set, fu_get)
})
diff --git a/tests/testthat/test-get_-functions.R b/tests/testthat/test-get_-functions.R
index 217e954bd..0a499b243 100644
--- a/tests/testthat/test-get_-functions.R
+++ b/tests/testthat/test-get_-functions.R
@@ -70,3 +70,42 @@ test_that("get_type() handles `NA` values", {
expect_equal(get_type(c(1, NA, 3.2)), "continuous")
expect_error(get_type(NA), "Can't get type: all values of are NA")
})
+
+
+# `get_duplicate_forecasts()` ==================================================
+test_that("get_duplicate_forecasts() works as expected for quantile", {
+ expect_equal(nrow(get_duplicate_forecasts(example_quantile)), 0)
+ expect_equal(
+ nrow(
+ get_duplicate_forecasts(rbind(example_quantile, example_quantile[1000:1010]))),
+ 22
+ )
+})
+
+test_that("get_duplicate_forecasts() works as expected for sample", {
+ expect_equal(nrow(get_duplicate_forecasts(example_continuous)), 0)
+ expect_equal(
+ nrow(
+ get_duplicate_forecasts(rbind(example_continuous, example_continuous[1040:1050]))),
+ 22
+ )
+})
+
+
+test_that("get_duplicate_forecasts() works as expected for binary", {
+ expect_equal(nrow(get_duplicate_forecasts(example_binary)), 0)
+ expect_equal(
+ nrow(
+ get_duplicate_forecasts(rbind(example_binary, example_binary[1000:1010]))),
+ 22
+ )
+})
+
+test_that("get_duplicate_forecasts() works as expected for point", {
+ expect_equal(nrow(get_duplicate_forecasts(example_binary)), 0)
+ expect_equal(
+ nrow(
+ get_duplicate_forecasts(rbind(example_point, example_point[1010:1020]))),
+ 22
+ )
+})
diff --git a/tests/testthat/test-get_duplicate_forecasts.R b/tests/testthat/test-get_duplicate_forecasts.R
deleted file mode 100644
index 5487ea0fb..000000000
--- a/tests/testthat/test-get_duplicate_forecasts.R
+++ /dev/null
@@ -1,37 +0,0 @@
-test_that("get_duplicate_forecasts() works as expected for quantile", {
- expect_equal(nrow(get_duplicate_forecasts(example_quantile)), 0)
- expect_equal(
- nrow(
- get_duplicate_forecasts(rbind(example_quantile, example_quantile[1000:1010]))),
- 22
- )
-})
-
-test_that("get_duplicate_forecasts() works as expected for sample", {
- expect_equal(nrow(get_duplicate_forecasts(example_continuous)), 0)
- expect_equal(
- nrow(
- get_duplicate_forecasts(rbind(example_continuous, example_continuous[1040:1050]))),
- 22
- )
-})
-
-
-test_that("get_duplicate_forecasts() works as expected for binary", {
- expect_equal(nrow(get_duplicate_forecasts(example_binary)), 0)
- expect_equal(
- nrow(
- get_duplicate_forecasts(rbind(example_binary, example_binary[1000:1010]))),
- 22
- )
-})
-
-test_that("get_duplicate_forecasts() works as expected for point", {
- expect_equal(nrow(get_duplicate_forecasts(example_binary)), 0)
- expect_equal(
- nrow(
- get_duplicate_forecasts(rbind(example_point, example_point[1010:1020]))),
- 22
- )
-})
-
diff --git a/tests/testthat/test-lower-level-check-functions.R b/tests/testthat/test-metrics-binary.R
similarity index 52%
rename from tests/testthat/test-lower-level-check-functions.R
rename to tests/testthat/test-metrics-binary.R
index 8d73aa528..9c49e7050 100644
--- a/tests/testthat/test-lower-level-check-functions.R
+++ b/tests/testthat/test-metrics-binary.R
@@ -1,45 +1,13 @@
-test_that("Lower-level input check functions work", {
- observed <- rpois(30, lambda = 1:30)
- predicted <- replicate(20, rpois(n = 30, lambda = 1:30))
- expect_equal(length(crps_sample(observed, predicted)), 30)
-
- # should error when wrong prediction type is given
- predicted2 <- rpois(30, lambda = 1)
- expect_error(crps_sample(observed, predicted2),
- "Assertion on 'predicted' failed: Must be of type 'matrix', not 'integer'",
- fixed = TRUE
- )
-
- # predictions have wrong number of rows
- predicted3 <- replicate(20, rpois(n = 31, lambda = 1))
- expect_error(
- crps_sample(observed, predicted3),
- "Assertion on 'predicted' failed: Must have exactly 30 rows, but has 31 rows.",
- # "Mismatch: 'observed' has length `30`, but 'predicted' has `31` rows.",
- fixed = TRUE
- )
-
- # error with missing argument
- expect_error(crps_sample(predicted = predicted),
- 'argument "observed" is missing, with no default',
- fixed = TRUE
- )
-
- # checks work for binary forecasts
- observed <- factor(sample(c(0, 1), size = 10, replace = TRUE))
- predicted <- runif(n = 10)
- expect_equal(length(brier_score(observed, predicted)), 10)
-
- # predictions are not between 0 and 1
- predicted2 <- predicted + 2
- expect_error(
- brier_score(observed, predicted2),
- "Assertion on 'predicted' failed: Element 1 is not <= 1.",
- fixed = TRUE
- )
-})
-
-
+observed <- factor(rbinom(10, size = 1, prob = 0.5))
+predicted <- c(0.425, 0.55, 0.541, 0.52, 0.13, 0.469, 0.86, 0.22, 0.74, 0.9)
+df <- data.table(
+ observed = observed,
+ predicted = predicted,
+ model = "m1",
+ id = 1:10
+)
+
+# test input handling
test_that("function throws an error when missing observed or predicted", {
observed <- sample(c(0, 1), size = 10, replace = TRUE)
predicted <- replicate(
@@ -83,17 +51,6 @@ test_that("function throws an error for wrong format of `observed`", {
})
test_that("function throws an error for wrong format of predictions", {
- observed <- factor(sample(c(0, 1), size = 10, replace = TRUE))
- predicted <- runif(10, min = 0, max = 3)
- expect_error(
- brier_score(
- observed = observed,
- predicted = predicted
- ),
- #"For a binary forecast, all predictions should be probabilities between 0 or 1."
- "Assertion on 'predicted' failed: Element 1 is not <= 1."
- )
-
predicted <- runif(10, min = 0, max = 1)
expect_error(
brier_score(
@@ -114,3 +71,59 @@ test_that("function throws an error for wrong format of predictions", {
fixed = TRUE
)
})
+
+test_that("Input checking for binary forecasts works", {
+ # everything correct
+ expect_no_condition(
+ scoringutils:::assert_input_binary(observed, predicted)
+ )
+
+ # predicted > 1
+ expect_error(
+ scoringutils:::assert_input_binary(observed, predicted + 1),
+ "Assertion on 'predicted' failed: Element 1 is not <= 1."
+ )
+
+ # predicted < 0
+ expect_error(
+ scoringutils:::assert_input_binary(observed, predicted - 1),
+ "Assertion on 'predicted' failed: Element 1 is not >= 0."
+ )
+
+ # observed value not factor
+ expect_error(
+ scoringutils:::assert_input_binary(1:10, predicted),
+ "Assertion on 'observed' failed: Must be of type 'factor', not 'integer'."
+ )
+
+ # observed value has not 2 levels
+ expect_error(
+ scoringutils:::assert_input_binary(factor(1:10), predicted),
+ "Assertion on 'observed' failed: Must have exactly 2 levels."
+ )
+
+ # observed is a single number and does not have the same length as predicted
+ expect_error(
+ scoringutils:::assert_input_binary(factor(1), predicted),
+ "`observed` and `predicted` need to be of same length when scoring binary forecasts",
+ fixed = TRUE
+ )
+
+ # predicted is a matrix
+ expect_error(
+ scoringutils:::assert_input_binary(observed, matrix(predicted)),
+ "Assertion on 'predicted' failed: Must be of type 'atomic vector', not 'matrix'."
+ )
+})
+
+test_that("Binary metrics work within and outside of `score()`", {
+ result <- score(df)
+ expect_equal(
+ brier_score(observed, predicted),
+ result$brier_score
+ )
+ expect_equal(
+ logs_binary(observed, predicted),
+ result$log_score
+ )
+})
diff --git a/tests/testthat/test-absolute_error.R b/tests/testthat/test-metrics-point.R
similarity index 82%
rename from tests/testthat/test-absolute_error.R
rename to tests/testthat/test-metrics-point.R
index f61493b25..2f64226df 100644
--- a/tests/testthat/test-absolute_error.R
+++ b/tests/testthat/test-metrics-point.R
@@ -1,25 +1,20 @@
-test_that("absolute error (sample based) works", {
- observed <- rnorm(30, mean = 1:30)
- predicted_values <- rnorm(30, mean = 1:30)
-
- scoringutils <- ae_median_sample(observed, predicted_values)
-
- ae <- abs(observed - predicted_values)
- expect_equal(ae, scoringutils)
-})
-
-
-# covidHubUtils-tests
+# covidHubUtils-tests on absolute error ========================================
+# tests are adapted from the package
+# covidHubUtils, https://github.com/reichlab/covidHubUtils/
+y <- c(1, -15, 22)
+forecast_quantiles_matrix <- rbind(
+ c(-1, 0, 1, 2, 3),
+ c(-2, 1, 2, 2, 4),
+ c(-2, 0, 3, 3, 4)
+)
+forecast_quantile_probs <- c(0.1, 0.25, 0.5, 0.75, 0.9)
+
+target_end_dates <- as.Date("2020-01-01") + c(7, 14, 7)
+horizons <- c("1", "2", "1")
+locations <- c("01", "01", "02")
+target_variables <- rep("inc death", length(y))
test_that("abs error is correct within score, point forecast only", {
- # test is adapted from the package covidHubUtils, https://github.com/reichlab/covidHubUtils/
- y <- c(1, -15, 22)
-
- target_end_dates <- as.Date("2020-01-01") + c(7, 14, 7)
- horizons <- c("1", "2", "1")
- locations <- c("01", "01", "02")
- target_variables <- rep("inc death", length(y))
-
forecast_target_end_dates <-
rep(target_end_dates, times = 1)
forecast_horizons <- rep(horizons, times = 1)
@@ -73,21 +68,9 @@ test_that("abs error is correct within score, point forecast only", {
})
test_that("abs error is correct, point and median forecasts different", {
- y <- c(1, -15, 22)
- forecast_quantiles_matrix <- rbind(
- c(-1, 0, 1, 2, 3),
- c(-2, 1, 2, 2, 4),
- c(-2, 0, 3, 3, 4)
- )
- forecast_quantile_probs <- c(0.1, 0.25, 0.5, 0.75, 0.9)
forecast_quantiles_matrix <- forecast_quantiles_matrix[, 3, drop = FALSE]
forecast_quantile_probs <- forecast_quantile_probs[3]
- target_end_dates <- as.Date("2020-01-01") + c(7, 14, 7)
- horizons <- c("1", "2", "1")
- locations <- c("01", "01", "02")
- target_variables <- rep("inc death", length(y))
-
forecast_target_end_dates <-
rep(target_end_dates, times = 1 + ncol(forecast_quantiles_matrix))
forecast_horizons <- rep(horizons, times = 1 + ncol(forecast_quantiles_matrix))
@@ -145,21 +128,9 @@ test_that("abs error is correct, point and median forecasts different", {
})
test_that("abs error is correct, point and median forecasts same", {
- y <- c(1, -15, 22)
- forecast_quantiles_matrix <- rbind(
- c(-1, 0, 1, 2, 3),
- c(-2, 1, 2, 2, 4),
- c(-2, 0, 3, 3, 4)
- )
- forecast_quantile_probs <- c(0.1, 0.25, 0.5, 0.75, 0.9)
forecast_quantiles_matrix <- forecast_quantiles_matrix[, 3, drop = FALSE]
forecast_quantile_probs <- forecast_quantile_probs[3]
- target_end_dates <- as.Date("2020-01-01") + c(7, 14, 7)
- horizons <- c("1", "2", "1")
- locations <- c("01", "01", "02")
- target_variables <- rep("inc death", length(y))
-
forecast_target_end_dates <-
rep(target_end_dates, times = 1 + ncol(forecast_quantiles_matrix))
forecast_horizons <- rep(horizons, times = 1 + ncol(forecast_quantiles_matrix))
@@ -196,7 +167,6 @@ test_that("abs error is correct, point and median forecasts same", {
stringsAsFactors = FALSE
)
-
# bring in scoringutils format
truth_scoringutils <- data.table::as.data.table(test_truth)
fc_scoringutils <- data.table::as.data.table(test_forecasts)
@@ -221,7 +191,5 @@ test_that("abs error is correct, point and median forecasts same", {
)
expected <- abs(y - point_forecast)
-
- # expect_equal(actual$abs_error, expected)
expect_equal(eval$ae_point, expected)
})
diff --git a/tests/testthat/test-interval_score.R b/tests/testthat/test-metrics-quantile.R
similarity index 51%
rename from tests/testthat/test-interval_score.R
rename to tests/testthat/test-metrics-quantile.R
index 2a75c6fdb..8dd6d6a22 100644
--- a/tests/testthat/test-interval_score.R
+++ b/tests/testthat/test-metrics-quantile.R
@@ -1,12 +1,84 @@
-test_that("wis works, median only", {
+observed <- c(1, -15, 22)
+predicted <- rbind(
+ c(-1, 0, 1, 2, 3),
+ c(-2, 1, 2, 2, 4),
+ c(-2, 0, 3, 3, 4)
+)
+quantile <- c(0.1, 0.25, 0.5, 0.75, 0.9)
+
+# covidHubUtils test:
+y <- c(1, -15, 22)
+forecast_quantiles_matrix <- rbind(
+ c(-1, 0, 1, 2, 3),
+ c(-2, 1, 2, 2, 4),
+ c(-2, 0, 3, 3, 4)
+)
+forecast_quantile_probs <- c(0.1, 0.25, 0.5, 0.75, 0.9)
+
+
+# ============================================================================ #
+# Input handling ===============================================================
+# ============================================================================ #
+test_that("Input checking for quantile forecasts works", {
+ # everything correct
+ expect_no_condition(
+ scoringutils:::assert_input_quantile(observed, predicted, quantile)
+ )
+
+ # quantile > 1
+ expect_error(
+ scoringutils:::assert_input_quantile(observed, predicted, quantile + 1),
+ "Assertion on 'quantile' failed: Element 1 is not <= 1."
+ )
+
+ # quantile < 0
+ expect_error(
+ scoringutils:::assert_input_quantile(observed, predicted, quantile - 1),
+ "Assertion on 'quantile' failed: Element 1 is not >= 0."
+ )
+
+ # 10 observations, but only 3 forecasts
+ expect_error(
+ scoringutils:::assert_input_quantile(1:10, predicted, quantile),
+ "Assertion on 'predicted' failed: Must have exactly 10 rows, but has 3 rows."
+ )
+
+ # observed value is a factor
+ expect_error(
+ scoringutils:::assert_input_quantile(factor(1:10), predicted, quantile),
+ "Assertion on 'observed' failed: Must be of type 'numeric', not 'factor'."
+ )
+
+ # observed is a single number and does not have the same length as predicted
+  # There seems to be an issue with the error message: there is one \n too many
+ # such that the test fails when executed alone, but works when executed
+ # together with others.
+ expect_error(
+ scoringutils:::assert_input_quantile(1, predicted, quantile),
+ "Assertion failed. One of the following must apply:\n * check_numeric_vector(predicted): Must be of type 'atomic vector',\n * not 'matrix'\n * check_matrix(predicted): Must have exactly 1 rows, but has 3 rows",
+ fixed = TRUE
+ )
+
+ # predicted is a vector
+ expect_error(
+ scoringutils:::assert_input_quantile(observed, as.vector(predicted), quantile),
+ "Assertion on 'predicted' failed: Must be of type 'matrix', not 'double'."
+ )
+})
+
+
+# ============================================================================ #
+# wis ==========================================================================
+# ============================================================================ #
+test_that("wis works standalone, median only", {
y <- c(1, -15, 22)
lower <- upper <- predicted_quantile <- c(1, 2, 3)
quantile_probs <- 0.5
actual <- interval_score(y,
- lower = lower, upper = upper,
- weigh = TRUE,
- interval_range = 0
+ lower = lower, upper = upper,
+ weigh = TRUE,
+ interval_range = 0
)
actual_wis <- wis(
@@ -20,7 +92,7 @@ test_that("wis works, median only", {
expect_identical(actual, expected)
})
-test_that("WIS works within score for median forecast", {
+test_that("`wis()` works within score for median forecast", {
test_data <- data.frame(
observed = c(1, -15, 22),
predicted = 1:3,
@@ -28,13 +100,14 @@ test_that("WIS works within score for median forecast", {
model = "model1",
date = 1:3
)
- eval <- scoringutils::score(test_data,
- count_median_twice = TRUE
+ eval <- score(
+ test_data,
+ count_median_twice = TRUE, metrics = metrics_no_cov
)
- expect_equal(eval$ae_median, eval$interval_score)
+ expect_equal(eval$ae_median, eval$wis)
})
-test_that("wis works, 1 interval only", {
+test_that("`wis()` equals `interval_score()`, 1 interval only", {
y <- c(1, -15, 22)
lower <- c(0, 1, 0)
upper <- c(2, 2, 3)
@@ -61,7 +134,7 @@ test_that("wis works, 1 interval only", {
expect_identical(actual_wis, expected)
})
-test_that("WIS works within score for one interval", {
+test_that("wis() works within score for one interval", {
test_data <- data.frame(
observed = rep(c(1, -15, 22), times = 2),
quantile = rep(c(0.25, 0.75), each = 3),
@@ -70,9 +143,10 @@ test_that("WIS works within score for one interval", {
date = rep(1:3, times = 2)
)
- eval <- suppressMessages(scoringutils::score(test_data,
- count_median_twice = TRUE
- ))
+ eval <- score(
+ test_data,
+ count_median_twice = TRUE, metrics = list(wis = wis)
+ )
eval <- summarise_scores(eval, by = c("model", "date"))
@@ -82,10 +156,10 @@ test_that("WIS works within score for one interval", {
expected <- (upper - lower) * (alpha / 2) + c(0, 1 - (-15), 22 - 3)
- expect_equal(expected, eval$interval_score)
+ expect_equal(expected, eval$wis)
})
-test_that("wis works, 1 interval and median", {
+test_that("`wis()` works 1 interval and median", {
test_data <- data.frame(
observed = rep(c(1, -15, 22), times = 3),
quantile = rep(c(0.25, 0.5, 0.75), each = 3),
@@ -94,8 +168,9 @@ test_that("wis works, 1 interval and median", {
date = rep(1:3, times = 3)
)
- eval <- scoringutils::score(test_data,
- count_median_twice = TRUE
+ eval <- score(
+ test_data,
+ count_median_twice = TRUE, metrics = metrics_no_cov
)
eval <- summarise_scores(eval, by = c("model", "date"))
@@ -117,19 +192,10 @@ test_that("wis works, 1 interval and median", {
count_median_twice = TRUE
)
- expect_identical(eval$interval_score, expected)
+ expect_identical(eval$wis, expected)
expect_identical(actual_wis, expected)
})
-# covidHubUtils test:
-y <- c(1, -15, 22)
-forecast_quantiles_matrix <- rbind(
- c(-1, 0, 1, 2, 3),
- c(-2, 1, 2, 2, 4),
- c(-2, 0, 3, 3, 4)
-)
-forecast_quantile_probs <- c(0.1, 0.25, 0.5, 0.75, 0.9)
-
test_that("wis works, 2 intervals and median", {
test_data <- data.frame(
observed = rep(c(1, -15, 22), times = 5),
@@ -142,8 +208,9 @@ test_that("wis works, 2 intervals and median", {
date = rep(1:3, times = 5)
)
- eval <- scoringutils::score(test_data,
- count_median_twice = TRUE
+ eval <- score(
+ test_data,
+ count_median_twice = TRUE, metrics = metrics_no_cov
)
eval <- summarise_scores(eval, by = c("model", "date"))
@@ -168,7 +235,7 @@ test_that("wis works, 2 intervals and median", {
)
expect_equal(
- as.numeric(eval$interval_score),
+ as.numeric(eval$wis),
as.numeric(expected)
)
expect_identical(actual_wis, expected)
@@ -228,8 +295,9 @@ test_that("wis is correct, median only - test corresponds to covidHubUtils", {
data_formatted <- merge(forecasts_formated, truth_formatted)
- eval <- scoringutils::score(data_formatted,
- count_median_twice = FALSE
+ eval <- score(
+ data_formatted,
+ count_median_twice = FALSE, metrics = metrics_no_cov
)
expected <- abs(y - forecast_quantiles_matrix[, 1])
@@ -241,11 +309,10 @@ test_that("wis is correct, median only - test corresponds to covidHubUtils", {
count_median_twice = FALSE
)
- expect_equal(eval$interval_score, expected)
+ expect_equal(eval$wis, expected)
expect_equal(actual_wis, expected)
})
-
test_that("wis is correct, 1 interval only - test corresponds to covidHubUtils", {
forecast_quantiles_matrix <- forecast_quantiles_matrix[, c(1, 5), drop = FALSE]
forecast_quantile_probs <- forecast_quantile_probs[c(1, 5)]
@@ -299,15 +366,15 @@ test_that("wis is correct, 1 interval only - test corresponds to covidHubUtils",
data_formatted <- merge(forecasts_formated, truth_formatted)
- eval <- suppressMessages(scoringutils::score(data_formatted,
- count_median_twice = FALSE
+ eval <- suppressMessages(score(data_formatted,
+ count_median_twice = FALSE, metrics = metrics_no_cov_no_ae
))
eval <- summarise_scores(eval,
- by = c(
- "model", "location", "target_variable",
- "target_end_date", "forecast_date", "horizon"
- )
+ by = c(
+ "model", "location", "target_variable",
+ "target_end_date", "forecast_date", "horizon"
+ )
)
alpha1 <- 0.2
@@ -321,11 +388,10 @@ test_that("wis is correct, 1 interval only - test corresponds to covidHubUtils",
count_median_twice = FALSE
)
- expect_equal(eval$interval_score, expected)
+ expect_equal(eval$wis, expected)
expect_equal(actual_wis, expected)
})
-
test_that("wis is correct, 2 intervals and median - test corresponds to covidHubUtils", {
target_end_dates <- as.Date("2020-01-01") + c(7, 14, 7)
horizons <- c("1", "2", "1")
@@ -376,15 +442,15 @@ test_that("wis is correct, 2 intervals and median - test corresponds to covidHub
data_formatted <- merge(forecasts_formated, truth_formatted)
- eval <- scoringutils::score(data_formatted,
- count_median_twice = FALSE
+ eval <- score(data_formatted,
+ count_median_twice = FALSE, metrics = metrics_no_cov
)
eval <- summarise_scores(eval,
- by = c(
- "model", "location", "target_variable",
- "target_end_date", "forecast_date", "horizon"
- )
+ by = c(
+ "model", "location", "target_variable",
+ "target_end_date", "forecast_date", "horizon"
+ )
)
alpha1 <- 0.2
@@ -402,7 +468,7 @@ test_that("wis is correct, 2 intervals and median - test corresponds to covidHub
count_median_twice = FALSE
)
- expect_equal(eval$interval_score, expected)
+ expect_equal(eval$wis, expected)
})
test_that("Quantlie score and interval score yield the same result, weigh = FALSE", {
@@ -431,20 +497,24 @@ test_that("Quantlie score and interval score yield the same result, weigh = FALS
)
qs_lower <- quantile_score(observed,
- predicted = lower,
- quantile = alpha / 2,
- weigh = w
+ predicted = lower,
+ quantile = alpha / 2,
+ weigh = w
)
qs_upper <- quantile_score(observed,
- predicted = upper,
- quantile = 1 - alpha / 2,
- weigh = w
+ predicted = upper,
+ quantile = 1 - alpha / 2,
+ weigh = w
)
expect_equal((qs_lower + qs_upper) / 2, is)
expect_equal(wis, is)
}
})
+
+# ============================================================================ #
+# Quantile score ============================================================= #
+# ============================================================================ #
test_that("Quantlie score and interval score yield the same result, weigh = TRUE", {
observed <- rnorm(10, mean = 1:10)
alphas <- c(0.1, 0.5, 0.9)
@@ -471,14 +541,14 @@ test_that("Quantlie score and interval score yield the same result, weigh = TRUE
)
qs_lower <- quantile_score(observed,
- predicted = lower,
- quantile = alpha / 2,
- weigh = w
+ predicted = lower,
+ quantile = alpha / 2,
+ weigh = w
)
qs_upper <- quantile_score(observed,
- predicted = upper,
- quantile = 1 - alpha / 2,
- weigh = w
+ predicted = upper,
+ quantile = 1 - alpha / 2,
+ weigh = w
)
expect_equal((qs_lower + qs_upper) / 2, is)
expect_equal(wis, is)
@@ -495,3 +565,229 @@ test_that("wis works with separate results", {
expect_equal(wis$wis, wis$dispersion + wis$overprediction + wis$underprediction)
})
+
+# ============================================================================ #
+# overprediction, underprediction, dispersion ================================ #
+# ============================================================================ #
+test_that("wis is the sum of overprediction, underprediction, dispersion", {
+ wis <- wis(
+ observed = y,
+ predicted = forecast_quantiles_matrix,
+ quantile = forecast_quantile_probs
+ )
+
+ d <- dispersion(y, forecast_quantiles_matrix, forecast_quantile_probs)
+ o <- overprediction(y, forecast_quantiles_matrix, forecast_quantile_probs)
+ u <- underprediction(y, forecast_quantiles_matrix, forecast_quantile_probs)
+
+ expect_equal(wis, d + o + u)
+})
+
+
+# ============================================================================ #
+# `interval_coverage_quantile` =============================================== #
+# ============================================================================ #
+test_that("interval_coverage_quantile works", {
+ expect_equal(
+ interval_coverage_quantile(observed, predicted, quantile, range = 50),
+ c(TRUE, FALSE, FALSE)
+ )
+})
+
+test_that("interval_coverage_quantile rejects wrong inputs", {
+ expect_error(
+ interval_coverage_quantile(observed, predicted, quantile, range = c(50, 0)),
+ "Assertion on 'range' failed: Must have length 1."
+ )
+})
+
+
+# ============================================================================ #
+# `interval_coverage_deviation_quantile` ===================================== #
+# ============================================================================ #
+test_that("interval_coverage_deviation_quantile works", {
+ existing_ranges <- unique(get_range_from_quantile(quantile))
+ expect_equal(existing_ranges, c(80, 50, 0))
+
+ cov_50 <- interval_coverage_quantile(observed, predicted, quantile, range = c(50))
+ cov_80 <- interval_coverage_quantile(observed, predicted, quantile, range = c(80))
+ manual <- 0.5 * (cov_50 - 0.5) + 0.5 * (cov_80 - 0.8)
+
+ expect_equal(
+ interval_coverage_deviation_quantile(observed, predicted, quantile),
+ manual
+ )
+})
+
+
+# ============================================================================ #
+# `bias_quantile` ============================================================ #
+# ============================================================================ #
+test_that("bias_quantile() works as expected", {
+ predicted <- c(1, 2, 3)
+ quantiles <- c(0.1, 0.5, 0.9)
+ expect_equal(
+ bias_quantile(observed = 2, predicted, quantiles),
+ 0
+ )
+ predicted <- c(0, 1, 2)
+ quantiles <- c(0.1, 0.5, 0.9)
+ expect_equal(
+ bias_quantile(observed = 2, predicted, quantiles),
+ -0.8
+ )
+
+ predicted <- c(
+ 705.500, 1127.000, 4006.250, 4341.500, 4709.000, 4821.996,
+ 5340.500, 5451.000, 5703.500, 6087.014, 6329.500, 6341.000,
+ 6352.500, 6594.986, 6978.500, 7231.000, 7341.500, 7860.004,
+ 7973.000, 8340.500, 8675.750, 11555.000, 11976.500
+ )
+
+ quantile <- c(0.01, 0.025, seq(0.05, 0.95, 0.05), 0.975, 0.99)
+
+ observed <- 8062
+ expect_equal(bias_quantile(observed, predicted, quantile), -0.8)
+})
+
+test_that("bias_quantile handles matrix input", {
+ observed <- seq(10, 0, length.out = 4)
+ predicted <- matrix(1:12, ncol = 3)
+ quantiles <- c(0.1, 0.5, 0.9)
+ expect_equal(
+ bias_quantile(observed, predicted, quantiles),
+ c(-1.0, -0.8, 0.8, 1.0)
+ )
+})
+
+
+test_that("bias_quantile() handles vector that is too long", {
+ predicted <- c(NA, 1, 2, 3)
+ quantiles <- c(0.1, 0.5, 0.9)
+
+ expect_error(
+ bias_quantile(observed = 2, predicted, quantiles),
+ "Assertion on 'quantile' failed: Must have length 4, but has length 3."
+ )
+})
+
+test_that("bias_quantile() handles NA values", {
+ predicted <- c(NA, 1, 2)
+ quantiles <- c(0.1, 0.5, 0.9)
+ expect_equal(
+ bias_quantile(observed = 2, predicted, quantiles),
+ -0.8
+ )
+ predicted <- c(0, 1, 2)
+ quantiles <- c(0.1, 0.5, NA)
+ expect_equal(
+ bias_quantile(observed = 2, predicted, quantiles),
+ -1
+ )
+ expect_equal(
+ bias_quantile(observed = 2, predicted, quantiles, na.rm = FALSE),
+ NA_real_
+ )
+})
+
+test_that("bias_quantile() errors if no predictions", {
+ expect_error(
+ bias_quantile(observed = 2, numeric(0), numeric(0)),
+ "Assertion on 'quantile' failed: Must have length >= 1, but has length 0"
+ )
+})
+
+test_that("bias_quantile() returns correct bias if value below the median", {
+ predicted <- c(1, 2, 4, 5)
+ quantiles <- c(0.1, 0.3, 0.7, 0.9)
+ suppressMessages(
+ expect_equal(bias_quantile(observed = 1, predicted, quantiles), 0.8)
+ )
+})
+
+test_that("bias_quantile() returns correct bias if value above median", {
+ predicted <- c(1, 2, 4, 5)
+ quantiles <- c(0.1, 0.3, 0.7, 0.9)
+ suppressMessages(
+ expect_equal(bias_quantile(observed = 5, predicted, quantiles), -0.8)
+ )
+})
+
+test_that("bias_quantile() returns correct bias if value at the median", {
+ predicted <- c(1, 2, 3, 4)
+ quantiles <- c(0.1, 0.3, 0.5, 0.7)
+
+ expect_equal(bias_quantile(observed = 3, predicted, quantiles), 0)
+})
+
+test_that("bias_quantile() returns 1 if true value below min prediction", {
+ predicted <- c(2, 3, 4, 5)
+ quantiles <- c(0.1, 0.3, 0.7, 0.9)
+
+ suppressMessages(
+ expect_equal(bias_quantile(observed = 1, predicted, quantiles), 1)
+ )
+})
+
+test_that("bias_quantile() returns -1 if true value above max prediction", {
+ predicted <- c(1, 2, 3, 4)
+ quantiles <- c(0.1, 0.3, 0.5, 0.7)
+
+ expect_equal(bias_quantile(observed = 6, predicted, quantiles), -1)
+})
+
+test_that("bias_quantile(): quantiles must be between 0 and 1", {
+ predicted <- 1:4
+
+ # Failing example
+ quantiles <- c(-0.1, 0.3, 0.5, 0.8)
+ expect_error(
+ bias_quantile(observed = 3, predicted, quantiles),
+ "Assertion on 'quantile' failed: Element 1 is not >= 0."
+ )
+
+ # Passing counter example
+ quantiles <- c(0.1, 0.3, 0.5, 0.8)
+ expect_silent(bias_quantile(observed = 3, predicted, quantiles))
+})
+
+test_that("bias_quantile(): quantiles must be increasing", {
+ predicted <- 1:4
+
+ # Failing example
+ quantiles <- c(0.8, 0.3, 0.5, 0.9)
+ expect_error(
+ bias_quantile(observed = 3, predicted, quantiles),
+ "Predictions must not be decreasing with increasing quantile level"
+ )
+
+ # Passing counter example
+ quantiles <- c(0.3, 0.5, 0.8, 0.9)
+ expect_silent(bias_quantile(observed = 3, predicted, quantiles))
+})
+
+test_that("bias_quantile(): predictions must be increasing", {
+ predicted <- c(1, 2, 4, 3)
+ quantiles <- c(0.1, 0.3, 0.5, 0.9)
+
+ expect_error(
+ bias_quantile(observed = 3, predicted, quantiles),
+ "Predictions must not be decreasing with increasing quantile level"
+ )
+  expect_silent(bias_quantile(observed = 3, 1:4, quantiles))
+})
+
+test_that("bias_quantile(): quantiles must be unique", {
+ predicted <- 1:4
+
+ # Failing example
+ quantiles <- c(0.3, 0.3, 0.5, 0.8)
+ expect_error(
+ bias_quantile(observed = 3, predicted, quantiles),
+ "Assertion on 'quantile' failed: Contains duplicated values, position 2."
+ )
+
+ # Passing example
+ quantiles <- c(0.3, 0.5, 0.8, 0.9)
+ expect_silent(bias_quantile(observed = 3, predicted, quantiles))
+})
diff --git a/tests/testthat/test-metrics-range.R b/tests/testthat/test-metrics-range.R
new file mode 100644
index 000000000..bd0290e9f
--- /dev/null
+++ b/tests/testthat/test-metrics-range.R
@@ -0,0 +1,45 @@
+test_that("bias_quantile() and bias_range() give the same result", {
+ predicted <- sort(rnorm(23))
+ lower <- rev(predicted[1:12])
+ upper <- predicted[12:23]
+
+ range <- c(0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 95, 98)
+ quantiles <- c(0.01, 0.025, seq(0.05, 0.95, 0.05), 0.975, 0.99)
+ observed <- rnorm(1)
+
+ range_bias <- bias_range(
+ lower = lower, upper = upper,
+ range = range, observed = observed
+ )
+ range_quantile <- bias_quantile(
+ observed = observed,
+ predicted = predicted,
+ quantile = quantiles
+ )
+ expect_equal(range_bias, range_quantile)
+})
+
+test_that("bias_range() works with point forecasts", {
+ predicted <- 1
+ observed <- 1
+ range <- c(0)
+
+ expect_equal(bias_range(predicted, predicted, range, observed), 0)
+})
+
+test_that("bias_range(): ranges must be between 0 and 100", {
+ lower <- 4:1
+ upper <- 5:8
+
+ # Failing example
+ range <- c(-10, 0, 10, 20)
+ expect_error(
+ bias_range(lower, upper, range, observed = 3),
+ "range must be between 0 and 100"
+ )
+
+ # Passing counter example
+ range <- c(0, 10, 20, 30)
+ expect_silent(bias_range(lower, upper, range, observed = 3))
+})
+
diff --git a/tests/testthat/test-metrics-sample.R b/tests/testthat/test-metrics-sample.R
new file mode 100644
index 000000000..ded7b52ae
--- /dev/null
+++ b/tests/testthat/test-metrics-sample.R
@@ -0,0 +1,146 @@
+test_that("Input handling", {
+ observed <- rpois(30, lambda = 1:30)
+ predicted <- replicate(20, rpois(n = 30, lambda = 1:30))
+ expect_equal(length(crps_sample(observed, predicted)), 30)
+
+ # should error when wrong prediction type is given
+ predicted2 <- rpois(30, lambda = 1)
+ expect_error(crps_sample(observed, predicted2),
+ "Assertion on 'predicted' failed: Must be of type 'matrix', not 'integer'",
+ fixed = TRUE
+ )
+
+ # predictions have wrong number of rows
+ predicted3 <- replicate(20, rpois(n = 31, lambda = 1))
+ expect_error(
+ crps_sample(observed, predicted3),
+ "Assertion on 'predicted' failed: Must have exactly 30 rows, but has 31 rows.",
+ # "Mismatch: 'observed' has length `30`, but 'predicted' has `31` rows.",
+ fixed = TRUE
+ )
+
+ # error with missing argument
+ expect_error(crps_sample(predicted = predicted),
+ 'argument "observed" is missing, with no default',
+ fixed = TRUE
+ )
+})
+
+
+
+test_that("bias_sample() throws an error when missing observed", {
+ observed <- rpois(10, lambda = 1:10)
+ predicted <- replicate(50, rpois(n = 10, lambda = 1:10))
+
+ expect_error(
+ bias_sample(predicted = predicted),
+ 'argument "observed" is missing, with no default'
+ )
+})
+
+test_that("bias_sample() throws an error when missing 'predicted'", {
+ observed <- rpois(10, lambda = 1:10)
+ predicted <- replicate(50, rpois(n = 10, lambda = 1:10))
+
+ expect_error(
+ bias_sample(observed = observed),
+ 'argument "predicted" is missing, with no default'
+ )
+})
+
+test_that("bias_sample() works for integer observed and predicted", {
+ observed <- rpois(10, lambda = 1:10)
+ predicted <- replicate(10, rpois(10, lambda = 1:10))
+ output <- bias_sample(
+ observed = observed,
+ predicted = predicted
+ )
+ expect_equal(
+ length(output),
+ length(observed)
+ )
+ expect_equal(
+ class(output),
+ "numeric"
+ )
+})
+
+test_that("bias_sample() works for continuous observed values and predicted", {
+ observed <- rnorm(10)
+ predicted <- replicate(10, rnorm(10))
+ output <- bias_sample(
+ observed = observed,
+ predicted = predicted
+ )
+ expect_equal(
+ length(output),
+ length(observed)
+ )
+ expect_equal(
+ class(output),
+ "numeric"
+ )
+})
+
+test_that("bias_sample() works as expected", {
+ observed <- rpois(30, lambda = 1:30)
+ predicted <- replicate(200, rpois(n = 30, lambda = 1:30))
+ expect_true(all(bias_sample(observed, predicted) == bias_sample(observed, predicted)))
+
+ ## continuous forecasts
+ observed <- rnorm(30, mean = 1:30)
+ predicted <- replicate(200, rnorm(30, mean = 1:30))
+
+ scoringutils2 <- bias_sample(observed, predicted)
+ scoringutils <- bias_sample(observed, predicted)
+
+ expect_equal(scoringutils, scoringutils2)
+})
+
+
+test_that("bias_sample() approx equals bias_quantile() for many samples", {
+ set.seed(123)
+
+ # Generate true value
+ observed <- 3
+
+ # Generate many sample predictions
+ predicted <- sample(rnorm(1000, mean = observed, sd = 2), 1000)
+
+ # Get sample based bias
+ bias_sample_result <- bias_sample(
+ observed, matrix(predicted, nrow = 1)
+ )
+
+ # Convert predictions to quantiles
+ quantiles <- seq(0, 1, length.out = 100)
+ quantile_preds <- quantile(predicted, probs = quantiles)
+
+ # Get quantile based bias
+ bias_quantile_result <- suppressMessages(
+ bias_quantile(observed, quantile_preds, quantiles)
+ )
+
+ # Difference should be small
+ expect_equal(bias_quantile_result, bias_sample_result, tolerance = 0.1)
+})
+
+
+# `ae_median_sample` ===========================================================
+test_that("ae_median_sample works", {
+ observed <- rnorm(30, mean = 1:30)
+ predicted_values <- rnorm(30, mean = 1:30)
+ scoringutils <- ae_median_sample(observed, matrix(predicted_values))
+ ae <- abs(observed - predicted_values)
+ expect_equal(ae, scoringutils)
+})
+
+# `mad_sample()` ===============================================================
+test_that("function throws an error when missing 'predicted'", {
+ predicted <- replicate(50, rpois(n = 10, lambda = 1:10))
+
+ expect_error(
+ mad_sample()
+ )
+})
+
diff --git a/tests/testthat/test-pairwise_comparison.R b/tests/testthat/test-pairwise_comparison.R
index 3d0120adb..74275a06a 100644
--- a/tests/testthat/test-pairwise_comparison.R
+++ b/tests/testthat/test-pairwise_comparison.R
@@ -53,7 +53,7 @@ test_that("pairwise_comparison() works", {
)
# evaluate the toy forecasts, once with and once without a baseline model specified
- eval <- suppressMessages(score(data_formatted))
+ eval <- score(data_formatted)
# check with relative skills
eval_without_rel_skill <- summarise_scores(
@@ -85,7 +85,7 @@ test_that("pairwise_comparison() works", {
# prepare scores for the code Johannes Bracher wrote
scores_johannes <- data.table::copy(eval_without_baseline) # doesn't matter which one
data.table::setnames(scores_johannes,
- old = c("location", "target_end_date", "interval_score"),
+ old = c("location", "target_end_date", "wis"),
new = c("unit", "timezero", "wis")
)
@@ -238,7 +238,7 @@ test_that("pairwise_comparison() works", {
model = rep(c("model1", "model2", "model3"), each = 10),
date = as.Date("2020-01-01") + rep(1:5, each = 2),
location = c(1, 2),
- interval_score = (abs(rnorm(30))),
+ wis = (abs(rnorm(30))),
ae_median = (abs(rnorm(30)))
)
diff --git a/tests/testthat/test-plot_heatmap.R b/tests/testthat/test-plot_heatmap.R
index 9118ff21f..d6453bd45 100644
--- a/tests/testthat/test-plot_heatmap.R
+++ b/tests/testthat/test-plot_heatmap.R
@@ -1,9 +1,7 @@
library(ggplot2, quietly = TRUE)
test_that("plot_heatmap() works as expected", {
- scores <- suppressMessages(
- summarise_scores(scores_quantile, by = c("model", "target_type", "range"))
- )
+ scores <- summarise_scores(scores_quantile, by = c("model", "target_type"))
p <- plot_heatmap(scores, x = "target_type", metric = "bias")
expect_s3_class(p, "ggplot")
skip_on_cran()
diff --git a/tests/testthat/test-plot_interval_coverage.R b/tests/testthat/test-plot_interval_coverage.R
index 04f203b03..0e885219f 100644
--- a/tests/testthat/test-plot_interval_coverage.R
+++ b/tests/testthat/test-plot_interval_coverage.R
@@ -1,11 +1,8 @@
-library(ggplot2, quietly = TRUE)
-
test_that("plot_interval_coverage() works as expected", {
- scores <- suppressMessages(
- summarise_scores(scores_quantile, by = c("model", "range"))
- )
- p <- plot_interval_coverage(scores)
+ coverage <- add_coverage(example_quantile) |>
+ summarise_scores(by = c("model", "range"))
+ p <- plot_interval_coverage(coverage)
expect_s3_class(p, "ggplot")
skip_on_cran()
- vdiffr::expect_doppelganger("plot_interval_coverage", p)
+ suppressWarnings(vdiffr::expect_doppelganger("plot_interval_coverage", p))
})
diff --git a/tests/testthat/test-plot_quantile_coverage.R b/tests/testthat/test-plot_quantile_coverage.R
index 6c3593c04..060b9be26 100644
--- a/tests/testthat/test-plot_quantile_coverage.R
+++ b/tests/testthat/test-plot_quantile_coverage.R
@@ -1,9 +1,9 @@
test_that("plot_quantile_coverage() works as expected", {
- scores <- suppressMessages(
- summarise_scores(scores_quantile, by = c("model", "quantile"))
- )
- p <- plot_quantile_coverage(scores)
+ coverage <- add_coverage(example_quantile) |>
+ summarise_scores(by = c("model", "quantile"))
+
+ p <- plot_quantile_coverage(coverage)
expect_s3_class(p, "ggplot")
skip_on_cran()
- vdiffr::expect_doppelganger("plot_quantile_coverage", p)
+ suppressWarnings(vdiffr::expect_doppelganger("plot_quantile_coverage", p))
})
diff --git a/tests/testthat/test-plot_ranges.R b/tests/testthat/test-plot_ranges.R
index e9ae5575b..b4dec124e 100644
--- a/tests/testthat/test-plot_ranges.R
+++ b/tests/testthat/test-plot_ranges.R
@@ -1,13 +1,18 @@
-sum_scores <- suppressMessages(
- summarise_scores(scores_quantile, by = c("model", "target_type", "range"))
-)
+m <- modifyList(metrics_no_cov_no_ae, list("bias" = NULL))
+
+sum_scores <- copy(example_quantile) %>%
+ .[, interval_range := scoringutils:::get_range_from_quantile(quantile)] |>
+ score(metrics = m) |>
+ summarise_scores(by = c("model", "target_type", "interval_range"))
+
+sum_scores[, range := interval_range]
test_that("plot_ranges() works as expected with interval score", {
p <- plot_ranges(sum_scores, x = "model") +
- facet_wrap(~target_type, scales = "free")
+ facet_wrap(~target_type, scales = "free")
expect_s3_class(p, "ggplot")
skip_on_cran()
- vdiffr::expect_doppelganger("plot_ranges_interval", p)
+ suppressWarnings(vdiffr::expect_doppelganger("plot_ranges_interval", p))
})
test_that("plot_ranges() works as expected with dispersion", {
@@ -15,5 +20,5 @@ test_that("plot_ranges() works as expected with dispersion", {
facet_wrap(~target_type)
expect_s3_class(p, "ggplot")
skip_on_cran()
- vdiffr::expect_doppelganger("plot_ranges_dispersion", p)
+ suppressWarnings(vdiffr::expect_doppelganger("plot_ranges_dispersion", p))
})
diff --git a/tests/testthat/test-plot_score_table.R b/tests/testthat/test-plot_score_table.R
index 8336de7a9..5ffc0b029 100644
--- a/tests/testthat/test-plot_score_table.R
+++ b/tests/testthat/test-plot_score_table.R
@@ -1,7 +1,6 @@
test_that("plot_score_table() works as expected", {
p <- suppressMessages(
scores_quantile %>%
- add_coverage(by = c("model")) %>%
summarise_scores(by = c("model")) %>%
summarise_scores(by = c("model"), fun = signif, digits = 1) %>%
plot_score_table()
diff --git a/tests/testthat/test-score.R b/tests/testthat/test-score.R
index 9b4040723..a67575984 100644
--- a/tests/testthat/test-score.R
+++ b/tests/testthat/test-score.R
@@ -168,17 +168,13 @@ test_that("score.scoringutils_point() errors with only NA values", {
test_that("score_quantile correctly handles separate results = FALSE", {
df <- example_quantile[model == "EuroCOVIDhub-ensemble" &
target_type == "Cases" & location == "DE"]
- eval <- suppressMessages(
- score(
- df[!is.na(predicted)],
- separate_results = FALSE
- )
- )
+ eval <- score(df[!is.na(predicted)], separate_results = FALSE)
expect_equal(
nrow(eval) > 1,
TRUE
)
+ expect_true(all(names(metrics_quantile) %in% colnames(eval)))
})
@@ -191,10 +187,12 @@ test_that("score() quantile produces desired metrics", {
quantile = rep(c(0.1, 0.9), times = 10)
)
- out <- suppressMessages(score(data = data))
+ out <- suppressWarnings(suppressMessages(
+ score(data = data, metrics = metrics_no_cov))
+ )
metric_names <- c(
"dispersion", "underprediction", "overprediction",
- "bias", "ae_median", "coverage_deviation"
+ "bias", "ae_median"
)
expect_true(all(metric_names %in% colnames(out)))
@@ -227,27 +225,16 @@ test_that("all quantile and range formats yield the same result", {
expect_equal(sort(eval1$ae_median), sort(ae))
})
-test_that("function produces output even if only some metrics are chosen", {
- example <- scoringutils::example_quantile
-
- eval <- suppressMessages(score(example, metrics = "coverage"))
-
- expect_equal(
- nrow(eval) > 1,
- TRUE
- )
-})
-
test_that("WIS is the same with other metrics omitted or included", {
eval <- suppressMessages(score(example_quantile,
- metrics = "interval_score"
+ metrics = list("wis" = wis)
))
eval2 <- scores_quantile
expect_equal(
- sum(eval$interval_score),
- sum(eval2$interval_score)
+ sum(eval$wis),
+ sum(eval2$wis)
)
})
diff --git a/tests/testthat/test-sharpness.R b/tests/testthat/test-sharpness.R
deleted file mode 100644
index 12dcc4c9f..000000000
--- a/tests/testthat/test-sharpness.R
+++ /dev/null
@@ -1,7 +0,0 @@
-test_that("function throws an error when missing 'predicted'", {
- predicted <- replicate(50, rpois(n = 10, lambda = 1:10))
-
- expect_error(
- mad_sample()
- )
-})
diff --git a/tests/testthat/test-summarise_scores.R b/tests/testthat/test-summarise_scores.R
index dc3de70e3..dbd2cde4b 100644
--- a/tests/testthat/test-summarise_scores.R
+++ b/tests/testthat/test-summarise_scores.R
@@ -1,6 +1,4 @@
test_that("summarise_scores() works without any arguments", {
- expect_true("quantile" %in% names(scores_quantile))
-
summarised_scores <- summarise_scores(scores_quantile)
expect_false("quantile" %in% names(summarised_scores))
diff --git a/tests/testthat/test-utils_data_handling.R b/tests/testthat/test-utils_data_handling.R
index e627485ab..4521c1f73 100644
--- a/tests/testthat/test-utils_data_handling.R
+++ b/tests/testthat/test-utils_data_handling.R
@@ -22,7 +22,7 @@ test_that("range_long_to_quantile works", {
-test_that("quantile_to_interval works", {
+test_that("quantile_to_interval.data.frame() works", {
quantile <- data.frame(
date = as.Date("2020-01-01") + 1:10,
model = "model1",
@@ -30,7 +30,6 @@ test_that("quantile_to_interval works", {
predicted = c(2:11, 4:13),
quantile = rep(c(0.25, 0.75), each = 10)
)
-
long <- data.frame(
date = as.Date("2020-01-01") + 1:10,
model = "model1",
@@ -39,19 +38,38 @@ test_that("quantile_to_interval works", {
range = 50,
boundary = rep(c("lower", "upper"), each = 10)
)
-
long2 <- as.data.frame(quantile_to_interval(
quantile,
keep_quantile_col = FALSE
))
-
data.table::setcolorder(long2, names(long))
-
# for some reason this is needed to pass the unit tests on gh actions
long2$boundary <- as.character(long2$boundary)
long$boundary <- as.character(long$boundary)
-
expect_equal(long, as.data.frame(long2))
+
+ # check that it handles NA values
+ setDT(quantile)
+ quantile[c(1, 3, 11, 13), c("observed", "predicted", "quantile") := NA]
+ # in this instance, a problem appears because there is an NA value both
+ # for the upper and lower bound.
+ expect_message(
+ quantile_to_interval(
+ quantile,
+ keep_quantile_col = FALSE,
+ format = "wide"
+ ),
+ "Aggregate function missing, defaulting to 'length'"
+ )
+ quantile <- quantile[-c(1, 3), ]
+ wide2 <- scoringutils:::quantile_to_interval(
+ quantile,
+ keep_quantile_col = FALSE,
+ format = "wide"
+ )
+ expect_equal(nrow(wide2), 10)
+  expect_false("NA" %in% colnames(wide2))
+ expect_equal(sum(wide2$lower, na.rm = TRUE), 59)
})
diff --git a/vignettes/scoringutils.Rmd b/vignettes/scoringutils.Rmd
index 6b55c25b5..4f6989834 100644
--- a/vignettes/scoringutils.Rmd
+++ b/vignettes/scoringutils.Rmd
@@ -223,7 +223,6 @@ For quantile-based forecasts we are often interested in specific coverage-levels
```{r}
score(example_quantile) %>%
- add_coverage(ranges = c(50, 90), by = c("model", "target_type")) %>%
summarise_scores(by = c("model", "target_type")) %>%
summarise_scores(fun = signif, digits = 2)
```
@@ -304,20 +303,20 @@ example_quantile[quantile %in% seq(0.1, 0.9, 0.1), ] %>%
facet_grid(model ~ target_type)
```
-Another way to look at calibration are interval coverage and quantile coverage. Interval coverage is the percentage of true values that fall inside a given central prediction interval. Quantile coverage is the percentage of observed values that fall below a given quantile level.
+
-In order to plot interval coverage, you need to include "range" in the `by` argument to `summarise_scores()`. The green area on the plot marks conservative behaviour, i.e. your empirical coverage is greater than it nominally need be (e.g. 55% of true values covered by all 50% central prediction intervals.)
+
-```{r}
+```{r, eval=FALSE, include=FALSE}
example_quantile %>%
score() %>%
summarise_scores(by = c("model", "range")) %>%
plot_interval_coverage()
```
-To visualise quantile coverage, you need to include "quantile" in `by`. Again, the green area corresponds to conservative forecasts, where central prediction intervals would cover more than needed.
+
-```{r}
+```{r, eval=FALSE, include=FALSE}
example_quantile %>%
score() %>%
summarise_scores(by = c("model", "quantile")) %>%
@@ -377,11 +376,11 @@ example_quantile %>%
plot_correlation()
```
-### Scores by interval ranges
+
-If you would like to see how different forecast interval ranges contribute to average scores, you can visualise scores by interval range:
+
-```{r}
+```{r, eval = FALSE, include = FALSE}
example_quantile %>%
score() %>%
summarise_scores(by = c("model", "range", "target_type")) %>%
@@ -400,8 +399,7 @@ example_integer %>%
sample_to_quantile(
quantiles = c(0.01, 0.025, seq(0.05, 0.95, 0.05), 0.975, 0.99)
) %>%
- score() %>%
- add_coverage(by = c("model", "target_type"))
+ score()
```
## Available metrics