epiforecasts · nikosbosse · Nov 15, 2023 · Nov 10, 2023 · Nov 10, 2023 · Nov 10, 2023
diff --git a/NAMESPACE b/NAMESPACE
@@ -8,7 +8,6 @@ S3method(score,default)
 S3method(score,scoringutils_binary)
 S3method(score,scoringutils_point)
 S3method(score,scoringutils_quantile)
-S3method(score,scoringutils_quantile_new)
 S3method(score,scoringutils_sample)
 S3method(validate,default)
 S3method(validate,scoringutils_binary)
@@ -32,6 +31,7 @@ export(crps_sample)
 export(dispersion)
 export(dss_sample)
 export(get_duplicate_forecasts)
+export(interval_coverage_deviation_quantile)
 export(interval_coverage_quantile)
 export(interval_coverage_sample)
 export(interval_score)
@@ -82,6 +82,7 @@ importFrom(checkmate,assert_data_frame)
 importFrom(checkmate,assert_data_table)
 importFrom(checkmate,assert_factor)
 importFrom(checkmate,assert_list)
+importFrom(checkmate,assert_logical)
 importFrom(checkmate,assert_number)
 importFrom(checkmate,assert_numeric)
 importFrom(checkmate,check_atomic_vector)
@@ -108,6 +109,7 @@ importFrom(data.table,nafill)
 importFrom(data.table,rbindlist)
 importFrom(data.table,setDT)
 importFrom(data.table,setattr)
+importFrom(data.table,setcolorder)
 importFrom(data.table,setnames)
 importFrom(ggdist,geom_lineribbon)
 importFrom(ggplot2,.data)

diff --git a/NEWS.md b/NEWS.md
@@ -22,6 +22,7 @@ The update introduces a lot of breaking changes. If you want to keep using the o
     - `quantile`: numeric, a vector with quantile-levels. Can alternatively be a matrix of the same shape as `predicted`.
 - `check_forecasts()` was replaced by a new function `validate()`. `validate()` validates the input and in that sense fulfills the purpose of `check_forecasts()`. It has different methods: `validate.default()` assigns the input a class based on their forecast type. Other methods validate the input specifically for the various forecast types.
 - The functionality for computing pairwise comparisons was now split from `summarise_scores()`. Instead of doing pairwise comparisons as part of summarising scores, a new function, `add_pairwise_comparison()`, was introduced that takes summarised scores as an input and adds pairwise comparisons to it. 
+- `add_coverage()` was reworked completely. Its new purpose is now to add coverage information to the raw forecast data (essentially fulfilling some of the functionality that was previously covered by `score_quantile()`).
 - The function `find_duplicates()` was renamed to `get_duplicate_forecasts()`
 - Changes to `avail_forecasts()` and `plot_avail_forecasts()`:
   - The function `avail_forecasts()` was renamed to `available_forecasts()` for consistency with `available_metrics()`. The old function, `avail_forecasts()` is still available as an alias, but will be removed in the future.

diff --git a/R/add_coverage.R b/R/add_coverage.R
@@ -0,0 +1,83 @@
+#' @title Add Coverage Values to Quantile-Based Forecasts
+#'
+#' @description Adds interval coverage of central prediction intervals,
+#' quantile coverage for predictive quantiles, as well as the deviation between
+#' desired and actual coverage to a data.table. Forecasts should be in a
+#' quantile format (following the input requirements of `score()`).
+#'
+#' **Interval coverage**
+#'
+#' Coverage for a given interval range is defined as the proportion of
+#' observations that fall within the corresponding central prediction intervals.
+#' Central prediction intervals are symmetric around the median and are formed
+#' by two quantiles that denote the lower and upper bound. For example, the 50%
+#' central prediction interval is the interval between the 0.25 and 0.75
+#' quantiles of the predictive distribution.
+#'
+#' The function `add_coverage()` computes the coverage per central prediction
+#' interval, so the coverage will always be either `TRUE` (observed value falls
+#' within the interval) or `FALSE` (observed value falls outside the interval).
+#' You can summarise the coverage values to get the proportion of observations
+#' that fall within the central prediction intervals.
+#'
+#' **Quantile coverage**
+#'
+#' Quantile coverage for a given quantile is defined as the proportion of
+#' observed values that are smaller than or equal to the corresponding
+#' predictive quantile. For example, the 0.5 quantile coverage is the proportion
+#' of observed values at or below the predictive 0.5 quantile.
+#'
+#' **Coverage deviation**
+#'
+#' The coverage deviation is the difference between the actual coverage and the
+#' desired coverage (actual minus desired). For example, if the desired coverage
+#' is 90% and the actual coverage is 80%, the coverage deviation is -0.1.
+#'
+#' @inheritParams score
+#' @return a data.table with the input and columns "range",
+#' "interval_coverage", "interval_coverage_deviation", "quantile_coverage",
+#' "quantile_coverage_deviation" added.
+#' @importFrom data.table setcolorder
+#' @examples
+#' library(magrittr) # pipe operator
+#' example_quantile %>%
+#'   add_coverage()
+#' @export
+#' @keywords scoring
+add_coverage <- function(data) {
+  stored_attributes <- get_scoringutils_attributes(data)
+  data <- validate(data)
+  forecast_unit <- get_forecast_unit(data)
+  data_cols <- colnames(data) # store so we can reset column order later
+
+  # what happens if quantiles are not symmetric around the median?
+  # should things error? Also write tests for that.
+  interval_data <- quantile_to_interval(data, format = "wide")
+  # the comparison is already logical, so no `ifelse()` wrapper is needed
+  interval_data[,
+    interval_coverage := observed <= upper & observed >= lower
+  ][, c("lower", "upper", "observed") := NULL]
+
+  # add the range of each quantile, it is used as a merge key below
+  data[, range := get_range_from_quantile(quantile)]
+
+  data <- merge(interval_data, data, by = unique(c(forecast_unit, "range")))
+  # deviation = actual - nominal coverage; `range / 100` rescales the range
+  data[, interval_coverage_deviation := interval_coverage - range / 100]
+  data[, quantile_coverage := observed <= predicted]
+  data[, quantile_coverage_deviation := quantile_coverage - quantile]
+
+  # reset column order
+  new_metrics <- c("interval_coverage", "interval_coverage_deviation",
+                   "quantile_coverage", "quantile_coverage_deviation")
+  setcolorder(data, unique(c(data_cols, "range", new_metrics)))
+
+  # add coverage "metrics" to list of stored metrics
+  # this makes it possible to use `summarise_scores()` later on
+  stored_attributes[["metric_names"]] <- c(
+    stored_attributes[["metric_names"]],
+    new_metrics
+  )
+  data <- assign_attributes(data, stored_attributes)
+  return(data[])
+}
diff --git a/R/check-input-helpers.R b/R/check-input-helpers.R
@@ -297,12 +297,22 @@ check_columns_present <- function(data, columns) {
   }
   assert_character(columns, min.len = 1)
   colnames <- colnames(data)
+  missing <- list()
   for (x in columns){
     if (!(x %in% colnames)) {
-      msg <- paste0("Column '", x, "' not found in data")
-      return(msg)
+      missing[[x]] <- x
     }
   }
+  missing <- unlist(missing)
+  if (length(missing) > 1) {
+    msg <- paste0(
+      "Columns '", paste(missing, collapse = "', '"), "' not found in data"
+    )
+    return(msg)
+  } else if (length(missing) == 1) {
+    msg <- paste0("Column '", missing, "' not found in data")
+    return(msg)
+  }
   return(TRUE)
 }
 

diff --git a/R/convenience-functions.R b/R/convenience-functions.R
@@ -235,21 +235,13 @@ log_shift <- function(x, offset = 0, base = exp(1)) {
 #'   example_quantile,
 #'   c("location", "target_end_date", "target_type", "horizon", "model")
 #' )
-
 set_forecast_unit <- function(data, forecast_unit) {
-
-  datacols <- colnames(data)
-  missing <- forecast_unit[!(forecast_unit %in% datacols)]
-
-  if (length(missing) > 0) {
-    warning(
-      "Column(s) '",
-      missing,
-      "' are not columns of the data and will be ignored."
-    )
-    forecast_unit <- intersect(forecast_unit, datacols)
+  data <- ensure_data.table(data)
+  missing <- check_columns_present(data, forecast_unit)
+  if (!is.logical(missing)) {
+    warning(missing)
+    forecast_unit <- intersect(forecast_unit, colnames(data))
   }
-
   keep_cols <- c(get_protected_columns(data), forecast_unit)
   out <- unique(data[, .SD, .SDcols = keep_cols])[]
   return(out)

diff --git a/R/data.R b/R/data.R
@@ -19,7 +19,7 @@
 #'   \item{model}{name of the model that generated the forecasts}
 #'   \item{horizon}{forecast horizon in weeks}
 #' }
-#' @source \url{https://github.com/covid19-forecast-hub-europe/covid19-forecast-hub-europe/commit/a42867b1ea152c57e25b04f9faa26cfd4bfd8fa6/} # nolint
+#' @source \url{https://github.com/covid19-forecast-hub-europe/covid19-forecast-hub-europe/commit/a42867b1ea152c57e25b04f9faa26cfd4bfd8fa6/}
 "example_quantile"
 
 
@@ -44,7 +44,7 @@
 #'   \item{model}{name of the model that generated the forecasts}
 #'   \item{horizon}{forecast horizon in weeks}
 #' }
-#' @source \url{https://github.com/covid19-forecast-hub-europe/covid19-forecast-hub-europe/commit/a42867b1ea152c57e25b04f9faa26cfd4bfd8fa6/} # nolint
+#' @source \url{https://github.com/covid19-forecast-hub-europe/covid19-forecast-hub-europe/commit/a42867b1ea152c57e25b04f9faa26cfd4bfd8fa6/}
 "example_point"
 
 
@@ -69,7 +69,7 @@
 #'   \item{predicted}{predicted value}
 #'   \item{sample_id}{id for the corresponding sample}
 #' }
-#' @source \url{https://github.com/covid19-forecast-hub-europe/covid19-forecast-hub-europe/commit/a42867b1ea152c57e25b04f9faa26cfd4bfd8fa6/} # nolint
+#' @source \url{https://github.com/covid19-forecast-hub-europe/covid19-forecast-hub-europe/commit/a42867b1ea152c57e25b04f9faa26cfd4bfd8fa6/}
 "example_continuous"
 
 
@@ -124,7 +124,7 @@
 #'   \item{horizon}{forecast horizon in weeks}
 #'   \item{predicted}{predicted value}
 #' }
-#' @source \url{https://github.com/covid19-forecast-hub-europe/covid19-forecast-hub-europe/commit/a42867b1ea152c57e25b04f9faa26cfd4bfd8fa6/} # nolint
+#' @source \url{https://github.com/covid19-forecast-hub-europe/covid19-forecast-hub-europe/commit/a42867b1ea152c57e25b04f9faa26cfd4bfd8fa6/}
 "example_binary"
 
 
@@ -147,7 +147,7 @@
 #'   \item{model}{name of the model that generated the forecasts}
 #'   \item{horizon}{forecast horizon in weeks}
 #' }
-#' @source \url{https://github.com/covid19-forecast-hub-europe/covid19-forecast-hub-europe/commit/a42867b1ea152c57e25b04f9faa26cfd4bfd8fa6/} # nolint
+#' @source \url{https://github.com/covid19-forecast-hub-europe/covid19-forecast-hub-europe/commit/a42867b1ea152c57e25b04f9faa26cfd4bfd8fa6/}
 "example_quantile_forecasts_only"
 
 
@@ -167,7 +167,7 @@
 #'   \item{observed}{observed values}
 #'   \item{location_name}{name of the country for which a prediction was made}
 #' }
-#' @source \url{https://github.com/covid19-forecast-hub-europe/covid19-forecast-hub-europe/commit/a42867b1ea152c57e25b04f9faa26cfd4bfd8fa6/} # nolint
+#' @source \url{https://github.com/covid19-forecast-hub-europe/covid19-forecast-hub-europe/commit/a42867b1ea152c57e25b04f9faa26cfd4bfd8fa6/}
 "example_truth_only"
 
 #' Summary information for selected metrics
@@ -216,8 +216,13 @@
 #'
 #' A named list with functions:
 #' - "wis" = [wis()]
+#' - "overprediction" = [overprediction()]
+#' - "underprediction" = [underprediction()]
+#' - "dispersion" = [dispersion()]
 #' - "bias" = [bias_quantile()]
-#' - "coverage_50" = \(...) {run_safely(..., range = 50, fun = interval_coverage_quantile)} #nolint
-#' - "coverage_90" = \(...) {run_safely(..., range = 90, fun = interval_coverage_quantile)} #nolint
+#' - "coverage_50" = \(...) {run_safely(..., range = 50, fun = [interval_coverage_quantile][interval_coverage_quantile()])}
+#' - "coverage_90" = \(...) {run_safely(..., range = 90, fun = [interval_coverage_quantile][interval_coverage_quantile()])}
+#' - "coverage_deviation" = [interval_coverage_deviation_quantile()]
+#' - "ae_median" = [ae_median_quantile()]
 #' @keywords info
 "metrics_quantile"
diff --git a/R/get_-functions.R b/R/get_-functions.R
@@ -193,6 +193,8 @@ get_protected_columns <- function(data = NULL) {
   protected_columns <- c(
     "predicted", "observed", "sample_id", "quantile", "upper", "lower",
     "pit_value", "range", "boundary", "relative_skill", "scaled_rel_skill",
+    "interval_coverage", "interval_coverage_deviation",
+    "quantile_coverage", "quantile_coverage_deviation",
     available_metrics(),
     grep("coverage_", names(data), fixed = TRUE, value = TRUE)
   )