diff --git a/.Rbuildignore b/.Rbuildignore index 31fbab108..b3b6bd9ec 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -5,9 +5,9 @@ ^LICENSE\.md$ ^cran-comments\.md$ ^CRAN-RELEASE$ -^NEWS.md ^R/sandbox\.R$ -^Readme.md -^Readme.Rmd +^README.Rmd +^_pkgdown.yml$ ^doc$ ^Meta$ +^_pkgdown\.yml$ diff --git a/.github/workflows/check-full.yaml b/.github/workflows/check-full.yaml index ff24a97f2..1cff5c613 100644 --- a/.github/workflows/check-full.yaml +++ b/.github/workflows/check-full.yaml @@ -2,8 +2,6 @@ on: push: branches: - master - schedule: - - cron: "0 0 * * *" pull_request: branches: - master @@ -20,12 +18,10 @@ jobs: fail-fast: false matrix: config: - - {os: macOS-latest, r: 'devel'} - - {os: macOS-latest, r: 'release'} + - {os: macOS-latest, r: 'release'} - {os: windows-latest, r: 'release'} - - {os: ubuntu-16.04, r: 'release', rspm: "https://packagemanager.rstudio.com/cran/__linux__/xenial/latest"} - - {os: ubuntu-16.04, r: 'oldrel', rspm: "https://packagemanager.rstudio.com/cran/__linux__/xenial/latest"} - + - {os: ubuntu-20.04, r: 'release'} + - {os: ubuntu-20.04, r: 'oldrel'} env: R_REMOTES_NO_ERRORS_FROM_WARNINGS: true RSPM: ${{ matrix.config.rspm }} @@ -60,15 +56,18 @@ jobs: env: RHUB_PLATFORM: linux-x86_64-ubuntu-gcc run: | - Rscript -e "remotes::install_github('r-hub/sysreqs')" - sysreqs=$(Rscript -e "cat(sysreqs::sysreq_commands('DESCRIPTION'))") - sudo -s eval "$sysreqs" + while read -r cmd + do + eval sudo $cmd + done < <(Rscript -e 'writeLines(remotes::system_requirements("ubuntu", "20.04"))') + sudo apt-get -y install libudunits2-dev libgdal-dev libqpdf-dev libcurl4-openssl-dev + shell: bash + - name: Install dependencies run: | remotes::install_deps(dependencies = TRUE) remotes::install_cran("rcmdcheck") shell: Rscript {0} - - name: Session info run: | options(width = 100) diff --git a/DESCRIPTION b/DESCRIPTION index 194851b47..95262d532 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -67,7 +67,7 @@ Suggests: knitr, rmarkdown RoxygenNote: 7.1.1 -URL: 
https://github.com/epiforecasts/scoringutils +URL: https://github.com/epiforecasts/scoringutils, https://epiforecasts.io/scoringutils/ BugReports: https://github.com/epiforecasts/scoringutils/issues VignetteBuilder: knitr Depends: diff --git a/NAMESPACE b/NAMESPACE index 3bae902fb..8f8158154 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -38,7 +38,6 @@ export(score_heatmap) export(score_table) export(sharpness) export(show_avail_forecasts) -export(update_list) export(wis_components) importFrom(data.table,"%like%") importFrom(data.table,':=') diff --git a/NEWS.md b/NEWS.md index 54e33577a..dc8680033 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,44 +1,50 @@ -## scoringutils 0.1.7.2 -### Package updates +# scoringutils 0.1.7.2 + +## Package updates - minor bug fixes (previously, 'interval_score' needed to be among the selected metrics) +- all data.tables are now returned as `table[]` rather than as `table`, +such that they don't have to be called twice to display the contents. + +# scoringutils 0.1.7 -## scoringutils 0.1.7 ## Feature updates - added a function, `pairwise_comparison()` that runs pairwise comparisons between models on the output of `eval_forecasts()` - added functionality to compute relative skill within `eval_forecasts()` - added a function to visualise pairwise comparisons -### Package updates +## Package updates - The WIS definition change introduced in version 0.1.5 was partly corrected such that the difference in weighting is only introduced when summarising over scores from different interval ranges -## scoringutils 0.1.6 +# scoringutils 0.1.6 + ## Feature updates - `eval_forecasts()` can now handle a separate forecast and truth data set as as input - `eval_forecasts()` now supports scoring point forecasts along side quantiles in a quantile-based format. Currently the only metric used is the absolute error -### Package updates +## Package updates - Many functions, especially `eval_forecasts()` got a major rewrite. 
While functionality should be unchanged, the code should now be easier to maintain - Some of the data-handling functions got renamed, but old names are supported as well for now. -## scoringutils 0.1.5 -### Package updates +# scoringutils 0.1.5 + +## Package updates - changed the default definition of the weighted interval score. Previously, the median prediction was counted twice, but is no only counted once. If you want to go back to the old behaviour, you can call the interval_score function with the argument `count_median_twice = FALSE`. -## scoringutils 0.1.4 +# scoringutils 0.1.4 -### Feature updates +## Feature updates - we added basic plotting functionality to visualise scores. You can now easily obtain diagnostic plots based on scores as produced by `eval_forecasts`. - `correlation_plot` shows correlation between metrics @@ -47,16 +53,16 @@ chosen metric - `score_heatmap` visualises scores as heatmap - `score_table` shows a coloured summary table of scores -### package updates +## package updates - renamed "calibration" to "coverage" - renamed "true_values" to "true_value" in data.frames - renamed "predictions" to "prediction" in data.frames - renamed "is_overprediction" to "overprediction" - renamed "is_underprediction" to "underprediction" -## scoringutils 0.1.3 +# scoringutils 0.1.3 -### (Potentially) Breaking changes +## (Potentially) Breaking changes - the by argument in `eval_forecasts` now has a slightly changed meaning. It now denotes the lowest possible grouping unit, i.e. the unit of one observation and needs to be specified explicitly. The default is now `NULL`. The reason for @@ -68,7 +74,7 @@ to be included. - for the interval score, `weigh = TRUE` is now the default option. - (potentially planned) rename true_values to true_value and predictions to prediction. -### Feature updates +## Feature updates - updated quantile evaluation metrics in `eval_forecasts`. 
Bias as well as calibration now take all quantiles into account - Included option to summarise scores according to a `summarise_by` argument in @@ -77,13 +83,13 @@ as an arbitrary set of quantiles. - `eval_forecasts` can now return pit histograms. - switched to ggplot2 for plotting -## scoringutils 0.1.2 +# scoringutils 0.1.2 -### (Potentially) Breaking changes +## (Potentially) Breaking changes - all scores in eval_forecasts were consistently renamed to lower case. Interval_score is now interval_score, CRPS is now crps etc. -### Feature updates +## Feature updates - included support for grouping scores according to a vector of column names in `eval_forecasts` - included support for passing down arguments to lower-level functions in @@ -91,17 +97,17 @@ in `eval_forecasts` - included support for three new metrics to score quantiles with `eval_forecasts`: bias, sharpness and calibration -### Package updates +## Package updates - example data now has a horizon column to illustrate the use of grouping - documentation updated to explain the above listed changes -## scoringutils 0.1.1 +# scoringutils 0.1.1 -### Feature updates +## Feature updates - included support for a long as well as wide input formats for quantile forecasts that are scored with `eval_forecasts` -### Package updates +## Package updates - updated documentation for the `eval_forecasts` - added badges to the Readme diff --git a/R/bias.R b/R/bias.R index 588b757b4..0fb4620c6 100644 --- a/R/bias.R +++ b/R/bias.R @@ -92,22 +92,14 @@ bias <- function(true_values, predictions) { n_pred <- ncol(predictions) # empirical cdf - P_x <- vapply(seq_along(true_values), - function(i) { - sum(predictions[i,] <= true_values[i]) / n_pred - }, - .0) + P_x <- rowSums(predictions <= true_values) / n_pred if (continuous_predictions) { res <- 1 - 2 * P_x return(res) } else { # for integer case also calculate empirical cdf for (y-1) - P_xm1 <- vapply(seq_along(true_values), - function(i) { - sum(predictions[i,] <= 
true_values[i] - 1) / n_pred - }, - .0) + P_xm1 <- rowSums(predictions <= (true_values - 1)) / n_pred res <- 1 - (P_x + P_xm1) return(res) diff --git a/R/eval_forecasts_binary.R b/R/eval_forecasts_binary.R index 9b94d6bb0..93d7cb21b 100644 --- a/R/eval_forecasts_binary.R +++ b/R/eval_forecasts_binary.R @@ -45,6 +45,8 @@ eval_forecasts_binary <- function(data, by = summarise_by] } + + return(res[]) } diff --git a/R/eval_forecasts_continuous_integer.R b/R/eval_forecasts_continuous_integer.R index 34ada6c5a..82e130e12 100644 --- a/R/eval_forecasts_continuous_integer.R +++ b/R/eval_forecasts_continuous_integer.R @@ -169,5 +169,5 @@ eval_forecasts_sample <- function(data, pit_plots = pit_histograms) } - return(res) + return(res[]) } diff --git a/R/eval_forecasts_helper.R b/R/eval_forecasts_helper.R index f66bf6b4a..40088b0e0 100644 --- a/R/eval_forecasts_helper.R +++ b/R/eval_forecasts_helper.R @@ -20,7 +20,7 @@ add_quantiles <- function(dt, varnames, quantiles, by) { na.rm = TRUE)), by = c(by)] } - return(dt) + return(dt[]) } @@ -41,5 +41,5 @@ add_sd <- function(dt, varnames, by) { for (varname in varnames) { dt[, paste0(varname, "_sd") := sd(get(varname), na.rm = TRUE), by = by] } - return(dt) + return(dt[]) } diff --git a/R/eval_forecasts_quantile.R b/R/eval_forecasts_quantile.R index 6395ea7fa..036b97ea1 100644 --- a/R/eval_forecasts_quantile.R +++ b/R/eval_forecasts_quantile.R @@ -117,8 +117,12 @@ eval_forecasts_quantile <- function(data, quantile_data[, quantile_coverage := (true_value <= prediction)] } - # merge only if something was computed - if (any(c("aem", "quantile_coverage") %in% metrics)) { + # merge metrics computed on quantile data (i.e. aem, quantile_coverage) back + # into metrics computed on range data. One important side effect of this is + # that it controls whether we count the median twice for the interval score + # (row is then duplicated) or only once. 
However, merge only needs to happen + # if we computed either the interval score or the aem or quantile coverage + if (any(c("aem", "interval_score", "quantile_coverage") %in% metrics)) { # delete unnecessary columns before merging back keep_cols <- unique(c(by, "quantile", "aem", "quantile_coverage", "boundary", "range")) @@ -183,12 +187,12 @@ eval_forecasts_quantile <- function(data, } # if neither quantile nor range are in summarise_by, remove coverage and quantile_coverage - if (!("range" %in% summarise_by)) { + if (!("range" %in% summarise_by) & ("coverage" %in% colnames(res))) { res[, c("coverage") := NULL] } if (!("quantile" %in% summarise_by) & "quantile_coverage" %in% names(res)) { res[, c("quantile_coverage") := NULL] } - return(res) + return(res[]) } diff --git a/R/interval_score.R b/R/interval_score.R index 9d6359a53..f9208674f 100644 --- a/R/interval_score.R +++ b/R/interval_score.R @@ -53,6 +53,7 @@ #' upper = upper, #' interval_range = interval_range) #' +#' # example with missing values and separate results #' interval_score(true_values = c(true_values, NA), #' lower = c(lower, NA), #' upper = c(NA, upper), diff --git a/R/pairwise-comparisons.R b/R/pairwise-comparisons.R index 6ec96e9bd..a17a6ea9c 100644 --- a/R/pairwise-comparisons.R +++ b/R/pairwise-comparisons.R @@ -172,7 +172,7 @@ add_rel_skill_to_eval_forecasts <- function(unsummarised_scores, # also delete skill metric from output out[, eval(rel_skill_metric) := NULL] - return(out) + return(out[]) } @@ -278,6 +278,8 @@ pairwise_comparison_one_group <- function(scores, by = "model"] # merge back to retain the ratios even for comparisons with the baseline result <- merge(result, result_without_baseline, all.x = TRUE) + # avoid mixture of NA and NaN which can cause problems downstream + result[is.na(theta), theta := NA_real_] # remove NAs form merge in the thetas result[, theta := unique(na.omit(theta)), by = "model"] } else { @@ -303,7 +305,7 @@ pairwise_comparison_one_group <- function(scores, 
data.table::setnames(out, old = c("ratio", "theta", "rel_to_baseline"), new = c("mean_scores_ratio", "relative_skill", "scaled_rel_skill")) - return(out) + return(out[]) } @@ -341,7 +343,7 @@ compare_two_models <- function(scores, scores <- data.table::as.data.table(scores) if (!("model" %in% names(scores))) { - stop("pairwise compairons require a column called 'model'") + stop("pairwise comparisons require a column called 'model'") } # select only columns in c(by, var) diff --git a/R/pit.R b/R/pit.R index 81001bef2..fd4f154d7 100644 --- a/R/pit.R +++ b/R/pit.R @@ -166,11 +166,7 @@ pit <- function(true_values, # calculate emipirical cumulative distribution function as # Portion of (y_true <= y_predicted) - P_x <- vapply(seq_along(true_values), - function(i) { - sum(predictions[i, ] <= true_values[i]) / n_pred - }, - .0) + P_x <- rowSums(predictions <= true_values) / n_pred # calculate PIT for continuous predictions case if (continuous_predictions) { @@ -190,11 +186,7 @@ pit <- function(true_values, # calculate PIT for integer predictions case if (!continuous_predictions) { # empirical cdf for (y-1) for integer-valued predictions - P_xm1 <- vapply(seq_along(true_values), - function(i) { - sum(predictions[i,] <= true_values[i] - 1) / n_pred - }, - .0) + P_xm1 <- rowSums(predictions <= (true_values - 1)) / n_pred # do n_replicates times for randomised PIT u <- replicate(n_replicates, P_xm1 + stats::runif(n) * (P_x - P_xm1)) # apply Anderson Darling test on u values diff --git a/R/plot.R b/R/plot.R index f4734d220..78c3496e1 100644 --- a/R/plot.R +++ b/R/plot.R @@ -744,13 +744,14 @@ plot_predictions <- function(data = NULL, select_median <- (forecasts$range %in% 0 & forecasts$boundary == "lower") median <- forecasts[select_median] - plot <- plot + - ggplot2::geom_line(data = median, - mapping = ggplot2::aes(y = prediction, colour = "median"), - lwd = 0.4) + if (nrow(median) > 0) { + plot <- plot + + ggplot2::geom_line(data = median, + mapping = ggplot2::aes(y = 
prediction, colour = "median"), + lwd = 0.4) + } } - # facet if specified by the user if (!is.null(facet_formula)) { if (facet_wrap_or_grid == "facet_wrap") { diff --git a/R/utils.R b/R/utils.R index ef425e480..7eeaefb46 100644 --- a/R/utils.R +++ b/R/utils.R @@ -177,7 +177,6 @@ extract_from_list <- function(list, what) { #' @param defaults A list of default settings #' @param optional A list of optional settings to override defaults #' @return A list -#' @export #' #' @keywords internal update_list <- function(defaults = list(), optional = list()) { diff --git a/R/utils_data_handling.R b/R/utils_data_handling.R index 68b8e5cdb..419fbb53c 100644 --- a/R/utils_data_handling.R +++ b/R/utils_data_handling.R @@ -29,7 +29,7 @@ range_long_to_wide <- function(data) { out <- data.table::dcast(data, ... ~ boundary + range, value.var = "prediction") - return(out) + return(out[]) } @@ -45,7 +45,7 @@ range_long_to_wide <- function(data) { quantile_to_wide <- function(data) { warning("This function will be deprecated. Please use `range_long_to_wide()` in the future") out <- scoringutils::range_long_to_wide(data) - return(out) + return(out[]) } @@ -106,7 +106,7 @@ range_wide_to_long <- function(data) { data[, range := as.numeric(gsub("^.*?_","", range))] } - return(data) + return(data[]) } @@ -120,7 +120,7 @@ range_wide_to_long <- function(data) { quantile_to_long <- function(data) { warning("This function will be deprecated. Please use `range_wide_to_long()` in the future") out <- scoringutils::range_wide_to_long(data) - return(out) + return(out[]) } @@ -173,7 +173,7 @@ range_long_to_quantile <- function(data, } - return(unique(data)) + return(unique(data)[]) } @@ -188,7 +188,7 @@ range_to_quantile <- function(data, keep_range_col = FALSE) { warning("This function will be deprecated. 
Please use `range_long_to_quantile()` in the future") out <- scoringutils::range_long_to_quantile(data, keep_range_col) - return(out) + return(out[]) } @@ -246,7 +246,7 @@ quantile_to_range_long <- function(data, data[, `:=`(boundary = as.character(boundary), range = as.numeric(range))] - return(data) + return(data[]) } @@ -261,7 +261,7 @@ quantile_to_range <- function(data, keep_quantile_col = FALSE) { warning("This function will be deprecated. Please use `quantile_to_range_long()` in the future") out <- scoringutils::quantile_to_range_long(data, keep_quantile_col) - return(out) + return(out[]) } @@ -306,7 +306,7 @@ sample_to_quantile <- function(data, type = type, na.rm = TRUE)), by = by] - return(data) + return(data[]) } @@ -354,7 +354,7 @@ sample_to_range_long <- function(data, data <- scoringutils::quantile_to_range_long(data, keep_quantile_col = keep_quantile_col) - return(data) + return(data[]) } @@ -372,7 +372,7 @@ sample_to_range <- function(data, keep_quantile_col = TRUE) { warning("This function will be deprecated. 
Please use `sample_to_range-long()` in the future") out <- scoringutils::sample_to_range_long(data, range, type, keep_quantile_col) - return(out) + return(out[]) } @@ -450,7 +450,7 @@ merge_pred_and_obs <- function(forecasts, observations, combined[, paste0(basenames_overlap, ".y") := NULL] } - return(combined) + return(combined[]) } diff --git a/Readme.Rmd b/README.Rmd similarity index 100% rename from Readme.Rmd rename to README.Rmd diff --git a/Readme.md b/README.md similarity index 100% rename from Readme.md rename to README.md diff --git a/_pkgdown.yml b/_pkgdown.yml index 7e05f6cb6..862c3a5fb 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -5,6 +5,7 @@ reference: - title: Evaluation functions contents: - compare_two_models + - starts_with("eval_") - eval_forecasts - eval_forecasts_binary - eval_forecasts_sample diff --git a/man/interval_score.Rd b/man/interval_score.Rd index 5ae6e1dfb..75b6153fd 100644 --- a/man/interval_score.Rd +++ b/man/interval_score.Rd @@ -75,6 +75,7 @@ interval_score(true_values = true_values, upper = upper, interval_range = interval_range) +# example with missing values and separate results interval_score(true_values = c(true_values, NA), lower = c(lower, NA), upper = c(NA, upper), diff --git a/tests/testthat/test-eval_forecasts.R b/tests/testthat/test-eval_forecasts.R index 3c0e6caab..34c1d64b6 100644 --- a/tests/testthat/test-eval_forecasts.R +++ b/tests/testthat/test-eval_forecasts.R @@ -81,6 +81,21 @@ test_that("function produces output even if only some metrics are chosen", { TRUE) }) +test_that("WIS is the same with other metrics omitted or included", { + range_example_wide <- data.table::setDT(scoringutils::range_example_data_wide) + range_example <- scoringutils::range_wide_to_long(range_example_wide) + + eval <- scoringutils::eval_forecasts(range_example, + summarise_by = c("model", "range"), + metrics = "interval_score") + + eval2 <- scoringutils::eval_forecasts(range_example, + summarise_by = c("model", "range")) + + 
expect_equal(sum(eval$interval_score), + sum(eval2$interval_score)) +}) + diff --git a/vignettes/scoringutils.Rmd b/vignettes/scoringutils.Rmd index 04575d9b4..80cafec5b 100644 --- a/vignettes/scoringutils.Rmd +++ b/vignettes/scoringutils.Rmd @@ -21,7 +21,7 @@ knitr::opts_chunk$set( The `scoringutils` package provides a collection of metrics and proper scoring rules that make it simple to score forecasts against the true observed values. -Predictions can either be automatically scored from a `data.frame` using the function `eval_forecasts`. Alternatively, evaluation metrics can be accessed directly using lower level functions within a vector/matrix framework. +Predictions can either be automatically scored from a `data.frame` using the function `eval_forecasts()`. Alternatively, evaluation metrics can be accessed directly using lower level functions within a vector/matrix framework. Predictions can be handled in various formats: `scoringutils` can handle probabilistic forecasts in either a sample based or a quantile based format. For more detail on the expected input formats please see below. True values can be integer, continuous or binary. @@ -29,7 +29,7 @@ In addition to automatic scoring, `scoringutils` offers a variety of plots and v # Scoring Forecasts Automatically -Most of the time, the `eval_forecasts` function will be able to do the entire evaluation for you. The idea is simple, yet flexible. +Most of the time, the `eval_forecasts()` function will be able to do the entire evaluation for you. The idea is simple, yet flexible. All you need to do is to pass in a `data.frame` that has a column called `prediction` and one called `true_value`. Depending on the exact input format, additional columns like `sample`, `quantile` or `range` and `boundary` are needed. Additional columns may be present to indicate a grouping of forecasts. 
For example, we could have forecasts made by different models in various locations at different time points, each for several weeks into the future. In this case, we would have additional columns called for example `model`, `date`, `forecast_date`, `forecast_horizon` and `location`. @@ -44,7 +44,6 @@ Here is an example of an evaluation using the example data included in the packa ```{r} library(scoringutils) -library(data.table) ``` ```{r} @@ -117,7 +116,7 @@ We can also visualise metrics using a heatmap: ```{r} scores <- scoringutils::eval_forecasts(data, summarise_by = c("model", "horizon")) -scores <- scores[, horizon := as.factor(horizon)] +scores$horizon <- as.factor(scores$horizon) scoringutils::score_heatmap(scores, x = "horizon", metric = "bias") ``` @@ -125,7 +124,7 @@ scoringutils::score_heatmap(scores, ### Expected Input Formats -The `eval_forecasts` function is designed to work with various different input formats. The following formats are currently supported: +The `eval_forecasts()` function is designed to work with various different input formats. The following formats are currently supported: quantile forecasts in either a plain quantile format or in a format that specifies interval ranges and the boundary of a given interval range. @@ -260,7 +259,7 @@ Null hypothesis will often be rejected outright. ## Continuous Ranked Probability Score (CRPS) -Wrapper around the `crps_sample` function from the +Wrapper around the `crps_sample()` function from the `scoringRules` package. For more information look at the manuals from the `scoringRules` package. The function can be used for continuous as well as integer valued forecasts. Smaller values are better. @@ -274,7 +273,7 @@ crps(true_values, predictions) ## Dawid-Sebastiani Score (DSS) -Wrapper around the `dss_sample` function from the +Wrapper around the `dss_sample()` function from the `scoringRules` package. For more information look at the manuals from the `scoringRules` package. 
The function can be used for continuous as well as integer valued forecasts. Smaller values are better. @@ -286,7 +285,7 @@ dss(true_values, predictions) ``` ## Log Score -Wrapper around the `log_sample` function from the +Wrapper around the `log_sample()` function from the `scoringRules` package. For more information look at the manuals from the `scoringRules` package. The function should not be used for integer valued forecasts. While Log Scores are in principle possible for integer valued