From 228931165a3e6a4d5eb83933cdfd662431ce683d Mon Sep 17 00:00:00 2001 From: Hao Ye Date: Mon, 20 Apr 2020 16:27:46 -0400 Subject: [PATCH 1/8] add function to check for equitimed data add tests --- NAMESPACE | 2 + R/utils-data-processing.R | 61 +++++++++++++++++++++ man/is_equitimed.Rd | 25 +++++++++ tests/testthat/test-02-dataset-processing.R | 17 +++++- 4 files changed, 104 insertions(+), 1 deletion(-) create mode 100644 R/utils-data-processing.R create mode 100644 man/is_equitimed.Rd diff --git a/NAMESPACE b/NAMESPACE index 8ee186a..7d7ac3e 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -44,6 +44,8 @@ export(import_retriever_data) export(install_retriever_data) export(interpolate_obs) export(invoke) +export(is_equitimed) +export(is_evenly_sampled) export(normalize_effort) export(normalize_obs) export(normalize_times) diff --git a/R/utils-data-processing.R b/R/utils-data-processing.R new file mode 100644 index 0000000..0b4e9ce --- /dev/null +++ b/R/utils-data-processing.R @@ -0,0 +1,61 @@ +#' @title Check that the times of a dataset are evenly sampled +#' @aliases is_evenly_sampled +#' +#' @param data dataset to check +#' @param period period to check the times against (if `NULL`, first check to +#' see if there is a known `period` set in the metadata, otherwise assumes 1) +#' @param tol tolerance for the period +#' @param return_if_time_missing default value to return if times are missing +#' +#' @return TRUE or FALSE, invisibly +#' +#' @export +is_equitimed <- function(data, period = NULL, tol = 1e-06, + return_if_time_missing = TRUE) +{ + stopifnot(check_data_format(data)) + + times <- get_times_from_data(data) + if (is.null(times)) + { + message("No time index found. Returning ", return_if_time_missing) + return(return_if_time_missing) + } + period <- get_period(data$metadata$period, period) + + full_times <- tryCatch(tidyr::full_seq(times, period, tol), + error = function(e) { + message(e) + return(NULL) + }) + invisible(isTRUE(all.equal(times, full_times))) +} + +#' @export +is_evenly_sampled <- is_equitimed + +#' extract the period, given the value from the metadata field, and a value +#' specified by the user. The flowchart is: +#' (1) if user has supplied non-null `period`, use that +#' (2) if metadata period is non-null, use that +#' (3) use a default value of 1 and print a message +#' +#' @noRd +get_period <- function(metadata_period, period = NULL) +{ + if (is.null(period)) + { + period <- metadata_period + if (is.null(period)) + { + message("No time period found. Assuming period = 1.") + period <- 1 + } + } + return(period) +} + + + + + diff --git a/man/is_equitimed.Rd b/man/is_equitimed.Rd new file mode 100644 index 0000000..86f25ea --- /dev/null +++ b/man/is_equitimed.Rd @@ -0,0 +1,25 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utils-data-processing.R +\name{is_equitimed} +\alias{is_equitimed} +\alias{is_evenly_sampled} +\title{Check that the times of a dataset are evenly sampled} +\usage{ +is_equitimed(data, period = NULL, tol = 1e-06, return_if_time_missing = TRUE) +} +\arguments{ +\item{data}{dataset to check} + +\item{period}{period to check the times against (if \code{NULL}, first check to +see if there is a known \code{period} set in the metadata, otherwise assumes 1)} + +\item{tol}{tolerance for the period} + +\item{return_if_time_missing}{default value to return if times are missing} +} +\value{ +TRUE or FALSE, invisibly +} +\description{ +Check that the times of a dataset are evenly sampled +} diff --git a/tests/testthat/test-02-dataset-processing.R b/tests/testthat/test-02-dataset-processing.R index 9f2c56e..288c4d4 100644 --- a/tests/testthat/test-02-dataset-processing.R +++ b/tests/testthat/test-02-dataset-processing.R @@ -1,7 +1,6 @@ context("Check Dataset Processing Code") test_that("BBS data processing works", { - species_table <- data.frame(aou = c(1, 3, 33, 44), spanish_common_name = c("a x", "c x", "c x / yy", "d x zz")) @@ -14,3 +13,19 @@ test_that("BBS data processing works", { expect_equal(out$species_id, c(1, 2, 3, 44)) expect_equal(out$abundance, c(2, 2, 4, 2)) }) + +test_that("is_equitimed works", { + m <- capture_error(expect_false(is_equitimed(dragons))) + expect_match(as.character(m), "Error: `x` is not a regular sequence.", fixed = TRUE) + + path <- system.file("extdata", "subsampled", + package = "MATSS", mustWork = TRUE) + dat <- get_mtquad_data() + expect_true(is_equitimed(dat)) +}) + + + + + +test_that("make_integer_times") \ No newline at end of file From dd20bfd5319768fb2886bbf5c436a967a0ef2c09 Mon Sep 17 00:00:00 2001 From: Hao Ye Date: Mon, 20 Apr 2020 21:32:56 -0400 Subject: [PATCH 2/8] add function to produce equitimed data --- DESCRIPTION | 3 +- NAMESPACE | 2 + R/utils-data-processing.R | 140 +++++++++++++++++--- man/is_equitimed.Rd | 4 +- man/make_equitimed.Rd | 50 +++++++ tests/testthat/test-02-dataset-processing.R | 5 +- 6 files changed, 181 insertions(+), 23 deletions(-) create mode 100644 man/make_equitimed.Rd diff --git a/DESCRIPTION b/DESCRIPTION index 401b376..9d7371f 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -64,7 +64,8 @@ Imports: stringr, tibble, tidyr, - usethis + usethis, + vctrs Suggests: covr, knitr, diff --git a/NAMESPACE b/NAMESPACE index 7d7ac3e..a9e4d09 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -46,6 +46,8 @@ export(interpolate_obs) export(invoke) export(is_equitimed) export(is_evenly_sampled) +export(make_equitimed) +export(make_evenly_sampled) export(normalize_effort) export(normalize_obs) export(normalize_times) diff --git a/R/utils-data-processing.R b/R/utils-data-processing.R index 0b4e9ce..033f662 100644 --- a/R/utils-data-processing.R +++ b/R/utils-data-processing.R @@ -5,35 +5,40 @@ #' @param period period to check the times against (if `NULL`, first check to #' see if there is a known `period` set in the metadata, otherwise assumes 1) #' @param tol tolerance for the period -#' @param return_if_time_missing default value to return if times are missing #' #' @return TRUE or FALSE, invisibly #' #' @export -is_equitimed <- function(data, period = NULL, tol = 1e-06, - return_if_time_missing = TRUE) +is_equitimed <- function(data, period = NULL, tol = 1e-06) { stopifnot(check_data_format(data)) + fulL_times <- get_full_times(data = data, period = period, tol = tol) times <- get_times_from_data(data) - if (is.null(times)) - { - message("No time index found. Returning ", return_if_time_missing) - return(return_if_time_missing) - } - period <- get_period(data$metadata$period, period) - - full_times <- tryCatch(tidyr::full_seq(times, period, tol), - error = function(e) { - message(e) - return(NULL) - }) invisible(isTRUE(all.equal(times, full_times))) } #' @export is_evenly_sampled <- is_equitimed +#' get the complete time index, filling in gaps where necessary, and using the +#' period to establish the sampling frequency +#' +#' @noRd +get_full_times <- function(data, period = NULL, tol = 1e-06, + convert_error_to_message = TRUE) +{ + times <- get_times_from_data(data) + period <- get_period_from_data(data, period) + + full_times <- tryCatch(tidyr::full_seq(times, period, tol), + error = function(e) { + message(e$message); + return(NULL) + }) + return(full_times) +} + #' extract the period, given the value from the metadata field, and a value #' specified by the user. The flowchart is: #' (1) if user has supplied non-null `period`, use that @@ -41,11 +46,11 @@ is_evenly_sampled <- is_equitimed #' (3) use a default value of 1 and print a message #' #' @noRd -get_period <- function(metadata_period, period = NULL) +get_period_from_data <- function(data, period = NULL) { if (is.null(period)) { - period <- metadata_period + period <- data$metadata$period if (is.null(period)) { message("No time period found. Assuming period = 1.") @@ -54,8 +59,109 @@ get_period <- function(metadata_period, period = NULL) } return(period) } +#' @title Insert rows if necessary so that time series are evenly sampled +#' @aliases make_evenly_sampled +#' +#' @param data dataset to modify +#' @inheritParams is_equitimed +#' @param method one of `c("mean", "method", "closest")` that determines how +#' the rows of the original data will get coerced into the output here. +#' @inheritParams base::mean +#' +#' @return the dataset, with rows coerced according to the equitimed time +#' indices, and additional empty rows inserted if needed +#' +#' @details First, `get_full_times()` computes the sequence of time index values +#' at a regular sampling interval of period. These will be the final time +#' index values for the output. *Some* set of rows of the original dataset +#' will map to each of these time indices. +#' +#' The `method` argument determines how these rows get coerced: +#' \describe{ +#' \item{mean}{the values in the rows are averaged together using `mean`} +#' \item{median}{the values in the rows are averaged together using `median`} +#' \item{closest}{the values in the row that is closest in time to the +#' desired time index are used.} +#' } +#' +#' @export +make_equitimed <- function(data, period = NULL, tol = 1e-06, + method = c("mean", "method", "closest"), + na.rm = TRUE) +{ + stopifnot(check_data_format(data)) + + full_times <- get_full_times(data = data, period = period, tol = tol) + if (is.null(full_times)) + { + stop("Unable to construct an equitimed time index.") + } + + times <- get_times_from_data(data) + if (isTRUE(all.equal(times, full_times))) + { + message("Dataset is already equitimed (evenly sampled in time).") + return(invisible(data)) + } + + # generate empty matrices to hold final abundance and covariates + abundance <- matrix(NA, nrow = length(full_times), ncol = NCOL(data$abundance)) + covariates <- data$covariates[0,] + # compute separation between times and full_times + times_dist <- outer(times, full_times, function(a, b) {abs(b - a)}) + + # fill abundance and covariates + method <- match.arg(method) + switch(method, + mean = { + idx <- times_dist <= tol + for (i in seq_along(full_times)) + { + abundance[i, ] <- colMeans(data$abundance[idx[, i], , drop = FALSE], na.rm = na.rm) + covariates[i, ] <- purrr::map_dfc(yy$covariates[idx[, i], , drop = FALSE], mean, na.rm = TRUE) + } + }, + median = { + idx <- times_dist <= tol + for (i in seq_along(full_times)) + { + abundance[i, ] <- apply(data$abundance[idx[, i], , drop = FALSE], 2, median, na.rm = na.rm) + covariates[i, ] <- purrr::map_dfc(yy$covariates[idx[, i], , drop = FALSE], median, na.rm = TRUE) + } + }, + closest = { + idx <- apply(times_dist, 2, which.min) + abundance <- data$abundance[idx,] + covariates <- data$covariates[idx,] + }) + + # restore column names and convert to tibbles + colnames(abundance) <- colnames(data$abundance) + abundance <- tibble::as_tibble(abundance) + covariates <- tibble::as_tibble(covariates) + + # make sure times column is properly filled + time_var <- resolve_covariate_variable(data, "timename") + if (is.null(time_var)) + { + # make sure timename variable is unique + new_col_names <- vctrs::vec_as_names(c(colnames(covariates), "time"), repair = "unique") + time_var <- tail(new_col_names, 1) + data$metadata$timename <- time_var + } + covariates[time_var] <- full_times + # assemble data to return + out <- list(abundance = abundance, + covariates = covariates, + metadata = data$metadata) + attr(out, "class") <- "matssdata" + + return(out) +} +#' @export +make_evenly_sampled <- make_equitimed diff --git a/man/is_equitimed.Rd b/man/is_equitimed.Rd index 86f25ea..141a8e6 100644 --- a/man/is_equitimed.Rd +++ b/man/is_equitimed.Rd @@ -5,7 +5,7 @@ \alias{is_evenly_sampled} \title{Check that the times of a dataset are evenly sampled} \usage{ -is_equitimed(data, period = NULL, tol = 1e-06, return_if_time_missing = TRUE) +is_equitimed(data, period = NULL, tol = 1e-06) } \arguments{ \item{data}{dataset to check} @@ -14,8 +14,6 @@ is_equitimed(data, period = NULL, tol = 1e-06, return_if_time_missing = TRUE) see if there is a known \code{period} set in the metadata, otherwise assumes 1)} \item{tol}{tolerance for the period} - -\item{return_if_time_missing}{default value to return if times are missing} } \value{ TRUE or FALSE, invisibly diff --git a/man/make_equitimed.Rd b/man/make_equitimed.Rd new file mode 100644 index 0000000..f07d32c --- /dev/null +++ b/man/make_equitimed.Rd @@ -0,0 +1,50 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utils-data-processing.R +\name{make_equitimed} +\alias{make_equitimed} +\alias{make_evenly_sampled} +\title{Insert rows if necessary so that time series are evenly sampled} +\usage{ +make_equitimed( + data, + period = NULL, + tol = 1e-06, + method = c("mean", "method", "closest"), + na.rm = TRUE +) +} +\arguments{ +\item{data}{dataset to modify} + +\item{period}{period to check the times against (if \code{NULL}, first check to +see if there is a known \code{period} set in the metadata, otherwise assumes 1)} + +\item{tol}{tolerance for the period} + +\item{method}{one of \code{c("mean", "method", "closest")} that determines how +the rows of the original data will get coerced into the output here.} + +\item{na.rm}{a logical value indicating whether \code{NA} + values should be stripped before the computation proceeds.} +} +\value{ +the dataset, with rows coerced according to the equitimed time +indices, and additional empty rows inserted if needed +} +\description{ +Insert rows if necessary so that time series are evenly sampled +} +\details{ +First, \code{get_full_times()} computes the sequence of time index values +at a regular sampling interval of period. These will be the final time +index values for the output. \emph{Some} set of rows of the original dataset +will map to each of these time indices. + +The \code{method} argument determines how these rows get coerced: +\describe{ +\item{mean}{the values in the rows are averaged together using \code{mean}} +\item{median}{the values in the rows are averaged together using \code{median}} +\item{closest}{the values in the row that is closest in time to the +desired time index are used.} +} +} diff --git a/tests/testthat/test-02-dataset-processing.R b/tests/testthat/test-02-dataset-processing.R index 288c4d4..3c0e83d 100644 --- a/tests/testthat/test-02-dataset-processing.R +++ b/tests/testthat/test-02-dataset-processing.R @@ -20,8 +20,9 @@ test_that("is_equitimed works", { path <- system.file("extdata", "subsampled", package = "MATSS", mustWork = TRUE) - dat <- get_mtquad_data() - expect_true(is_equitimed(dat)) + dat <- get_mtquad_data(path = file.path(path, "mapped-plant-quads-mt")) + m <- capture_messages(expect_true(is_equitimed(dat))) + expect_match(m, "No time period found. Assuming period = 1.", fixed = TRUE) }) From 6dbc113207135c42200efeab29e37b7ae0af05eb Mon Sep 17 00:00:00 2001 From: Hao Ye Date: Mon, 20 Apr 2020 23:48:11 -0400 Subject: [PATCH 3/8] fix make_equitimed and add tests --- R/utils-data-processing.R | 17 +++++----- tests/testthat/test-02-dataset-processing.R | 37 +++++++++++++++++++-- 2 files changed, 42 insertions(+), 12 deletions(-) diff --git a/R/utils-data-processing.R b/R/utils-data-processing.R index 033f662..e1f72d8 100644 --- a/R/utils-data-processing.R +++ b/R/utils-data-processing.R @@ -13,7 +13,7 @@ is_equitimed <- function(data, period = NULL, tol = 1e-06) { stopifnot(check_data_format(data)) - fulL_times <- get_full_times(data = data, period = period, tol = tol) + full_times <- get_full_times(data = data, period = period, tol = tol) times <- get_times_from_data(data) invisible(isTRUE(all.equal(times, full_times))) } @@ -25,15 +25,14 @@ is_evenly_sampled <- is_equitimed #' period to establish the sampling frequency #' #' @noRd -get_full_times <- function(data, period = NULL, tol = 1e-06, - convert_error_to_message = TRUE) +get_full_times <- function(data, period = NULL, tol = 1e-06) { times <- get_times_from_data(data) period <- get_period_from_data(data, period) full_times <- tryCatch(tidyr::full_seq(times, period, tol), error = function(e) { - message(e$message); + message(e$message) return(NULL) }) return(full_times) @@ -94,19 +93,19 @@ make_equitimed <- function(data, period = NULL, tol = 1e-06, full_times <- get_full_times(data = data, period = period, tol = tol) if (is.null(full_times)) { - stop("Unable to construct an equitimed time index.") + stop("Unable to construct an evenly spaced time index.") } times <- get_times_from_data(data) if (isTRUE(all.equal(times, full_times))) { - message("Dataset is already equitimed (evenly sampled in time).") + message("Dataset is already evenly sampled in time.") return(invisible(data)) } # generate empty matrices to hold final abundance and covariates abundance <- matrix(NA, nrow = length(full_times), ncol = NCOL(data$abundance)) - covariates <- data$covariates[0,] + covariates <- data$covariates[0, , drop = FALSE] # compute separation between times and full_times times_dist <- outer(times, full_times, function(a, b) {abs(b - a)}) @@ -119,7 +118,7 @@ make_equitimed <- function(data, period = NULL, tol = 1e-06, for (i in seq_along(full_times)) { abundance[i, ] <- colMeans(data$abundance[idx[, i], , drop = FALSE], na.rm = na.rm) - covariates[i, ] <- purrr::map_dfc(yy$covariates[idx[, i], , drop = FALSE], mean, na.rm = TRUE) + covariates[i, ] <- purrr::map_dfc(data$covariates[idx[, i], , drop = FALSE], mean, na.rm = TRUE) } }, median = { @@ -127,7 +126,7 @@ make_equitimed <- function(data, period = NULL, tol = 1e-06, for (i in seq_along(full_times)) { abundance[i, ] <- apply(data$abundance[idx[, i], , drop = FALSE], 2, median, na.rm = na.rm) - covariates[i, ] <- purrr::map_dfc(yy$covariates[idx[, i], , drop = FALSE], median, na.rm = TRUE) + covariates[i, ] <- purrr::map_dfc(data$covariates[idx[, i], , drop = FALSE], median, na.rm = TRUE) } }, closest = { diff --git a/tests/testthat/test-02-dataset-processing.R b/tests/testthat/test-02-dataset-processing.R index 3c0e83d..e43af7f 100644 --- a/tests/testthat/test-02-dataset-processing.R +++ b/tests/testthat/test-02-dataset-processing.R @@ -15,8 +15,8 @@ test_that("BBS data processing works", { }) test_that("is_equitimed works", { - m <- capture_error(expect_false(is_equitimed(dragons))) - expect_match(as.character(m), "Error: `x` is not a regular sequence.", fixed = TRUE) + m <- capture_messages(expect_false(is_equitimed(dragons))) + expect_match(m, "`x` is not a regular sequence.", fixed = TRUE) path <- system.file("extdata", "subsampled", package = "MATSS", mustWork = TRUE) @@ -25,8 +25,39 @@ test_that("is_equitimed works", { expect_match(m, "No time period found. Assuming period = 1.", fixed = TRUE) }) +test_that("make_equitimed works", { + expect_error(make_equitimed(dragons), + "Unable to construct an evenly spaced time index.", + fixed = TRUE) + path <- system.file("extdata", "subsampled", + package = "MATSS", mustWork = TRUE) + dat <- get_mtquad_data(path = file.path(path, "mapped-plant-quads-mt")) + expect_error(m <- capture_messages(out <- make_equitimed(dat)), NA) + expect_match(m, "No time period found. Assuming period = 1.", fixed = TRUE, all = FALSE) + expect_match(m, "Dataset is already evenly sampled in time.", fixed = TRUE, all = FALSE) + expect_equal(out, dat) + + dat <- list( + abundance = data.frame(a = c(1, 2, 3, 5, 6, 9, 10, 11, 12, 13), + b = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1)), + covariates = data.frame(time = c(0, 0.5, 1, 2, 2.5, 4, 4.5, 5, 5.5, 6)), + metadata = list(timename = "time", + period = 0.5, + is_community = FALSE, + citation = "NA") + ) + attr(dat, "class") <- "matssdata" + expect_error(out <- make_equitimed(dat), NA) + expect_true(is_equitimed(out)) + expect_true(check_data_format(out)) + expect_equal(dim(out$abundance), c(13, 2)) + expect_true(all(is.na(out$abundance[c(4, 7, 8), ]))) + expect_true(!any(is.na(out$abundance[-c(4, 7, 8), ]))) +}) -test_that("make_integer_times") \ No newline at end of file +test_that("make_integer_times", { + +}) \ No newline at end of file From c2ea2177fc6b9564cd7422405e30e41de8511823 Mon Sep 17 00:00:00 2001 From: Hao Ye Date: Tue, 21 Apr 2020 21:30:34 -0400 Subject: [PATCH 4/8] add `make_integer_times` --- DESCRIPTION | 5 +- NAMESPACE | 1 + R/create_MATSS_compendium.R | 3 +- R/utils-data-processing.R | 84 ++++++++++++++++++++- man/make_integer_times.Rd | 29 +++++++ tests/testthat/test-02-dataset-processing.R | 46 ++++++++--- 6 files changed, 151 insertions(+), 17 deletions(-) create mode 100644 man/make_integer_times.Rd diff --git a/DESCRIPTION b/DESCRIPTION index 9d7371f..27d6f4e 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -57,7 +57,7 @@ Imports: portalr (>= 0.3.2), purrr, rdataretriever (>= 2.0.0), - reticulate (>= 1.14.9002), + reticulate (>= 1.15), rlang, RSQLite, rstudioapi, @@ -77,9 +77,6 @@ Suggests: visNetwork VignetteBuilder: knitr -Remotes: - rstudio/reticulate, - weecology/portalr Encoding: UTF-8 LazyData: true Roxygen: list(markdown = TRUE) diff --git a/NAMESPACE b/NAMESPACE index a9e4d09..f87cd03 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -48,6 +48,7 @@ export(is_equitimed) export(is_evenly_sampled) export(make_equitimed) export(make_evenly_sampled) +export(make_integer_times) export(normalize_effort) export(normalize_obs) export(normalize_times) diff --git a/R/create_MATSS_compendium.R b/R/create_MATSS_compendium.R index 7b31cbb..ee0a7d6 100644 --- a/R/create_MATSS_compendium.R +++ b/R/create_MATSS_compendium.R @@ -56,7 +56,8 @@ create_MATSS_compendium <- function(path, matss_ref_key = matss_citation$key), package = "MATSS") if (DEPLOY) - preamble_text <- ":rotating_light: **THIS IS AN AUTO-GENERATED EXAMPLE COMPENDIUM** :rotating_light:\n" + preamble_text <- c(":rotating_light: **THIS IS AN AUTO-GENERATED EXAMPLE COMPENDIUM** :rotating_light:\n", + "*last updated: ", Sys.Date(), "*\n") else preamble_text <- "" diff --git a/R/utils-data-processing.R b/R/utils-data-processing.R index e1f72d8..d5b270d 100644 --- a/R/utils-data-processing.R +++ b/R/utils-data-processing.R @@ -28,6 +28,12 @@ is_evenly_sampled <- is_equitimed get_full_times <- function(data, period = NULL, tol = 1e-06) { times <- get_times_from_data(data) + if (is.null(times)) + { + stop("Dataset does not appear to have a times variable.\n", + "Check", usethis::ui_code("covariates"), " and ", + usethis::ui_code("metadata$timename"), ".\n") + } period <- get_period_from_data(data, period) full_times <- tryCatch(tidyr::full_seq(times, period, tol), @@ -58,6 +64,7 @@ get_period_from_data <- function(data, period = NULL) } return(period) } + #' @title Insert rows if necessary so that time series are evenly sampled #' @aliases make_evenly_sampled #' @@ -145,7 +152,8 @@ make_equitimed <- function(data, period = NULL, tol = 1e-06, if (is.null(time_var)) { # make sure timename variable is unique - new_col_names <- vctrs::vec_as_names(c(colnames(covariates), "time"), repair = "unique") + new_col_names <- vctrs::vec_as_names(c(colnames(covariates), "time"), + repair = "unique", quiet = TRUE) time_var <- tail(new_col_names, 1) data$metadata$timename <- time_var } @@ -163,4 +171,78 @@ make_equitimed <- function(data, period = NULL, tol = 1e-06, #' @export make_evenly_sampled <- make_equitimed +#' @title Add a time variable with integer values for evenly sampled data +#' +#' @param data dataset to modify +#' @inheritParams is_equitimed +#' @inheritParams base::mean +#' +#' @return the dataset, with integer times +#' +#' @details First, check if the data are evenly sampled in time. If not, we +#' exit early. Next, if the times are already integer or Date, we don't do +#' anything. If the times are numeric, but roundable to integer, we round. +#' Otherwise, we add a new variable to `covariates` from 1:n and designate +#' this variable as the `timename`. +#' +#' @export +make_integer_times <- function(data, period = NULL, tol = 1e-06, + confirm = interactive()) +{ + is.wholenumber <- function(x, tol = .Machine$double.eps ^ 0.5) + { + abs(x - round(x)) < tol + } + + # check for existence of times + times <- get_times_from_data(data) + + # do checks based on existing times + if (!is.null(times)) + { + # check for equitimed + if (!is_equitimed(data = data, period = period, tol = tol)) + { + stop(c("Dataset is not evenly sampled in time.\n", + "Perhaps you want to call ", usethis::ui_code("make_equitimed()"), " first.\n")) + } + + # check for integer times + if (is.integer(times)) + { + message("Dataset appears evenly sampled in time with integer times already.") + return(invisible(data)) + } else if (inherits(times, "Date")) + { + message("Dataset appears evenly sampled in time with `Date` formatted times already.") + return(invisible(data)) + } + if (all(is.wholenumber(times))) + { + message("Dataset appears evenly sampled in time with (close to) integer times already.") + message("Rounding times to integer and replacing them...") + time_var <- data$metadata$timename + data$covariates[time_var] <- as.integer(round(times)) + return(invisible(data)) + } + } + + # add time + times <- seq_len(NROW(data$abundance)) + if (is.null(data$covariates)) # create covariates + { + time_var <- "time" + data$covariates <- tibble::tibble(time_var = times) + } else { + new_col_names <- vctrs::vec_as_names(c(colnames(data$covariates), "time"), + repair = "unique", quiet = TRUE) + time_var <- tail(new_col_names, 1) + data$covariates[time_var] <- times + } + data$metadata$timename <- time_var + message("Integer times created in variable ", usethis::ui_code(time_var), ".") + return(invisible(data)) +} + + diff --git a/man/make_integer_times.Rd b/man/make_integer_times.Rd new file mode 100644 index 0000000..f74dcdc --- /dev/null +++ b/man/make_integer_times.Rd @@ -0,0 +1,29 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utils-data-processing.R +\name{make_integer_times} +\alias{make_integer_times} +\title{Add a time variable with integer values for evenly sampled data} +\usage{ +make_integer_times(data, period = NULL, tol = 1e-06, confirm = interactive()) +} +\arguments{ +\item{data}{dataset to modify} + +\item{period}{period to check the times against (if \code{NULL}, first check to +see if there is a known \code{period} set in the metadata, otherwise assumes 1)} + +\item{tol}{tolerance for the period} +} +\value{ +the dataset, with integer times +} +\description{ +Add a time variable with integer values for evenly sampled data +} +\details{ +First, check if the data are evenly sampled in time. If not, we +exit early. Next, if the times are already integer or Date, we don't do +anything. If the times are numeric, but roundable to integer, we round. +Otherwise, we add a new variable to \code{covariates} from 1:n and designate +this variable as the \code{timename}. +} diff --git a/tests/testthat/test-02-dataset-processing.R b/tests/testthat/test-02-dataset-processing.R index e43af7f..cf03f65 100644 --- a/tests/testthat/test-02-dataset-processing.R +++ b/tests/testthat/test-02-dataset-processing.R @@ -14,6 +14,26 @@ test_that("BBS data processing works", { expect_equal(out$abundance, c(2, 2, 4, 2)) }) +sample_dat <- list( + abundance = data.frame(a = c(1, 2, 3, 5, 6, 9, 10, 11, 12, 13), + b = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1)), + covariates = data.frame(time = c(0, 0.5, 1, 2, 2.5, 4, 4.5, 5, 5.5, 6)), + metadata = list(timename = "time", + period = 0.5, + is_community = FALSE, + citation = "NA") +) +attr(sample_dat, "class") <- "matssdata" + +test_that("get_full_times works", { + expect_error(full_times <- get_full_times(sample_dat), NA) + expect_equal(full_times, seq(0.0, 6.0, by = 0.5)) + + error_dat <- sample_dat + error_dat$metadata$timename <- NULL + expect_error(get_full_times(error_dat)) +}) + test_that("is_equitimed works", { m <- capture_messages(expect_false(is_equitimed(dragons))) expect_match(m, "`x` is not a regular sequence.", fixed = TRUE) @@ -38,17 +58,7 @@ test_that("make_equitimed works", { expect_match(m, "Dataset is already evenly sampled in time.", fixed = TRUE, all = FALSE) expect_equal(out, dat) - dat <- list( - abundance = data.frame(a = c(1, 2, 3, 5, 6, 9, 10, 11, 12, 13), - b = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1)), - covariates = data.frame(time = c(0, 0.5, 1, 2, 2.5, 4, 4.5, 5, 5.5, 6)), - metadata = list(timename = "time", - period = 0.5, - is_community = FALSE, - citation = "NA") - ) - attr(dat, "class") <- "matssdata" - expect_error(out <- make_equitimed(dat), NA) + expect_error(out <- make_equitimed(sample_dat), NA) expect_true(is_equitimed(out)) expect_true(check_data_format(out)) expect_equal(dim(out$abundance), c(13, 2)) @@ -59,5 +69,19 @@ test_that("make_equitimed works", { test_that("make_integer_times", { + expect_error(out <- make_integer_times(sample_dat), + "Dataset is not evenly sampled in time.") + expect_error(out <- make_integer_times(make_equitimed(sample_dat)), NA) + expect_true(check_data_format(out)) + expect_equal(dim(out$covariates), c(13, 2)) + expect_equal(out$covariates[[2]], 1:13) + + path <- system.file("extdata", "subsampled", + package = "MATSS", mustWork = TRUE) + dat <- get_mtquad_data(path = file.path(path, "mapped-plant-quads-mt")) + m <- capture_messages(out <- make_integer_times(dat)) + expect_match(m, "No time period found. Assuming period = 1.", fixed = TRUE, all = FALSE) + expect_match(m, "Dataset appears evenly sampled in time with integer times already.", fixed = TRUE, all = FALSE) + expect_equal(out, dat) }) \ No newline at end of file From b2538f43bc62b797fab91afb4d00aa0426ab5236 Mon Sep 17 00:00:00 2001 From: Hao Ye Date: Tue, 21 Apr 2020 22:19:27 -0400 Subject: [PATCH 5/8] add function to check for integer times --- NAMESPACE | 1 + R/utils-data-processing.R | 154 +++++++++++++++++++++++--------------- man/has_integer_times.Rd | 22 ++++++ man/make_integer_times.Rd | 16 ++-- 4 files changed, 120 insertions(+), 73 deletions(-) create mode 100644 man/has_integer_times.Rd diff --git a/NAMESPACE b/NAMESPACE index f87cd03..9e40c80 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -40,6 +40,7 @@ export(get_portal_rodents) export(get_sdl_data) export(get_sgs_data) export(get_times_from_data) +export(has_integer_times) export(import_retriever_data) export(install_retriever_data) export(interpolate_obs) diff --git a/R/utils-data-processing.R b/R/utils-data-processing.R index d5b270d..918b847 100644 --- a/R/utils-data-processing.R +++ b/R/utils-data-processing.R @@ -21,50 +21,6 @@ is_equitimed <- function(data, period = NULL, tol = 1e-06) #' @export is_evenly_sampled <- is_equitimed -#' get the complete time index, filling in gaps where necessary, and using the -#' period to establish the sampling frequency -#' -#' @noRd -get_full_times <- function(data, period = NULL, tol = 1e-06) -{ - times <- get_times_from_data(data) - if (is.null(times)) - { - stop("Dataset does not appear to have a times variable.\n", - "Check", usethis::ui_code("covariates"), " and ", - usethis::ui_code("metadata$timename"), ".\n") - } - period <- get_period_from_data(data, period) - - full_times <- tryCatch(tidyr::full_seq(times, period, tol), - error = function(e) { - message(e$message) - return(NULL) - }) - return(full_times) -} - -#' extract the period, given the value from the metadata field, and a value -#' specified by the user. The flowchart is: -#' (1) if user has supplied non-null `period`, use that -#' (2) if metadata period is non-null, use that -#' (3) use a default value of 1 and print a message -#' -#' @noRd -get_period_from_data <- function(data, period = NULL) -{ - if (is.null(period)) - { - period <- data$metadata$period - if (is.null(period)) - { - message("No time period found. Assuming period = 1.") - period <- 1 - } - } - return(period) -} - #' @title Insert rows if necessary so that time series are evenly sampled #' @aliases make_evenly_sampled #' @@ -171,11 +127,45 @@ make_equitimed <- function(data, period = NULL, tol = 1e-06, #' @export make_evenly_sampled <- make_equitimed +#' @title Check if a dataset has integer times +#' +#' @param data dataset to check +#' +#' @return TRUE or FALSE +#' +#' @details If the times are already integer or Date, true. Otherwise FALSE, +#' with a message if times are missing, or if times could potentially be +#' rounded. +#' +#' @export +has_integer_times <- function(data) +{ + # check for existence of times + times <- get_times_from_data(data) + if (is.null(times)) + { + message("Dataset is missing times.") + return(FALSE) + } + + # check for integer times + if (is.integer(times) || inherits(times, "Date")) + { + return(TRUE) + } else if (all(is.wholenumber(times))) { + message("Dataset has close to integer times, but they need to be rounded.\n", + "Perhaps you want to call ", usethis::ui_code("make_integer_times()"), ".\n") + return(FALSE) + } + + # otherwise + return(FALSE) +} + #' @title Add a time variable with integer values for evenly sampled data #' #' @param data dataset to modify #' @inheritParams is_equitimed -#' @inheritParams base::mean #' #' @return the dataset, with integer times #' @@ -186,20 +176,13 @@ make_evenly_sampled <- make_equitimed #' this variable as the `timename`. #' #' @export -make_integer_times <- function(data, period = NULL, tol = 1e-06, - confirm = interactive()) +make_integer_times <- function(data, period = NULL, tol = 1e-06) { - is.wholenumber <- function(x, tol = .Machine$double.eps ^ 0.5) - { - abs(x - round(x)) < tol - } - - # check for existence of times times <- get_times_from_data(data) # do checks based on existing times if (!is.null(times)) - { + { # check for equitimed if (!is_equitimed(data = data, period = period, tol = tol)) { @@ -210,16 +193,13 @@ make_integer_times <- function(data, period = NULL, tol = 1e-06, # check for integer times if (is.integer(times)) { - message("Dataset appears evenly sampled in time with integer times already.") + message("Dataset is evenly sampled with integer times already.") return(invisible(data)) - } else if (inherits(times, "Date")) - { - message("Dataset appears evenly sampled in time with `Date` formatted times already.") + } else if (inherits(times, "Date")) { + message("Dataset is evenly sampled with `Date` formatted times already.") return(invisible(data)) - } - if (all(is.wholenumber(times))) - { - message("Dataset appears evenly sampled in time with (close to) integer times already.") + } else if (all(is.wholenumber(times))) { + message("Dataset is evenly sampled with (close to) integer times already.") message("Rounding times to integer and replacing them...") time_var <- data$metadata$timename data$covariates[time_var] <- as.integer(round(times)) @@ -245,4 +225,54 @@ make_integer_times <- function(data, period = NULL, tol = 1e-06, } +#' get the complete time index, filling in gaps where necessary, and using the +#' period to establish the sampling frequency +#' +#' @noRd +get_full_times <- function(data, period = NULL, tol = 1e-06) +{ + times <- get_times_from_data(data) + if (is.null(times)) + { + stop("Dataset does not appear to have a times variable.\n", + "Check", usethis::ui_code("covariates"), " and ", + usethis::ui_code("metadata$timename"), ".\n") + } + period <- get_period_from_data(data, period) + + full_times <- tryCatch(tidyr::full_seq(times, period, tol), + error = function(e) { + message(e$message) + return(NULL) + }) + return(full_times) +} + +#' extract the period, given the value from the metadata field, and a value +#' specified by the user. The flowchart is: +#' (1) if user has supplied non-null `period`, use that +#' (2) if metadata period is non-null, use that +#' (3) use a default value of 1 and print a message +#' +#' @noRd +get_period_from_data <- function(data, period = NULL) +{ + if (is.null(period)) + { + period <- data$metadata$period + if (is.null(period)) + { + message("No time period found. Assuming period = 1.") + period <- 1 + } + } + return(period) +} + +#' @noRd +is.wholenumber <- function(x, tol = .Machine$double.eps ^ 0.5) +{ + abs(x - round(x)) < tol +} + diff --git a/man/has_integer_times.Rd b/man/has_integer_times.Rd new file mode 100644 index 0000000..ad400a7 --- /dev/null +++ b/man/has_integer_times.Rd @@ -0,0 +1,22 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utils-data-processing.R +\name{has_integer_times} +\alias{has_integer_times} +\title{Check if a dataset has integer times} +\usage{ +has_integer_times(data) +} +\arguments{ +\item{data}{dataset to check} +} +\value{ +TRUE or FALSE +} +\description{ +Check if a dataset has integer times +} +\details{ +If the times are already integer or Date, true. Otherwise FALSE, +with a message if times are missing, or if times could potentially be +rounded. +} diff --git a/man/make_integer_times.Rd b/man/make_integer_times.Rd index f74dcdc..48399f9 100644 --- a/man/make_integer_times.Rd +++ b/man/make_integer_times.Rd @@ -4,15 +4,10 @@ \alias{make_integer_times} \title{Add a time variable with integer values for evenly sampled data} \usage{ -make_integer_times(data, period = NULL, tol = 1e-06, confirm = interactive()) +make_integer_times(data) } \arguments{ \item{data}{dataset to modify} - -\item{period}{period to check the times against (if \code{NULL}, first check to -see if there is a known \code{period} set in the metadata, otherwise assumes 1)} - -\item{tol}{tolerance for the period} } \value{ the dataset, with integer times @@ -21,9 +16,8 @@ the dataset, with integer times Add a time variable with integer values for evenly sampled data } \details{ -First, check if the data are evenly sampled in time. If not, we -exit early. Next, if the times are already integer or Date, we don't do -anything. If the times are numeric, but roundable to integer, we round. -Otherwise, we add a new variable to \code{covariates} from 1:n and designate -this variable as the \code{timename}. +If the times are already integer or Date, we don't do anything. If +the times are numeric, but roundable to integer, we round. Otherwise, we +add a new variable to \code{covariates} from 1:n and designate this variable as +the \code{timename}. } From 6cee72101779847c56c7337ba667fe53fa46be97 Mon Sep 17 00:00:00 2001 From: Hao Ye Date: Wed, 22 Apr 2020 12:44:32 -0400 Subject: [PATCH 6/8] Add check for missing values in abundance and covariates --- NAMESPACE | 2 + R/utils-data-processing.R | 67 ++++++++++++++++++--- man/has_integer_times.Rd | 2 +- man/has_missing_samples.Rd | 30 +++++++++ man/is_equitimed.Rd | 2 +- man/make_integer_times.Rd | 16 +++-- tests/testthat/setup.R | 5 ++ tests/testthat/teardown.R | 1 + tests/testthat/test-02-dataset-processing.R | 40 ++++++++++-- 9 files changed, 146 insertions(+), 19 deletions(-) create mode 100644 man/has_missing_samples.Rd create mode 100644 tests/testthat/setup.R create mode 100644 tests/testthat/teardown.R diff --git a/NAMESPACE b/NAMESPACE index 9e40c80..f1171b0 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -41,12 +41,14 @@ export(get_sdl_data) export(get_sgs_data) export(get_times_from_data) export(has_integer_times) +export(has_missing_samples) export(import_retriever_data) export(install_retriever_data) export(interpolate_obs) export(invoke) export(is_equitimed) export(is_evenly_sampled) +export(is_fully_sampled) export(make_equitimed) export(make_evenly_sampled) export(make_integer_times) diff --git a/R/utils-data-processing.R b/R/utils-data-processing.R index 918b847..8a79bfa 100644 --- a/R/utils-data-processing.R +++ b/R/utils-data-processing.R @@ -6,16 +6,16 @@ #' see if there is a known `period` set in the metadata, otherwise assumes 1) #' @param tol tolerance for the period #' -#' @return TRUE or FALSE, invisibly +#' @return `TRUE` or `FALSE` #' #' @export is_equitimed <- function(data, period = NULL, tol = 1e-06) { stopifnot(check_data_format(data)) - full_times <- get_full_times(data = data, period = period, tol = tol) + full_times <- get_full_times(data, period, tol) times <- get_times_from_data(data) - invisible(isTRUE(all.equal(times, full_times))) + isTRUE(all.equal(times, full_times)) } #' @export @@ -53,7 +53,7 @@ make_equitimed <- function(data, period = NULL, tol = 1e-06, { stopifnot(check_data_format(data)) - full_times <- get_full_times(data = data, period = period, tol = tol) + full_times <- get_full_times(data, period, tol) if (is.null(full_times)) { stop("Unable to construct an evenly spaced time index.") @@ -131,7 +131,7 @@ make_evenly_sampled <- make_equitimed #' #' @param data dataset to check #' -#' @return TRUE or FALSE +#' @return `TRUE` or `FALSE` #' #' @details If the times are already integer or Date, true. Otherwise FALSE, #' with a message if times are missing, or if times could potentially be @@ -184,7 +184,7 @@ make_integer_times <- function(data, period = NULL, tol = 1e-06) if (!is.null(times)) { # check for equitimed - if (!is_equitimed(data = data, period = period, tol = tol)) + if (!is_equitimed(data, period, tol)) { stop(c("Dataset is not evenly sampled in time.\n", "Perhaps you want to call ", usethis::ui_code("make_equitimed()"), " first.\n")) @@ -224,6 +224,55 @@ make_integer_times <- function(data, period = NULL, tol = 1e-06) return(invisible(data)) } +#' Check for missing samples +#' @aliases is_fully_sampled +#' +#' @description Some analyses may require evenly sampled data without missing +#' values. `has_missing_samples` checks that the dataset is equitimed, and +#' then for missing values within `abundance` (and optionally, `covariates`) +#' +#' `is_full_sampled()` does the same check, but returns `TRUE` if there are +#' NO missing samples. +#' +#' @inheritParams is_equitimed +#' @param check_covariates `TRUE` or `FALSE` (whether to check covariates, too) +#' +#' @return `TRUE` or `FALSE` +#' +#' @export +has_missing_samples <- function(data, period = NULL, tol = 1e-06, + check_covariates = FALSE) +{ + if (!is_equitimed(data, period, tol)) + { + message(c("Dataset is not evenly sampled in time.\n", + "Perhaps you want to call ", usethis::ui_code("make_equitimed()"), " first.\n")) + return(TRUE) + } + + # check abundance + if (!all(is.finite(data$abundance))) + { + message("Dataset has NA (or Inf) values in ", usethis::ui_code("abundance"), ".") + return(TRUE) + } + + # check covariates + if (check_covariates && !all(is.finite(data$covariates))) + { + message("Dataset has NA (or Inf) values in ", usethis::ui_code("covariates"), ".") + return(TRUE) + } + + return(FALSE) +} + +#' @export +is_fully_sampled <- function(data, period = NULL, tol = 1e-06, + check_covariates = FALSE) +{ + return(!has_missing_samples(data, period, tol, check_covariates)) +} #' get the complete time index, filling in gaps where necessary, and using the #' period to establish the sampling frequency @@ -275,4 +324,8 @@ is.wholenumber <- function(x, tol = .Machine$double.eps ^ 0.5) abs(x - round(x)) < tol } - +#' @noRd +is.finite.data.frame <- function(df) +{ + do.call(cbind, lapply(df, is.finite)) +} diff --git a/man/has_integer_times.Rd b/man/has_integer_times.Rd index ad400a7..c324189 100644 --- a/man/has_integer_times.Rd +++ b/man/has_integer_times.Rd @@ -10,7 +10,7 @@ has_integer_times(data) \item{data}{dataset to check} } \value{ -TRUE or FALSE +\code{TRUE} or \code{FALSE} } \description{ Check if a dataset has integer times diff --git a/man/has_missing_samples.Rd b/man/has_missing_samples.Rd new file mode 100644 index 0000000..87d6f9d --- /dev/null +++ b/man/has_missing_samples.Rd @@ -0,0 +1,30 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utils-data-processing.R +\name{has_missing_samples} +\alias{has_missing_samples} +\alias{is_fully_sampled} +\title{Check for missing samples} +\usage{ +has_missing_samples(data, period = NULL, tol = 1e-06, check_covariates = FALSE) +} +\arguments{ +\item{data}{dataset to check} + +\item{period}{period to check the times against (if \code{NULL}, first check to +see if there is a known \code{period} set in the metadata, otherwise assumes 1)} + +\item{tol}{tolerance for the period} + +\item{check_covariates}{\code{TRUE} or \code{FALSE} (whether to check covariates, too)} +} +\value{ +\code{TRUE} or \code{FALSE} +} +\description{ +Some analyses may require evenly sampled data without missing +values. \code{has_missing_samples} checks that the dataset is equitimed, and +then for missing values within \code{abundance} (and optionally, \code{covariates}) + +\code{is_full_sampled()} does the same check, but returns \code{TRUE} if there are +NO missing samples. +} diff --git a/man/is_equitimed.Rd b/man/is_equitimed.Rd index 141a8e6..9afd089 100644 --- a/man/is_equitimed.Rd +++ b/man/is_equitimed.Rd @@ -16,7 +16,7 @@ see if there is a known \code{period} set in the metadata, otherwise assumes 1)} \item{tol}{tolerance for the period} } \value{ -TRUE or FALSE, invisibly +\code{TRUE} or \code{FALSE} } \description{ Check that the times of a dataset are evenly sampled diff --git a/man/make_integer_times.Rd b/man/make_integer_times.Rd index 48399f9..c4f5b66 100644 --- a/man/make_integer_times.Rd +++ b/man/make_integer_times.Rd @@ -4,10 +4,15 @@ \alias{make_integer_times} \title{Add a time variable with integer values for evenly sampled data} \usage{ -make_integer_times(data) +make_integer_times(data, period = NULL, tol = 1e-06) } \arguments{ \item{data}{dataset to modify} + +\item{period}{period to check the times against (if \code{NULL}, first check to +see if there is a known \code{period} set in the metadata, otherwise assumes 1)} + +\item{tol}{tolerance for the period} } \value{ the dataset, with integer times @@ -16,8 +21,9 @@ the dataset, with integer times Add a time variable with integer values for evenly sampled data } \details{ -If the times are already integer or Date, we don't do anything. If -the times are numeric, but roundable to integer, we round. Otherwise, we -add a new variable to \code{covariates} from 1:n and designate this variable as -the \code{timename}. +First, check if the data are evenly sampled in time. If not, we +exit early. Next, if the times are already integer or Date, we don't do +anything. If the times are numeric, but roundable to integer, we round. +Otherwise, we add a new variable to \code{covariates} from 1:n and designate +this variable as the \code{timename}. } diff --git a/tests/testthat/setup.R b/tests/testthat/setup.R new file mode 100644 index 0000000..f0f90c7 --- /dev/null +++ b/tests/testthat/setup.R @@ -0,0 +1,5 @@ +pre_test_options <- options( + ## make ui_*() output easier to test against + ## just say no to ANSI escape codes + "crayon.enabled" = FALSE +) diff --git a/tests/testthat/teardown.R b/tests/testthat/teardown.R new file mode 100644 index 0000000..6c7ef9f --- /dev/null +++ b/tests/testthat/teardown.R @@ -0,0 +1 @@ +options(pre_test_options) diff --git a/tests/testthat/test-02-dataset-processing.R b/tests/testthat/test-02-dataset-processing.R index cf03f65..3b4ae31 100644 --- a/tests/testthat/test-02-dataset-processing.R +++ b/tests/testthat/test-02-dataset-processing.R @@ -66,10 +66,18 @@ test_that("make_equitimed works", { expect_true(!any(is.na(out$abundance[-c(4, 7, 8), ]))) }) +test_that("has_integer_times works", { + expect_true(has_integer_times(dragons)) + expect_false(has_integer_times(sample_dat)) + + path <- system.file("extdata", "subsampled", + package = "MATSS", mustWork = TRUE) + dat <- get_mtquad_data(path = file.path(path, "mapped-plant-quads-mt")) + expect_true(has_integer_times(dat)) +}) - -test_that("make_integer_times", { - expect_error(out <- make_integer_times(sample_dat), +test_that("make_integer_times works", { + expect_error(make_integer_times(sample_dat), "Dataset is not evenly sampled in time.") expect_error(out <- make_integer_times(make_equitimed(sample_dat)), NA) @@ -82,6 +90,28 @@ test_that("make_integer_times", { dat <- get_mtquad_data(path = file.path(path, "mapped-plant-quads-mt")) m <- capture_messages(out <- make_integer_times(dat)) expect_match(m, "No time period found. Assuming period = 1.", fixed = TRUE, all = FALSE) - expect_match(m, "Dataset appears evenly sampled in time with integer times already.", fixed = TRUE, all = FALSE) + expect_match(m, "Dataset is evenly sampled with integer times already.", fixed = TRUE, all = FALSE) expect_equal(out, dat) -}) \ No newline at end of file +}) + +test_that("is_fully_sampled works", { + m <- capture_messages(expect_false(is_fully_sampled(sample_dat))) + expect_match(m, "Dataset is not evenly sampled in time.") + + sample_dat$covariates["temp"] <- rnorm(NROW(sample_dat$covariates)) + + out <- make_equitimed(sample_dat) + m <- capture_messages(expect_false(is_fully_sampled(out))) + expect_match(m, "Dataset has NA (or Inf) values in `abundance`", fixed = TRUE, all = FALSE) + idx <- is.na(out$abundance) + out$abundance[idx] <- Inf + m <- capture_messages(expect_false(is_fully_sampled(out))) + expect_match(m, "Dataset has NA (or Inf) values in `abundance`", fixed = TRUE, all = FALSE) + out$abundance[idx] <- -999 + expect_true(is_fully_sampled(out)) + + m <- capture_messages(expect_false(is_fully_sampled(out, check_covariates = TRUE))) + expect_match(m, "Dataset has NA (or Inf) values in `covariates`", fixed = TRUE, all = FALSE) + out$covariates[is.na(out$covariates)] <- -999 + expect_true(is_fully_sampled(out, check_covariates = TRUE)) +}) From 2f93525e0978e6be2143858a52563bda2a3a4d35 Mon Sep 17 00:00:00 2001 From: Hao Ye Date: Wed, 22 Apr 2020 19:47:19 -0400 Subject: [PATCH 7/8] add function to interpolate missing vals --- NAMESPACE | 1 + R/utils-data-processing.R | 61 +++++++++++++++++---- man/interpolate_missing_samples.Rd | 34 ++++++++++++ tests/testthat/test-02-dataset-processing.R | 19 +++++-- 4 files changed, 100 insertions(+), 15 deletions(-) create mode 100644 man/interpolate_missing_samples.Rd diff --git a/NAMESPACE b/NAMESPACE index f1171b0..37dbf35 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -44,6 +44,7 @@ export(has_integer_times) export(has_missing_samples) export(import_retriever_data) export(install_retriever_data) +export(interpolate_missing_samples) export(interpolate_obs) export(invoke) export(is_equitimed) diff --git a/R/utils-data-processing.R b/R/utils-data-processing.R index 8a79bfa..d73128e 100644 --- a/R/utils-data-processing.R +++ b/R/utils-data-processing.R @@ -251,16 +251,16 @@ has_missing_samples <- function(data, period = NULL, tol = 1e-06, } # check abundance - if (!all(is.finite(data$abundance))) + if (any(is.na(data$abundance))) { - message("Dataset has NA (or Inf) values in ", usethis::ui_code("abundance"), ".") + message("Dataset has NA values in ", usethis::ui_code("abundance"), ".") return(TRUE) } # check covariates - if (check_covariates && !all(is.finite(data$covariates))) + if (check_covariates && any(is.na(data$covariates))) { - message("Dataset has NA (or Inf) values in ", usethis::ui_code("covariates"), ".") + message("Dataset has NA values in ", usethis::ui_code("covariates"), ".") return(TRUE) } @@ -274,6 +274,53 @@ is_fully_sampled <- function(data, period = NULL, tol = 1e-06, return(!has_missing_samples(data, period, tol, check_covariates)) } +#' @title Impute missing samples using linear interpolation +#' +#' @param data dataset to modify +#' @inheritParams is_equitimed +#' @param interpolate_covariates `TRUE` or `FALSE` (whether to do covariates, too) +#' +#' @return the dataset, with interpolated samples +#' +#' @details First, check if the data are evenly sampled in time. If not, we +#' exit early. Next, apply `forecast::na.interp()` to each variable that has +#' non-finite values. +#' +#' @export +interpolate_missing_samples <- function(data, period = NULL, tol = 1e-06, + interpolate_covariates = FALSE) +{ + interpolate_tbl <- function(df) + { + finite_cols_idx <- apply(is.na(df), 2, all) + # replace all non finite values with NA + + for (j in which(!finite_cols_idx)) + { + x <- df[[j]] + x[!is.finite(x)] <- NA + interpolated <- forecast::na.interp(x) + df[[j]] <- as.numeric(interpolated) + } + return(df) + } + + if (!is_equitimed(data, period, tol)) + { + stop(c("Dataset is not evenly sampled in time.\n", + "Perhaps you want to call ", usethis::ui_code("make_equitimed()"), " first.\n")) + } + + data$abundance <- interpolate_tbl(data$abundance) + + if (interpolate_covariates) + { + data$covariates <- interpolate_tbl(data$covariates) + } + + return(invisible(data)) +} + #' get the complete time index, filling in gaps where necessary, and using the #' period to establish the sampling frequency #' @@ -323,9 +370,3 @@ is.wholenumber <- function(x, tol = .Machine$double.eps ^ 0.5) { abs(x - round(x)) < tol } - -#' @noRd -is.finite.data.frame <- function(df) -{ - do.call(cbind, lapply(df, is.finite)) -} diff --git a/man/interpolate_missing_samples.Rd b/man/interpolate_missing_samples.Rd new file mode 100644 index 0000000..5ba58e7 --- /dev/null +++ b/man/interpolate_missing_samples.Rd @@ -0,0 +1,34 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utils-data-processing.R +\name{interpolate_missing_samples} +\alias{interpolate_missing_samples} +\title{Impute missing samples using linear interpolation} +\usage{ +interpolate_missing_samples( + data, + period = NULL, + tol = 1e-06, + interpolate_covariates = FALSE +) +} +\arguments{ +\item{data}{dataset to modify} + +\item{period}{period to check the times against (if \code{NULL}, first check to +see if there is a known \code{period} set in the metadata, otherwise assumes 1)} + +\item{tol}{tolerance for the period} + +\item{interpolate_covariates}{\code{TRUE} or \code{FALSE} (whether to do covariates, too)} +} +\value{ +the dataset, with interpolated samples +} +\description{ +Impute missing samples using linear interpolation +} +\details{ +First, check if the data are evenly sampled in time. If not, we +exit early. Next, apply \code{forecast::na.interp()} to each variable that has +non-finite values. +} diff --git a/tests/testthat/test-02-dataset-processing.R b/tests/testthat/test-02-dataset-processing.R index 3b4ae31..1418c94 100644 --- a/tests/testthat/test-02-dataset-processing.R +++ b/tests/testthat/test-02-dataset-processing.R @@ -102,16 +102,25 @@ test_that("is_fully_sampled works", { out <- make_equitimed(sample_dat) m <- capture_messages(expect_false(is_fully_sampled(out))) - expect_match(m, "Dataset has NA (or Inf) values in `abundance`", fixed = TRUE, all = FALSE) + expect_match(m, "Dataset has NA values in `abundance`", fixed = TRUE, all = FALSE) idx <- is.na(out$abundance) - out$abundance[idx] <- Inf - m <- capture_messages(expect_false(is_fully_sampled(out))) - expect_match(m, "Dataset has NA (or Inf) values in `abundance`", fixed = TRUE, all = FALSE) out$abundance[idx] <- -999 expect_true(is_fully_sampled(out)) m <- capture_messages(expect_false(is_fully_sampled(out, check_covariates = TRUE))) - expect_match(m, "Dataset has NA (or Inf) values in `covariates`", fixed = TRUE, all = FALSE) + expect_match(m, "Dataset has NA values in `covariates`", fixed = TRUE, all = FALSE) out$covariates[is.na(out$covariates)] <- -999 expect_true(is_fully_sampled(out, check_covariates = TRUE)) }) + +test_that("interpolate_missing_samples works", { + sample_dat$covariates["temp"] <- rnorm(NROW(sample_dat$covariates)) + + dat <- make_equitimed(sample_dat) + interpolated_dat <- interpolate_missing_samples(dat) + expect_true(is_fully_sampled(interpolated_dat)) + expect_false(is_fully_sampled(interpolated_dat, check_covariates = TRUE)) + expect_true(is_fully_sampled(interpolate_missing_samples(dat, interpolate_covariates = TRUE), + check_covariates = TRUE)) +}) + \ No newline at end of file From d3123003351d1cf922b08707cf124095d44a024e Mon Sep 17 00:00:00 2001 From: Hao Ye Date: Wed, 22 Apr 2020 23:16:16 -0400 Subject: [PATCH 8/8] preserve class when interpolating --- R/utils-data-processing.R | 1 + 1 file changed, 1 insertion(+) diff --git a/R/utils-data-processing.R b/R/utils-data-processing.R index d73128e..43466e7 100644 --- a/R/utils-data-processing.R +++ b/R/utils-data-processing.R @@ -301,6 +301,7 @@ interpolate_missing_samples <- function(data, period = NULL, tol = 1e-06, x[!is.finite(x)] <- NA interpolated <- forecast::na.interp(x) df[[j]] <- as.numeric(interpolated) + class(df[[j]]) <- class(x) } return(df) }