diff --git a/NAMESPACE b/NAMESPACE index 7472f90..dd683a3 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -18,7 +18,6 @@ importFrom(plyr,alply) importFrom(plyr,amv_dimnames) importFrom(plyr,as.quoted) importFrom(plyr,eval.quoted) -importFrom(plyr,id) importFrom(plyr,llply) importFrom(plyr,rbind.fill) importFrom(plyr,split_labels) diff --git a/R/cast.r b/R/cast.r index b77c301..f9603cb 100644 --- a/R/cast.r +++ b/R/cast.r @@ -41,7 +41,7 @@ #' @param value.var name of column which stores values, see #' \code{\link{guess_value}} for default strategies to figure this out. #' @seealso \code{\link{melt}}, \url{http://had.co.nz/reshape/} -#' @importFrom plyr alply amv_dimnames as.quoted eval.quoted id llply rbind.fill split_labels vaggregate +#' @importFrom plyr alply amv_dimnames as.quoted eval.quoted llply rbind.fill split_labels vaggregate #' @import stringr #' @examples #' #Air quality example @@ -92,6 +92,66 @@ #' @name cast NULL +# replacement for plyr::id; it doesn't give the same result, but +# order(id(...)) matches order(plyr::id(...)) +# interaction(drop=drop) doesn't quite work because plyr::id() retains the +# actual level numbers as if drop=FALSE, whereas interaction compacts them +# into 1:attr(, "n"), the unique number of observed levels. +id <- function(x, drop) { + if (length(x) == 0L) return(structure(integer(), n = 0L)) + + # Calculate number of levels for each vector + # For vectors that already had an explicit "n" attribute (e.g. from prior id() calls) + # we must use those directly instead of recomputing from observed discrete values. + ns <- vapply(seq_along(x), function(i) { + v <- x[[i]] + explicit_n <- attr(v, "n") + if (!is.null(explicit_n)) return(as.integer(explicit_n)) + if (is.factor(v)) return(nlevels(v)) + length(unique(v)) + }, integer(1)) + + # Calculate integer values for each vector + vals <- lapply(seq_along(x), function(i) { + v <- x[[i]] + if (!is.null(attr(v, "n"))) return(as.integer(v)) + if (is.factor(v)) return(as.integer(v)) + # match includes NA by default if it's in the table + match(v, sort(unique(v), na.last = TRUE)) + }) + + # We want the LAST variable to vary fastest. + # Weight for x[[i]] is prod(ns[(i+1):n]) + n <- length(x) + weights <- rep(1, n) + if (n > 1) { + for (i in (n - 1):1) { + weights[i] <- weights[i + 1] * ns[i + 1] + } + } + + res <- rep(1, length(vals[[1]])) + for (i in seq_along(vals)) { + res <- res + (vals[[i]] - 1) * weights[i] + } + + if (drop) { + u <- unique(res) + # Sorts the unique generated IDs to be consistent. + su <- sort(u, na.last = TRUE) + res <- match(res, su) + # Ensure n is integer + n_attr <- as.integer(length(u)) + } else { + # Ensure n is integer + n_attr <- as.integer(prod(ns)) + } + + res <- as.integer(res) + attr(res, "n") <- n_attr + res +} + cast <- function(data, formula, fun.aggregate = NULL, ..., subset = NULL, fill = NULL, drop = TRUE, value.var = guess_value(data), value_var) { if (!missing(value_var)) { diff --git a/tests/testthat/test-cast.r b/tests/testthat/test-cast.r index 0aae1d7..4b85801 100644 --- a/tests/testthat/test-cast.r +++ b/tests/testthat/test-cast.r @@ -211,3 +211,53 @@ test_that("useful error message if value.var doesn't exist", { expect_error(dcast(airquality, month ~ day, value.var = "test"), "value.var (test) not found in input", fixed = TRUE) }) + +test_that("id handles recursive ID calls correctly", { + # This mimics how cast() handles multiple variables on one dimension + x1 <- c("a", "a", "b") + x2 <- c("1", "2", "1") + + id1 <- id(list(x1, x2), drop = FALSE) + expect_equal(attr(id1, "n"), 4L) + expect_equal(as.integer(id1), c(1L, 2L, 3L)) + + # Recursive call + id2 <- id(list(id1), drop = FALSE) + expect_equal(attr(id2, "n"), 4L) + expect_equal(as.integer(id2), c(1L, 2L, 3L)) +}) + +test_that("id handles NAs consistently with drop", { + x <- c(1, 2, NA) + + # drop = TRUE + id_t <- id(list(x), drop = TRUE) + expect_equal(attr(id_t, "n"), 3L) + expect_equal(as.integer(id_t), c(1L, 2L, 3L)) + + # drop = FALSE + id_f <- id(list(x), drop = FALSE) + expect_equal(attr(id_f, "n"), 3L) + expect_equal(as.integer(id_f), c(1L, 2L, 3L)) +}) + +test_that("id handles factor levels with drop=FALSE", { + f <- factor(c("a", "a"), levels = c("a", "b")) + id_f <- id(list(f), drop = FALSE) + expect_equal(attr(id_f, "n"), 2L) + expect_equal(as.integer(id_f), c(1L, 1L)) + + id_t <- id(list(f), drop = TRUE) + expect_equal(attr(id_t, "n"), 1L) + expect_equal(as.integer(id_t), c(1L, 1L)) +}) + +test_that("ordering is correct for multi-variable id", { + # Last variable varies fastest + x1 <- c(1, 1, 2, 2) + x2 <- c(1, 2, 1, 2) + res <- id(list(x1, x2), drop = FALSE) + # (1,1)->1, (1,2)->2, (2,1)->3, (2,2)->4 + expect_equal(as.integer(res), 1:4) + expect_equal(attr(res, "n"), 4L) +})