diff --git a/NEWS.md b/NEWS.md index 70fe8df173..dfc443861a 100644 --- a/NEWS.md +++ b/NEWS.md @@ -82,6 +82,8 @@ 10. It is now possible to join two tables on their common columns, so called _natural join_, [#629](https://github.com/Rdatatable/data.table/issues/629). Use `on=.NATURAL` or `options("datatable.naturaljoin"=TRUE)`. Latter one works only when `x` has no key, if key is present then key columns are being used to join as before. Thanks to David Kulp for request. +11. `as.data.table` gains `key` argument mirroring its use in `setDT` and `data.table`, [#890](https://github.com/Rdatatable/data.table/issues/890). As a byproduct, the arguments of `as.data.table.array` have changed order, which could affect code relying on positional arguments to this method. Thanks @cooldome for the suggestion and @MichaelChirico for implementation. + #### BUG FIXES 1. `first`, `last`, `head` and `tail` by group no longer error in some cases, [#2030](https://github.com/Rdatatable/data.table/issues/2030) [#3462](https://github.com/Rdatatable/data.table/issues/3462). Thanks to @franknarf1 for reporting. diff --git a/R/as.data.table.R b/R/as.data.table.R index a03ba35bb5..783b56c888 100644 --- a/R/as.data.table.R +++ b/R/as.data.table.R @@ -1,4 +1,4 @@ -as.data.table <-function(x, keep.rownames=FALSE, ...) +as.data.table <-function(x, keep.rownames=FALSE, key=NULL, ...) { if (is.null(x)) return(null.data.table()) @@ -6,13 +6,13 @@ as.data.table <-function(x, keep.rownames=FALSE, ...) } as.data.table.default <- function(x, ...){ - as.data.table(as.data.frame(x, ...)) # we cannot assume as.data.frame will do copy, thus setDT changed to as.data.table #3230 + as.data.table(as.data.frame(x, ...), ...) # we cannot assume as.data.frame will do copy, thus setDT changed to as.data.table #3230 } as.data.table.factor <- as.data.table.ordered <- as.data.table.integer <- as.data.table.numeric <- as.data.table.logical <- as.data.table.character <- -as.data.table.Date <- as.data.table.ITime <- function(x, keep.rownames=FALSE, ...) { +as.data.table.Date <- as.data.table.ITime <- function(x, keep.rownames=FALSE, key=NULL, ...) { if (is.matrix(x)) { return(as.data.table.matrix(x, ...)) } @@ -27,21 +27,21 @@ as.data.table.Date <- as.data.table.ITime <- function(x, keep.rownames=FALSE, .. nm = if (length(x) == 2L) if (is.character(keep.rownames)) keep.rownames[1L] else "rn" setattr(x, 'names', c(nm, tt)) } - as.data.table.list(x, FALSE) + as.data.table.list(x, FALSE, key) } # as.data.table.table - FR #4848 -as.data.table.table <- function(x, keep.rownames=FALSE, ...) { +as.data.table.table <- function(x, keep.rownames=FALSE, key=NULL, ...) { # Fix for bug #5408 - order of columns are different when doing as.data.table(with(DT, table(x, y))) val = rev(dimnames(provideDimnames(x))) if (is.null(names(val)) || !any(nzchar(names(val)))) setattr(val, 'names', paste0("V", rev(seq_along(val)))) - ans <- data.table(do.call(CJ, c(val, sorted=FALSE)), N = as.vector(x)) + ans <- data.table(do.call(CJ, c(val, sorted=FALSE)), N = as.vector(x), key=key) setcolorder(ans, c(rev(head(names(ans), -1L)), "N")) ans } -as.data.table.matrix <- function(x, keep.rownames=FALSE, ...) { +as.data.table.matrix <- function(x, keep.rownames=FALSE, key=NULL, ...) { if (!identical(keep.rownames, FALSE)) { # can specify col name to keep.rownames, #575 ans = data.table(rn=rownames(x), x, keep.rownames=FALSE) @@ -50,7 +50,6 @@ as.data.table.matrix <- function(x, keep.rownames=FALSE, ...) { return(ans) } d <- dim(x) - nrows <- d[1L] ncols <- d[2L] ic <- seq_len(ncols) if (!ncols) return(null.data.table()) @@ -64,33 +63,37 @@ as.data.table.matrix <- function(x, keep.rownames=FALSE, ...) { for (i in ic) value[[i]] <- as.vector(x[, i]) # to drop any row.names that would otherwise be retained inside every column of the data.table } col_labels <- dimnames(x)[[2L]] + setDT(value) if (length(col_labels) == ncols) { if (any(empty <- !nzchar(col_labels))) col_labels[empty] <- paste0("V", ic[empty]) - setattr(value, "names", col_labels) + setnames(value, col_labels) } else { - setattr(value, "names", paste0("V", ic)) + setnames(value, paste0("V", ic)) } - setattr(value,"row.names",.set_row_names(nrows)) - setattr(value,"class",c("data.table","data.frame")) - alloc.col(value) + # setkey now to allow matrix column names as key + setkeyv(value, key) + value } # as.data.table.array - #1418 -as.data.table.array <- function(x, keep.rownames=FALSE, sorted=TRUE, value.name="value", na.rm=TRUE, ...) { +as.data.table.array <- function(x, keep.rownames=FALSE, key=NULL, sorted=TRUE, value.name="value", na.rm=TRUE, ...) { dx = dim(x) if (length(dx) <= 2L) - stop("as.data.table.array method should be only called for arrays with 3+ dimensions, for 2 dimensions matrix method should be used") + stop("as.data.table.array method should only be called for arrays with 3+ dimensions; use the matrix method for 2-dimensional arrays") if (!is.character(value.name) || length(value.name)!=1L || is.na(value.name) || !nzchar(value.name)) - stop("Argument 'value.name' must be scalar character, non-NA and non zero char") + stop("Argument 'value.name' must be scalar character, non-NA and at least one character") if (!is.logical(sorted) || length(sorted)!=1L || is.na(sorted)) stop("Argument 'sorted' must be scalar logical and non-NA") if (!is.logical(na.rm) || length(na.rm)!=1L || is.na(na.rm)) stop("Argument 'na.rm' must be scalar logical and non-NA") + if (!missing(sorted) && !is.null(key)) + stop("Please provide either 'key' or 'sorted', but not both.") + dnx = dimnames(x) # NULL dimnames will create integer keys, not character as in table method - val = rev(if (is.null(dnx)) lapply(dim(x), seq.int) else dnx) + val = rev(if (is.null(dnx)) lapply(dx, seq.int) else dnx) if (is.null(names(val)) || all(!nzchar(names(val)))) setattr(val, 'names', paste0("V", rev(seq_along(val)))) if (value.name %chin% names(val)) @@ -102,12 +105,12 @@ as.data.table.array <- function(x, keep.rownames=FALSE, sorted=TRUE, value.name= setnames(ans, "N", value.name) dims = rev(head(names(ans), -1L)) setcolorder(ans, c(dims, value.name)) - if (isTRUE(sorted)) - setkeyv(ans, dims) + if (isTRUE(sorted) && is.null(key)) key = dims + setkeyv(ans, key) ans[] } -as.data.table.list <- function(x, keep.rownames=FALSE, ...) { +as.data.table.list <- function(x, keep.rownames=FALSE, key=NULL, ...) { wn = sapply(x,is.null) if (any(wn)) x = x[!wn] if (!length(x)) return( null.data.table() ) @@ -148,10 +151,8 @@ as.data.table.list <- function(x, keep.rownames=FALSE, ...) { setattr(xx, 'names', names(x)[nz]) x = xx } - if (is.null(names(x))) setattr(x,"names",paste0("V",seq_len(length(x)))) - setattr(x,"row.names",.set_row_names(max(n))) - setattr(x,"class",c("data.table","data.frame")) - alloc.col(x) + setDT(x, key=key) # copy ensured above; also, setDT handles naming + x } # don't retain classes before "data.frame" while converting @@ -166,16 +167,16 @@ as.data.table.list <- function(x, keep.rownames=FALSE, ...) { unique( c("data.table", "data.frame", tail(cx, length(cx)-n)) ) } -as.data.table.data.frame <- function(x, keep.rownames=FALSE, ...) { +as.data.table.data.frame <- function(x, keep.rownames=FALSE, key=NULL, ...) { if (!identical(keep.rownames, FALSE)) { # can specify col name to keep.rownames, #575 - ans = data.table(rn=rownames(x), x, keep.rownames=FALSE) + ans = data.table(rn=rownames(x), x, keep.rownames=FALSE, key=key) if (is.character(keep.rownames)) setnames(ans, 'rn', keep.rownames[1L]) return(ans) } ans = copy(x) # TO DO: change this deep copy to be shallow. - setattr(ans,"row.names",.set_row_names(nrow(x))) + setattr(ans, "row.names", .set_row_names(nrow(x))) ## NOTE: This test (#527) is no longer in effect ## # for nlme::groupedData which has class c("nfnGroupedData","nfGroupedData","groupedData","data.frame") @@ -185,6 +186,8 @@ as.data.table.data.frame <- function(x, keep.rownames=FALSE, ...) { # fix for #1078 and #1128, see .resetclass() for explanation. setattr(ans, "class", .resetclass(x, "data.frame")) alloc.col(ans) + setkeyv(ans, key) + ans } as.data.table.data.table <- function(x, ...) { diff --git a/R/xts.R b/R/xts.R index df296d76c9..34325e26a6 100644 --- a/R/xts.R +++ b/R/xts.R @@ -1,10 +1,15 @@ -as.data.table.xts <- function(x, keep.rownames = TRUE, ...) { +as.data.table.xts <- function(x, keep.rownames = TRUE, key=NULL, ...) { stopifnot(requireNamespace("xts"), !missing(x), xts::is.xts(x)) + # as.data.frame.xts will handle copying, and + # the error check above ensures as.data.frame.xts is applied r = setDT(as.data.frame(x, row.names=NULL)) if (!keep.rownames) return(r[]) if ("index" %chin% names(x)) stop("Input xts object should not have 'index' column because it would result in duplicate column names. Rename 'index' column in xts or use `keep.rownames=FALSE` and add index manually as another column.") r[, "index" := zoo::index(x)] - setcolorder(r, c("index", setdiff(names(r), "index")))[] + setcolorder(r, c("index", setdiff(names(r), "index"))) + # save to end to allow for key='index' + setkeyv(r, key) + r[] } as.xts.data.table <- function(x, ...) { diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 812a970c87..4d93c8173e 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -18,6 +18,7 @@ if (exists("test.data.table", .GlobalEnv, inherits=FALSE)) { all.equal.data.table = data.table:::all.equal.data.table any_na = data.table:::any_na + as.data.table.array = data.table:::as.data.table.array as.IDate.default = data.table:::as.IDate.default as.ITime.default = data.table:::as.ITime.default binary = data.table:::binary @@ -6516,6 +6517,14 @@ if (test_xts) { setcolorder(dt, c(2, 3, 1)) dt[ , char_col := 'a'] test(1465.17, as.xts(dt), xt, warning = 'columns are not numeric') + + # 890 -- key argument for as.data.table.xts + x = xts(1:10, as.Date(1:10, origin = "1970-01-01")) + test(1465.18, capture.output(as.data.table(x, key="index")), + c(" index V1", " 1: 1970-01-02 1", " 2: 1970-01-03 2", + " 3: 1970-01-04 3", " 4: 1970-01-05 4", " 5: 1970-01-06 5", + " 6: 1970-01-07 6", " 7: 1970-01-08 7", " 8: 1970-01-09 8", + " 9: 1970-01-10 9", "10: 1970-01-11 10")) Sys.setenv("_R_CHECK_LENGTH_1_LOGIC2_" = TRUE) } @@ -11064,7 +11073,7 @@ test(1774.02, TRUE, all( # 4D unnamed x = array(1:81, dim=rep(3L,4)) dt = as.data.table(x, na.rm=FALSE) -test(1774.03, TRUE, all( +test(1774.03, all( identical(dim(dt), c(81L,5L)), identical(names(dt), c(paste0("V",1:4),"value")), all(dt[J(1L)][1L, value] == 1L, dt[J(2L)][1L, value] == 2L, dt[J(3L)][.N, value] == 81L) # this also tests if dt is keyed @@ -11072,7 +11081,7 @@ test(1774.03, TRUE, all( # 4D named dim values but not dims x = array(1:81, dim=rep(3L, 4L), dimnames=rep(list(letters[1:3]), 4L)) dt = as.data.table(x, na.rm=FALSE) -test(1774.04, TRUE, all( +test(1774.04, all( identical(dim(dt), c(81L,5L)), identical(names(dt), c(paste0("V",1:4),"value")), all(dt[J("a")][1L, value] == 1L, dt[J("b")][1L, value] == 2L, dt[J("c")][.N, value] == 81L) @@ -11080,7 +11089,7 @@ test(1774.04, TRUE, all( # 4D named dim values and dims x = array(1:81, dim=rep(3L, 4L), dimnames=setNames(rep(list(letters[1:3]), 4L), letters[1:4])) dt = as.data.table(x, na.rm=FALSE) -test(1774.05, TRUE, all( +test(1774.05, all( identical(dim(dt), c(81L,5L)), identical(names(dt), c(letters[1:4],"value")), all(dt[J("a")][1L, value] == 1L, dt[J("b")][1L, value] == 2L, dt[J("c")][.N, value] == 81L) @@ -11088,7 +11097,7 @@ test(1774.05, TRUE, all( # third dim of length 1L so really 2D x = array(1:4, dim=c(2L,2L,1L), dimnames=list(a=letters[1:2], b=letters[1:2], c="a")) dt = as.data.table(x, na.rm=FALSE) -test(1774.06, TRUE, all( +test(1774.06, all( identical(dim(dt), c(4L,4L)), identical(names(dt), c("a","b","c","value")), all(dt[J("a")][, value] == c(1L,3L), dt[J("b")][, value] == c(2L,4L)) @@ -11096,7 +11105,7 @@ test(1774.06, TRUE, all( # second and third dim of length 1L so really 1D x = array(1:2, dim=c(2L,1L,1L), dimnames=list(a=letters[1:2], b="a", c="a")) dt = as.data.table(x, na.rm=FALSE) -test(1774.07, TRUE, all( +test(1774.07, all( identical(dim(dt), c(2L,4L)), identical(names(dt), c("a","b","c","value")), all(dt[J("a")][, value] == 1L, dt[J("b")][, value] == 2L) @@ -11107,7 +11116,7 @@ x = rnorm(27) x[sample(length(x), length(x)/2)] = NA dim(x) = c(3L,3L,3L) dt = as.data.table(x, na.rm=FALSE) -test(1774.08, TRUE, all( +test(1774.08, all( identical(dim(dt), c(27L,4L)), identical(names(dt), c(paste0("V",1:3),"value")), dt[is.na(value), .N] > 0L, @@ -11115,7 +11124,7 @@ test(1774.08, TRUE, all( )) # na.rm=TRUE / sorted=TRUE dt = as.data.table(x) -test(1774.09, TRUE, all( +test(1774.09, all( identical(dim(dt), c(14L,4L)), identical(names(dt), c(paste0("V",1:3),"value")), dt[is.na(value), .N] == 0L, @@ -11123,7 +11132,7 @@ test(1774.09, TRUE, all( )) # na.rm=TRUE / sorted=FALSE dt = as.data.table(x, sorted=FALSE) -test(1774.10, TRUE, all( +test(1774.10, all( identical(dim(dt), c(14L,4L)), identical(names(dt), c(paste0("V",1:3),"value")), dt[is.na(value), .N] == 0L, @@ -11132,7 +11141,7 @@ test(1774.10, TRUE, all( )) # na.rm=FALSE / sorted=FALSE dt = as.data.table(x, na.rm=FALSE, sorted=FALSE) -test(1774.11, TRUE, all( +test(1774.11, all( identical(dim(dt), c(27L,4L)), identical(names(dt), c(paste0("V",1:3),"value")), is.unsorted(dt[[1]]), @@ -11144,6 +11153,12 @@ test(1774.12, as.data.table(x, value.name="a"), error = "Argument 'value.name' s x = array(1:2, dim=c(2L,1L,1L), dimnames=list(a=letters[1:2], b="a", value="a")) test(1774.13, as.data.table(x), error = "Argument 'value.name' should not overlap with column names in result") +## unsupported usage of as.data.table.array +test(1774.14, as.data.table.array(as.matrix(x)), error="method should only be called for arrays with 3+") +test(1774.15, as.data.table(x, value.name=NA), error="'value.name' must be scalar") +test(1774.16, as.data.table(x, sorted='a'), error="'sorted' must be scalar") +test(1774.17, as.data.table(x, na.rm='a'), error="'na.rm' must be scalar") + # verify print.keys works DT1 <- data.table(a = 1:3, key = "a") test(1775.1, capture.output(print(DT1, print.keys = TRUE)), @@ -14769,6 +14784,54 @@ setnames(d2, 1L, 'X1') test(2045.15, d1[d2, verbose = TRUE], cbind(d1, X1 = d2$X1), output="natural join using: \\[.*[.]{3}\\]") options(datatable.naturaljoin=FALSE) +#tests for adding key to as.data.table, #890 +## as.data.table.numeric (should cover as.data.table.factor, +## *.ordered, *.integer, *.logical, *.character, and *.Date since +## all are the same function in as.data.table.R) +nn = c(a=0.1, c=0.2, b=0.3, d=0.4) +ans = data.table(nn, key='nn') +ans_rn = data.table(rn = names(nn), nn, key='rn') +test(2046.01, as.data.table(nn, key="nn"), ans) +test(2046.02, as.data.table(nn, keep.rownames=TRUE, key="rn"), ans_rn) + +## as.data.table.data.frame +DF = as.data.frame(ans) +test(2046.03, as.data.table(DF, key="nn"), ans) + +## as.data.table.data.table +DT = copy(ans) +test(2046.04, as.data.table(DT, key="nn"), ans) + +## as.data.table.default +rr <- as.raw(3:1) +test(2046.05, as.data.table(rr, keep.rownames=TRUE, key="rn"), + data.table(rn = paste0(1:3), x = rr, key='rn')) + +## as.data.table.list +l = as.list(ans) +test(2046.06, as.data.table(l, key='nn'), ans) + +## as.data.table.matrix +mm <- as.matrix(ans) +test(2046.07, as.data.table(mm, key='nn'), ans) + +## as.data.table.array +aa = array(nn, c(1L, 2L, 2L)) +test(2046.08, as.data.table(aa, key='V3'), + data.table(V1 = 1L, V2 = rep(1:2, 2L), V3 = rep(1:2, each = 2L), value = as.vector(nn), key='V3')) +### conflict between sorted&key arguments +test(2046.09, as.data.table(aa, key='V3', sorted = TRUE), error="Please provide either 'key' or 'sorted'") + +## as.data.table.table +tt <- as.table(nn) +test(2046.10, as.data.table(tt, key="N"), + data.table(V1 = names(tt), N = as.vector(tt), key='N')) + +# some coverage tests uncovered by #890 +test(2047.1, as.data.table(list(character(0L))), data.table(V1 = character(0L))) +test(2047.2, as.data.table(list()), data.table(NULL)) +test(2047.3, as.data.table(rbind(1L)), data.table(V1 = 1L)) + ################################### # Add new tests above this line # diff --git a/man/as.data.table.Rd b/man/as.data.table.Rd index cecd4228cd..d5342ff040 100644 --- a/man/as.data.table.Rd +++ b/man/as.data.table.Rd @@ -19,11 +19,12 @@ Functions to check if an object is \code{data.table}, or coerce it if possible. } \usage{ -as.data.table(x, keep.rownames=FALSE, \dots) +as.data.table(x, keep.rownames=FALSE, key=NULL, \dots) \method{as.data.table}{data.table}(x, \dots) -\method{as.data.table}{array}(x, keep.rownames=FALSE, sorted=TRUE, value.name="value", na.rm=TRUE, \dots) +\method{as.data.table}{array}(x, keep.rownames=FALSE, key=NULL, sorted=TRUE, + value.name="value", na.rm=TRUE, \dots) is.data.table(x) @@ -31,7 +32,8 @@ is.data.table(x) \arguments{ \item{x}{An R object.} \item{keep.rownames}{Default is \code{FALSE}. If \code{TRUE}, adds the input object's names as a separate column named \code{"rn"}. \code{keep.rownames = "id"} names the column \code{"id"} instead.} - \item{sorted}{logical used in \emph{array} method, default \code{TRUE}.} + \item{key}{ Character vector of one or more column names which is passed to \code{\link{setkeyv}}. } + \item{sorted}{logical used in \emph{array} method, default \code{TRUE} is overridden when \code{key} is provided. } \item{value.name}{character scalar used in \emph{array} method, default \code{"value"}.} \item{na.rm}{logical used in \emph{array} method, default \code{TRUE} will remove rows with \code{NA} values.} \item{\dots}{Additional arguments to be passed to or from other methods.} @@ -67,6 +69,7 @@ mm = matrix(1:4, ncol=2, dimnames=list(c("r1", "r2"), c("c1", "c2"))) as.data.table(mm) as.data.table(mm, keep.rownames=TRUE) as.data.table(mm, keep.rownames="rownames") +as.data.table(mm, key="c1") ll = list(a=1:2, b=3:4) as.data.table(ll) @@ -80,6 +83,7 @@ as.data.table(DF, keep.rownames="rownames") DT = data.table(x=rep(c("x","y","z"),each=2), y=c(1:6)) as.data.table(DT) +as.data.table(DT, key='x') ar = rnorm(27) ar[sample(27, 15)] = NA diff --git a/man/as.data.table.xts.Rd b/man/as.data.table.xts.Rd index 6d4eb7bdde..bd879307f8 100644 --- a/man/as.data.table.xts.Rd +++ b/man/as.data.table.xts.Rd @@ -5,13 +5,12 @@ Efficient conversion xts to data.table. } \usage{ -\method{as.data.table}{xts}(x, keep.rownames = TRUE, \dots) +\method{as.data.table}{xts}(x, keep.rownames = TRUE, key=NULL, \dots) } \arguments{ \item{x}{xts to convert to data.table} - \item{keep.rownames}{keep xts index as \emph{index} column in result data.table} - +\item{key}{ Character vector of one or more column names which is passed to \code{\link{setkeyv}}. } \item{\dots}{ignored, just for consistency with \code{as.data.table}} } \seealso{ \code{\link{as.xts.data.table}} }