From dc7213f8622fa2ee343054ad2a715d147d1760c7 Mon Sep 17 00:00:00 2001 From: mattdowle Date: Tue, 26 Mar 2019 12:19:34 -0700 Subject: [PATCH 01/15] centralized data.table() logic in very similar as.data.table.list() --- R/as.data.table.R | 102 ++++++++++++++++++------------ R/data.table.R | 142 +++++++++--------------------------------- inst/tests/tests.Rraw | 33 ++++++---- 3 files changed, 115 insertions(+), 162 deletions(-) diff --git a/R/as.data.table.R b/R/as.data.table.R index a03ba35bb5..f49cfdefd0 100644 --- a/R/as.data.table.R +++ b/R/as.data.table.R @@ -108,50 +108,70 @@ as.data.table.array <- function(x, keep.rownames=FALSE, sorted=TRUE, value.name= } as.data.table.list <- function(x, keep.rownames=FALSE, ...) { - wn = sapply(x,is.null) - if (any(wn)) x = x[!wn] - if (!length(x)) return( null.data.table() ) - # fix for #833, as.data.table.list with matrix/data.frame/data.table as a list element.. - # TODO: move this entire logic (along with data.table() to C - for (i in seq_along(x)) { - dims = dim(x[[i]]) - if (!is.null(dims)) { - ans = do.call("data.table", x) - setnames(ans, make.unique(names(ans))) - return(ans) + n = length(x) + eachnrow = integer(n) # vector of lengths of each column. may not be equal if silent repetition is required. + eachncol = integer(n) + for (i in seq_len(n)) { + xi = x[[i]] + if (is.null(xi)) next # stop("column or argument ",i," is NULL") + if ("POSIXlt" %chin% class(xi)) { + warning("POSIXlt column type detected and converted to POSIXct. We do not recommend use of POSIXlt at all because it uses 40 bytes to store one date.") + xi = x[[i]] = as.POSIXct(xi) + } else if (is.matrix(xi) || is.data.frame(xi)) { # including data.table (a data.frame, too) + xi = x[[i]] = as.data.table(xi, keep.rownames=keep.rownames) # we will never allow a matrix to be a column; always unpack the columns + } else if (is.table(xi)) { + xi = x[[i]] = as.data.table.table(xi, keep.rownames=keep.rownames) + } else if (is.function(xi)) { + xi = x[[i]] = list(xi) } + eachnrow[i] = NROW(xi) # for a vector (including list() columns) returns the length + eachncol[i] = NCOL(xi) # for a vector returns 1 } - n = vapply(x, length, 0L) - mn = max(n) - x = copy(x) - idx = which(n < mn) - if (length(idx)) { - for (i in idx) { - # any is.null(x[[i]]) were removed above, otherwise warning when a list element is NULL - if (inherits(x[[i]], "POSIXlt")) { - warning("POSIXlt column type detected and converted to POSIXct. We do not recommend use of POSIXlt at all because it uses 40 bytes to store one date.") - x[[i]] = as.POSIXct(x[[i]]) + ncol = sum(eachncol) + if (ncol==0L) return(null.data.table()) + nrow = max(eachnrow) + ans = vector("list",ncol) # always return a new VECSXP + recycle = function(x, nrow) { + if (length(x)==nrow) return(x) + if (identical(x,list())) vector("list", nrow) else rep(x, length.out=nrow) + } + vnames = vector("list", n) + k = 1L + for(i in seq_len(n)) { + xi = x[[i]] + if (is.null(xi)) next + nm = names(x)[i] + if (!length(nm) || is.na(nm)) nm = "" + if (eachnrow[i]>1L && nrow%%eachnrow[i]!=0L) # in future: eachnrow[i]!=nrow + warning("Item ", i, " has ", eachnrow[i], " rows but longest item has ", nrow, "; recycled with remainder.") + if (is.data.table(xi)) { # matrix and data.frame were coerced to data.table above + vnames[[i]] = names(xi) #if (nm!="" && n>1L) paste(nm, names(xi), sep=".") else names(xi) + for (j in seq_along(xi)) { + ans[[k]] = recycle(xi[[j]], nrow) + k=k+1L } - # Implementing FR #4813 - recycle with warning when nr %% nrows[i] != 0L - if (!n[i] && mn) - warning("Item ", i, " is of size 0 but maximum size is ", mn, ", therefore recycled with 'NA'") - else if (n[i] && mn %% n[i] != 0L) - warning("Item ", i, " is of size ", n[i], " but maximum size is ", mn, " (recycled leaving a remainder of ", mn%%n[i], " items)") - x[[i]] = rep(x[[i]], length.out=mn) + } else { + vnames[[i]] = nm + ans[[k]] = recycle(xi, nrow) + k=k+1L } } - # fix for #842 - if (mn > 0L) { - nz = which(n > 0L) - xx = point(vector("list", length(nz)), seq_along(nz), x, nz) - if (!is.null(names(x))) - setattr(xx, 'names', names(x)[nz]) - x = xx - } - if (is.null(names(x))) setattr(x,"names",paste0("V",seq_len(length(x)))) - setattr(x,"row.names",.set_row_names(max(n))) - setattr(x,"class",c("data.table","data.frame")) - alloc.col(x) + vnames = unlist(vnames) + w = which(vnames=="") + if (length(w)) vnames[w] = paste0("V",w) + setattr(ans,"names",vnames) + setattr(ans,"row.names",.set_row_names(nrow)) + setattr(ans,"class",c("data.table","data.frame")) + alloc.col(ans) + + # fix for #842. Not sure what this is for. Revisit in PR in 1.12.4 if needed, else remove. + #if (mn > 0L) { + # nz = which(n > 0L) + # xx = point(vector("list", length(nz)), seq_along(nz), x, nz) + # if (!is.null(names(x))) + # setattr(xx, 'names', names(x)[nz]) + # x = xx + #} } # don't retain classes before "data.frame" while converting @@ -174,6 +194,10 @@ as.data.table.data.frame <- function(x, keep.rownames=FALSE, ...) { setnames(ans, 'rn', keep.rownames[1L]) return(ans) } + if (any(!sapply(x,is.atomic))) { + # a data.frame with a column that is data.frame needs to be expanded; test 2013.4 + return(as.data.table.list(x, keep.rownames=keep.rownames, ...)) + } ans = copy(x) # TO DO: change this deep copy to be shallow. setattr(ans,"row.names",.set_row_names(nrow(x))) diff --git a/R/data.table.R b/R/data.table.R index a73c085201..b0a7518f60 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -47,134 +47,54 @@ null.data.table <-function() { data.table <-function(..., keep.rownames=FALSE, check.names=FALSE, key=NULL, stringsAsFactors=FALSE) { - # NOTE: It may be faster in some circumstances to create a data.table by creating a list l first, and then setattr(l,"class",c("data.table","data.frame")) at the expense of checking. + # NOTE: It may be faster in some circumstances for users to create a data.table by creating a list l + # first, and then setattr(l,"class",c("data.table","data.frame")) and forgo checking. # TO DO: rewrite data.table(), one of the oldest functions here. Many people use data.table() to convert data.frame rather than # as.data.table which is faster; speed could be better. Revisit how many copies are taken in for example data.table(DT1,DT2) which # cbind directs to. And the nested loops for recycling lend themselves to being C level. x <- list(...) # doesn't copy named inputs as from R >= 3.1.0 (a very welcome change) + if (length(x)==0L) return( null.data.table() ) + if (length(x)==1L && (is.null(x[[1L]]) || (is.list(x[[1L]]) && length(x[[1L]])==0L))) return( null.data.table() ) #5377 .Call(CcopyNamedInList,x) # to maintain pre-Rv3.1.0 behaviour, for now. See test 548.2. TODO: revist # TODO Something strange with NAMED on components of `...` to investigate. Or, just port data.table() to C. - if (length(x) < 1L) - return( null.data.table() ) - # fix for #5377 - data.table(null list, data.frame and data.table) should return null data.table. Simple fix: check all scenarios here at the top. - if (identical(x, list(NULL)) || identical(x, list(list())) || - identical(x, list(data.frame(NULL))) || identical(x, list(data.table(NULL)))) return( null.data.table() ) - nd = name_dots(...) - if (any(nocols<-sapply(x, NCOL)==0L)) { tt=!nocols; x=x[tt]; nd=lapply(nd,'[',tt); } # data.table(data.table(), data.table(a=integer())), #3445 - vnames = nd$vnames - novname = nd$novname # novname used later to know which were explicitly supplied in the call - n <- length(x) - if (length(vnames) != n) stop("logical error in vnames") # nocov - # cast to a list to facilitate naming of columns with dimension -- - # unlist() at the end automatically handles the need to "push" names - # to accommodate the "new" columns - vnames <- as.list.default(vnames) - nrows = integer(n) # vector of lengths of each column. may not be equal if silent repetition is required. - numcols = integer(n) # the ncols of each of the inputs (e.g. if inputs contain matrix or data.table) - for (i in seq_len(n)) { - xi = x[[i]] - if (is.null(xi)) stop("column or argument ",i," is NULL") - if ("POSIXlt" %chin% class(xi)) { - warning("POSIXlt column type detected and converted to POSIXct. We do not recommend use of POSIXlt at all because it uses 40 bytes to store one date.") - x[[i]] = as.POSIXct(xi) - } else if (is.matrix(xi) || is.data.frame(xi)) { # including data.table (a data.frame, too) - xi = as.data.table(xi, keep.rownames=keep.rownames) # TO DO: allow a matrix to be a column of a data.table. This could allow a key'd lookup to a matrix, not just by a single rowname vector, but by a combination of several columns. A matrix column could be stored either by row or by column contiguous in memory. - x[[i]] = xi - numcols[i] = length(xi) - } else if (is.table(xi)) { - x[[i]] = xi = as.data.table.table(xi, keep.rownames=keep.rownames) - numcols[i] = length(xi) - } else if (is.function(xi)) { - x[[i]] = xi = list(xi) - } - nrows[i] <- NROW(xi) # for a vector (including list() columns) returns the length - if (numcols[i]>0L) { - namesi <- names(xi) # works for both data.frame's, matrices and data.tables's - if (length(namesi)==0L) namesi = rep.int("",ncol(xi)) - namesi[is.na(namesi)] = "" - tt = namesi=="" - if (any(tt)) namesi[tt] = paste0("V", which(tt)) - if (novname[i]) vnames[[i]] = namesi - else vnames[[i]] = paste(vnames[[i]], namesi, sep=".") - } - } - nr <- max(nrows) - ckey = NULL - recycledkey = FALSE - for (i in seq_len(n)) { - xi = x[[i]] - if (is.data.table(xi) && haskey(xi)) { - if (nrows[i] 1L) "s", " in the longest column. Or, all columns can be 0 length, for insert()ing rows into.") - # Implementing FR #4813 - recycle with warning when nr %% nrows[i] != 0L - if (nr%%nrows[i] != 0L) warning("Item ", i, " is of size ", nrows[i], " but maximum size is ", nr, " (recycled leaving remainder of ", nr%%nrows[i], " items)") - if (is.data.frame(xi)) { # including data.table - ..i = rep(seq_len(nrow(xi)), length.out = nr) - x[[i]] = xi[..i,,drop=FALSE] - next - } - if (is.atomic(xi) || is.list(xi)) { - # TO DO: surely use set() here, or avoid the coercion - x[[i]] = rep(xi, length.out = nr) - next - } - stop("problem recycling column ",i,", try a simpler type") + vnames = if (is.null(names(x))) character(length(x)) else names(x) + if (length(w<-which(vnames==""))) { + sub = substitute(list(...)) + for (i in w) if (is.name(nm<-sub[[i+1L]])) vnames[i] = as.character(nm) + names(x) <- vnames } - if (any(numcols>0L)) { - value = vector("list",sum(pmax(numcols,1L))) - k = 1L - for(i in seq_len(n)) { - if (is.list(x[[i]]) && !is.ff(x[[i]])) { - for(j in seq_len(length(x[[i]]))) { - value[[k]] = x[[i]][[j]] - k=k+1L - } - } else { - value[[k]] = x[[i]] - k=k+1L - } - } - } else { - value = x - } - vnames <- unlist(vnames) - if (check.names) # default FALSE - vnames <- make.names(vnames, unique = TRUE) - setattr(value,"names",vnames) - setattr(value,"row.names",.set_row_names(nr)) - setattr(value,"class",c("data.table","data.frame")) + + # vnames = getdots() # as.list(substitute(list(...)))[-1L] + # names(x) = vnames # getdots() vnames # name_dots(...)$vnames + + ans = as.data.table.list(x, keep.rownames=keep.rownames) + if (check.names) setnames(ans, make.names(names(ans),unique=TRUE)) if (!is.null(key)) { if (!is.character(key)) stop("key argument of data.table() must be character") if (length(key)==1L) { key = strsplit(key,split=",")[[1L]] # eg key="A,B"; a syntax only useful in key argument to data.table(), really. } - setkeyv(value,key) + setkeyv(ans,key) } else { - # retain key of cbind(DT1, DT2, DT3) where DT2 is keyed but not DT1. cbind calls data.table(). - # If DT inputs with keys have been recycled then can't retain key - if (length(ckey) - && !recycledkey - && !any(duplicated(ckey)) - && all(ckey %chin% names(value)) - && !any(duplicated(names(value)[names(value) %chin% ckey]))) - setattr(value, "sorted", ckey) + # retain key of cbind(DT1, DT2, DT3) where DT2 is keyed but not DT1. cbind calls data.table(). + # If DT inputs with keys have been recycled then can't retain key + ckey = NULL + for (i in seq_along(x)) { + xi = x[[i]] + if (is.data.table(xi) && haskey(xi) && nrow(xi)==nrow(ans)) ckey=c(ckey, key(xi)) + } + if (length(ckey) && + !anyDuplicated(ckey) && + identical(is.na(chmatchdup(c(ckey,ckey), names(ans))), rep(c(FALSE,TRUE),each=length(ckey)))) { + setattr(ans, "sorted", ckey) + } } # FR #643, setfactor is an internal function in fread.R - if (isTRUE(stringsAsFactors)) setfactor(value, which(vapply(value, is.character, TRUE)), FALSE) - alloc.col(value) # returns a NAMED==0 object, unlike data.frame() + if (isTRUE(stringsAsFactors)) setfactor(ans, which(vapply(ans, is.character, TRUE)), verbose=FALSE) + ans } replace_dot_alias <- function(e) { @@ -2196,7 +2116,7 @@ transform.data.table <- function (`_data`, ...) if (any(matched)) { if (isTRUE(attr(`_data`, ".data.table.locked", TRUE))) setattr(`_data`, ".data.table.locked", NULL) # fix for #1641 `_data`[,inx[matched]] <- e[matched] - `_data` <- data.table(`_data`) + `_data` <- as.data.table(`_data`) } if (!all(matched)) { ans <- do.call("data.table", c(list(`_data`), e[!matched])) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 721b38a846..79596313bf 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -379,12 +379,12 @@ test(133, TESTDT[0][J(3)], data.table(a=3L,v=NA_integer_,key="a")) # These need # tests on data table names, make.names is now FALSE by default from v1.8.0 x = 2L; `1x` = 4L dt = data.table(a.1 = 1L, b_1 = 2L, "1b" = 3L, `a 1` = 4L, x, `1x`, 2*x) -test(134, names(dt), c("a.1", "b_1", "1b", "a 1", "x", "V6", "V7")) +test(134, names(dt), c("a.1", "b_1", "1b", "a 1", "x", "1x", "V7")) dt = data.table(a.1 = 1L, b_1 = 2L, "1b" = 3L, `a 1` = 4L, x, `1x`, 2*x, check.names=TRUE) -test(134.5, names(dt), c("a.1", "b_1", "X1b", "a.1.1", "x", "V6", "V7")) +test(134.5, names(dt), c("a.1", "b_1", "X1b", "a.1.1", "x", "X1x", "V7")) dt = data.table(a.1 = 1L, b_1 = 2L, "1b" = 3L, `a 1` = 4L, x, `1x`, 2*x, check.names = FALSE) -test(135, names(dt), c("a.1", "b_1", "1b", "a 1", "x", "V6", "V7")) # the last two terms differ from data.frame() +test(135, names(dt), c("a.1", "b_1", "1b", "a 1", "x", "1x", "V7")) # the last two terms differ from data.frame() test(136, dt[,b_1, by="a.1"], data.table(a.1=1L,"b_1"=2L)) test(137, dt[,`a 1`, by="a.1"], data.table(a.1=1L,"a 1"=4L, check.names=FALSE)) @@ -2632,8 +2632,8 @@ test(927, DT[, if(a==2L) list(42:43,numeric()) else list(42L,3.14), by=a], data. test(928, cbind(data.table(a=1L),b=1:3), data.table(a=1L,b=1:3)) # FR #4813 implementation resulted in changing 929 error to warning # test(929, cbind(data.table(a=1L,b=2:3),c=1:3), error="argument 1 (nrow 2) cannot be recycled without remainder to match longest nrow (3)") -test(929, cbind(data.table(a=1L,b=2:3),c=1:3), data.table(a=1L, b=c(2L,3L,2L), c=1:3), warning="Item 1 is of size 2 but maximum size is 3") -test(930, cbind(data.table(a=1L,b=2:3),c=1:4), data.table(a=1L,b=INT(2,3,2,3),c=1:4)) +test(929, cbind(data.table(a=1L,b=2:3),c=1:3), data.table(a=1L, b=c(2L,3L,2L), c=1:3), warning="Item 1 has 2 rows but longest item has 3; recycled.") +test(930, cbind(data.table(a=1L,b=2:3),c=1:4), data.table(a=1L,b=INT(2,3,2,3),c=1:4)) # TODO: warning in future DT = data.table(x=c(1,1,1,1,2,2,3),y=c(1,1,2,3,1,1,2)) DT[,rep:=1L][c(2,7),rep:=c(2L,3L)] # duplicate row 2 and triple row 7 DT[,num:=1:.N] # to group each row by itself @@ -3710,10 +3710,10 @@ test(1139, capture.output(print(DT)), c(" x y", "1: a ~ b 1", "2: # FR #4813 - provide warnings if there are remainders for both as.data.table.list(.) and data.table(.) X = list(a = 1:2, b = 1:3) -test(1140, as.data.table(X), data.table(a=c(1:2,1L), b=c(1:3)), warning="Item 1 is of size 2 but maximum") -test(1141.1, data.table(a=1:2, b=1:3), data.table(a=c(1L,2L,1L), b=1:3), warning="Item 1 is of size 2 but maximum") -test(1141.2, data.table(a=1:2, data.table(x=1:5, y=6:10)), data.table(a=c(1L,2L,1L,2L,1L), x=1:5, y=6:10), warning="Item 1 is of size 2 but maximum") -test(1141.3, data.table(a=1:5, data.table(x=c(1,2), y=c(3,4))), data.table(a=c(1:5), x=c(1,2,1,2,1), y=c(3,4,3,4,3)), warning="Item 2 is of size 2 but maximum") +test(1140, as.data.table(X), data.table(a=c(1:2,1L), b=c(1:3)), warning="Item 1 has 2 rows but longest item has 3; recycled") +test(1141.1, data.table(a=1:2, b=1:3), data.table(a=c(1L,2L,1L), b=1:3), warning="Item 1 has 2 rows but longest item has 3; recycled") +test(1141.2, data.table(a=1:2, data.table(x=1:5, y=6:10)), data.table(a=c(1L,2L,1L,2L,1L), x=1:5, y=6:10), warning="Item 1 has 2 rows but longest item has 5; recycled") +test(1141.3, data.table(a=1:5, data.table(x=c(1,2), y=c(3,4))), data.table(a=c(1:5), x=c(1,2,1,2,1), y=c(3,4,3,4,3)), warning="Item 2 has 2 rows but longest item has 5; recycled") # Fix for bug #5098 - DT[, foo()] returns function definition. DT <- data.table(a=1:2) @@ -5725,7 +5725,7 @@ test(1380, DT[a==TRUE], DT[3:4]) # Fix #847, as.data.table.list and character(0) issue x <- data.table(a=character(0), b=character(0), c=numeric(0)) setkey(x, a, b) -test(1381, x[J("foo", character(0)), nomatch=0L], x, warning="Item 2 is of size 0 but maximum size is 1,") +test(1381, x[J("foo", character(0)), nomatch=0L], x) # Fix for #813 and #758 DT = data.table(x = 1:2) @@ -13823,7 +13823,7 @@ test(2009.1, DT[a>1], error="Column 1 is NULL; malformed data.table") DT = null.data.table() x = NULL test(2009.2, DT[, .(x)], error="Column 1 of j evaluates to NULL. A NULL column is invalid.") -test(2009.3, data.table(character(0), NULL), error="column or argument 2 is NULL") +test(2009.3, data.table(character(0), NULL), data.table(character(0))) test(2009.4, as.data.table(list(y = character(0), x = NULL)), data.table(y=character())) # use.names=NA warning for out-of-order; https://github.com/Rdatatable/data.table/pull/3455#issuecomment-472744347 @@ -13864,13 +13864,22 @@ test(2012.3, data.table(data.frame(), data.frame(a=integer())), data.table(a=int dt = as.data.table(iris) test(2012.4, cbind(data.table(), dt[0]), dt[0]) -# extra validity checks in subsetDT on data.table; #3369 +# extra validity checks in subsetDT on data.table; aside to fread issue in #3369 DT = structure(list(a=1:3, b=NULL, c=4:6), class=c("data.table","data.frame")) test(2013.1, DT[2], error="Column 2 is NULL; malformed data.table") DT = structure(list(a=1:3, b=data.frame(foo=10:12,bar=13:15), c=4:6), class=c("data.table","data.frame")) test(2013.2, DT[2], error="Column 2 ['b'] is a data.frame or data.table; malformed data.table.") DT = structure(list(a=1:3, b=1:4, c=4:6), class=c("data.table","data.frame")) test(2013.3, DT[2], error="Column 2 ['b'] is length 4 but column 1 is length 3; malformed data.table.") +DF = structure(list(a=1:3, b=data.frame(foo=4:6, bar=7:9)), row.names=1:3, class="data.frame") +DT = as.data.table(DF) +# used to be invalid DT before v1.12.2 fixed https://github.com/Rdatatable/data.table/issues/3369#issuecomment-462662752 +test(2013.4, ncol(DT), 3L) +test(2013.5, DT[2], data.table(a=2L, foo=5L, bar=8L)) +DF = structure(list(a=list(c("a","b"), c("a","b"), c("a","b")), b=data.frame(foo=1:3, bar=4:6)), row.names=1:3, class="data.frame") +DT = as.data.table(DF) +test(2013.6, DT, data.table(a=list(c("a","b"), c("a","b"), c("a","b")), foo=1:3, bar=4:6)) +# ***** then rerun the test.R example in Downloads ... ***** ################################### From 84d2158a0217e2f7c46b473af198590e89b3bf41 Mon Sep 17 00:00:00 2001 From: mattdowle Date: Tue, 26 Mar 2019 12:36:32 -0700 Subject: [PATCH 02/15] tidy --- inst/tests/tests.Rraw | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 79596313bf..8d3a1fcc66 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -13864,22 +13864,23 @@ test(2012.3, data.table(data.frame(), data.frame(a=integer())), data.table(a=int dt = as.data.table(iris) test(2012.4, cbind(data.table(), dt[0]), dt[0]) -# extra validity checks in subsetDT on data.table; aside to fread issue in #3369 +# extra validity checks in subsetDT on data.table; aside in #3369 DT = structure(list(a=1:3, b=NULL, c=4:6), class=c("data.table","data.frame")) test(2013.1, DT[2], error="Column 2 is NULL; malformed data.table") DT = structure(list(a=1:3, b=data.frame(foo=10:12,bar=13:15), c=4:6), class=c("data.table","data.frame")) test(2013.2, DT[2], error="Column 2 ['b'] is a data.frame or data.table; malformed data.table.") DT = structure(list(a=1:3, b=1:4, c=4:6), class=c("data.table","data.frame")) test(2013.3, DT[2], error="Column 2 ['b'] is length 4 but column 1 is length 3; malformed data.table.") + +# unpack columns which are data.frame; aside in #3369 too. DF = structure(list(a=1:3, b=data.frame(foo=4:6, bar=7:9)), row.names=1:3, class="data.frame") DT = as.data.table(DF) -# used to be invalid DT before v1.12.2 fixed https://github.com/Rdatatable/data.table/issues/3369#issuecomment-462662752 -test(2013.4, ncol(DT), 3L) -test(2013.5, DT[2], data.table(a=2L, foo=5L, bar=8L)) +test(2014.1, ncol(DT), 3L) +test(2014.2, DT[2], data.table(a=2L, foo=5L, bar=8L)) DF = structure(list(a=list(c("a","b"), c("a","b"), c("a","b")), b=data.frame(foo=1:3, bar=4:6)), row.names=1:3, class="data.frame") +# A list being first is needed to mimic the jsonlite case. With this passing, test.R works from https://github.com/Rdatatable/data.table/issues/3369#issuecomment-462662752 DT = as.data.table(DF) -test(2013.6, DT, data.table(a=list(c("a","b"), c("a","b"), c("a","b")), foo=1:3, bar=4:6)) -# ***** then rerun the test.R example in Downloads ... ***** +test(2014.3, DT, data.table(a=list(c("a","b"), c("a","b"), c("a","b")), foo=1:3, bar=4:6)) ################################### From 81050bdad7fd76ded7b4b07b99f73010aff4181e Mon Sep 17 00:00:00 2001 From: mattdowle Date: Mon, 3 Jun 2019 11:06:31 -0700 Subject: [PATCH 03/15] interim --- R/as.data.table.R | 23 +++++++++++++---------- R/data.table.R | 19 ++++++++++++------- 2 files changed, 25 insertions(+), 17 deletions(-) diff --git a/R/as.data.table.R b/R/as.data.table.R index 5c15b313e7..910236ffd9 100644 --- a/R/as.data.table.R +++ b/R/as.data.table.R @@ -110,7 +110,7 @@ as.data.table.array = function(x, keep.rownames=FALSE, key=NULL, sorted=TRUE, va ans[] } -as.data.table.list <- function(x, keep.rownames=FALSE, key=NULL, ...) { +as.data.table.list <- function(x, keep.rownames=FALSE, key=NULL, check.names=FALSE, ...) { n = length(x) eachnrow = integer(n) # vector of lengths of each column. may not be equal if silent repetition is required. eachncol = integer(n) @@ -138,27 +138,28 @@ as.data.table.list <- function(x, keep.rownames=FALSE, key=NULL, ...) { if (length(x)==nrow) return(x) if (identical(x,list())) vector("list", nrow) else rep(x, length.out=nrow) } - vnames = vector("list", n) + vnames = character(ncol) k = 1L for(i in seq_len(n)) { xi = x[[i]] if (is.null(xi)) next - nm = names(x)[i] - if (!length(nm) || is.na(nm)) nm = "" if (eachnrow[i]>1L && nrow%%eachnrow[i]!=0L) # in future: eachnrow[i]!=nrow warning("Item ", i, " has ", eachnrow[i], " rows but longest item has ", nrow, "; recycled with remainder.") if (is.data.table(xi)) { # matrix and data.frame were coerced to data.table above - vnames[[i]] = names(xi) #if (nm!="" && n>1L) paste(nm, names(xi), sep=".") else names(xi) + # vnames[[i]] = names(xi) #if (nm!="" && n>1L) paste(nm, names(xi), sep=".") else names(xi) for (j in seq_along(xi)) { ans[[k]] = recycle(xi[[j]], nrow) - k=k+1L + vnames[k] = names(xi)[j] + k = k+1L } } else { - vnames[[i]] = nm + nm = names(x)[i] + vnames[k] = if (length(nm) && !is.na(nm) && nm!="") nm else paste0("V",i) ans[[k]] = recycle(xi, nrow) - k=k+1L + k = k+1L } } + # fix for #842. Not sure what this is for. Revisit in PR in 1.12.4 if needed, else remove. #if (mn > 0L) { # nz = which(n > 0L) @@ -167,8 +168,10 @@ as.data.table.list <- function(x, keep.rownames=FALSE, key=NULL, ...) { # setattr(xx, 'names', names(x)[nz]) # x = xx #} - setDT(x, key=key) # copy ensured above; also, setDT handles naming - x + if (check.names) vnames = make.names(vnames, unique=TRUE) + setattr(ans, "names", vnames) + setDT(ans, key=key) # copy ensured above; also, setDT handles naming + ans } # don't retain classes before "data.frame" while converting diff --git a/R/data.table.R b/R/data.table.R index 42bf4df82c..a5facb12f8 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -53,7 +53,7 @@ data.table = function(..., keep.rownames=FALSE, check.names=FALSE, key=NULL, str # as.data.table which is faster; speed could be better. Revisit how many copies are taken in for example data.table(DT1,DT2) which # cbind directs to. And the nested loops for recycling lend themselves to being C level. - x <- list(...) # doesn't copy named inputs as from R >= 3.1.0 (a very welcome change) + x = list(...) # doesn't copy named inputs as from R >= 3.1.0 (a very welcome change) if (length(x)==0L) return( null.data.table() ) if (length(x)==1L && (is.null(x[[1L]]) || (is.list(x[[1L]]) && length(x[[1L]])==0L))) return( null.data.table() ) #5377 .Call(CcopyNamedInList,x) # to maintain pre-Rv3.1.0 behaviour, for now. See test 548.2. TODO: revist @@ -62,11 +62,14 @@ data.table = function(..., keep.rownames=FALSE, check.names=FALSE, key=NULL, str vnames = if (is.null(names(x))) character(length(x)) else names(x) if (length(w <- which(vnames==""))) { sub = substitute(list(...)) - for (i in w) if (is.name(nm<-sub[[i+1L]])) vnames[i] = as.character(nm) + for (i in w) { + if (is.name(nm<-sub[[i+1L]])) vnames[i]=as.character(nm) + else if ((tmp<-deparse(nm)[1L])==make.names(tmp)) vnames[i]=tmp + } names(x) = vnames } - ans = as.data.table.list(x, keep.rownames=keep.rownames) - if (check.names) setnames(ans, make.names(names(ans),unique=TRUE)) + ans = as.data.table.list(x, keep.rownames=keep.rownames, check.names=check.names) + # delete ... if (check.names) setnames(ans, make.names(names(ans),unique=TRUE)) if (!is.null(key)) { if (!is.character(key)) stop("key argument of data.table() must be character") if (length(key)==1L) { @@ -88,9 +91,11 @@ data.table = function(..., keep.rownames=FALSE, check.names=FALSE, key=NULL, str setattr(ans, "sorted", ckey) } } - # FR #643, setfactor is an internal function in fread.R - if (isTRUE(stringsAsFactors)) setfactor(ans, which(vapply(ans, is.character, TRUE)), verbose=FALSE) - ans + if (isTRUE(stringsAsFactors)) { + for (j in which(vapply(ans, is.character, TRUE))) set(ans, NULL, j, as_factor(.subset2(ans, j))) + # as_factor is internal function in fread.R currently + } + alloc.col(ans) # returns a NAMED==0 object, unlike data.frame() } replace_dot_alias = function(e) { From 39f20b36592b6f594b20f80233f88b5f95730950 Mon Sep 17 00:00:00 2001 From: mattdowle Date: Mon, 10 Jun 2019 16:41:52 -0700 Subject: [PATCH 04/15] interim --- R/as.data.table.R | 2 ++ 1 file changed, 2 insertions(+) diff --git a/R/as.data.table.R b/R/as.data.table.R index 910236ffd9..f5439b83eb 100644 --- a/R/as.data.table.R +++ b/R/as.data.table.R @@ -114,9 +114,11 @@ as.data.table.list <- function(x, keep.rownames=FALSE, key=NULL, check.names=FAL n = length(x) eachnrow = integer(n) # vector of lengths of each column. may not be equal if silent repetition is required. eachncol = integer(n) + missing.check.names = missing(check.names) for (i in seq_len(n)) { xi = x[[i]] if (is.null(xi)) next # stop("column or argument ",i," is NULL") + if (!is.null(dim(xi)) && missing.check.names) check.names=TRUE if ("POSIXlt" %chin% class(xi)) { warning("POSIXlt column type detected and converted to POSIXct. We do not recommend use of POSIXlt at all because it uses 40 bytes to store one date.") xi = x[[i]] = as.POSIXct(xi) From edaf3e06bd78c03ef98064ca0cdede5374ba6f38 Mon Sep 17 00:00:00 2001 From: mattdowle Date: Mon, 10 Jun 2019 18:24:40 -0700 Subject: [PATCH 05/15] interim --- R/as.data.table.R | 3 ++- R/setkey.R | 20 ++++++++++---------- R/utils.R | 2 +- inst/tests/tests.Rraw | 21 ++++++++++++++------- 4 files changed, 27 insertions(+), 19 deletions(-) diff --git a/R/as.data.table.R b/R/as.data.table.R index f5439b83eb..a8a7c2fc5e 100644 --- a/R/as.data.table.R +++ b/R/as.data.table.R @@ -137,7 +137,7 @@ as.data.table.list <- function(x, keep.rownames=FALSE, key=NULL, check.names=FAL nrow = max(eachnrow) ans = vector("list",ncol) # always return a new VECSXP recycle = function(x, nrow) { - if (length(x)==nrow) return(x) + if (length(x)==nrow) return(copy(x)) if (identical(x,list())) vector("list", nrow) else rep(x, length.out=nrow) } vnames = character(ncol) @@ -170,6 +170,7 @@ as.data.table.list <- function(x, keep.rownames=FALSE, key=NULL, check.names=FAL # setattr(xx, 'names', names(x)[nz]) # x = xx #} + if (any(vnames==".SD")) stop("A column may not be called .SD. That has special meaning.") if (check.names) vnames = make.names(vnames, unique=TRUE) setattr(ans, "names", vnames) setDT(ans, key=key) # copy ensured above; also, setDT handles naming diff --git a/R/setkey.R b/R/setkey.R index eb0a4b89bd..c508b27a19 100644 --- a/R/setkey.R +++ b/R/setkey.R @@ -367,6 +367,15 @@ CJ = function(..., sorted = TRUE, unique = FALSE) # Cross Join will then produce a join table with the combination of all values (cross product). # The last vector is varied the quickest in the table, so dates should be last for roll for example l = list(...) + if (isFALSE(getOption("datatable.CJ.names", TRUE))) { # added as FALSE in v1.11.6 with NEWS item saying TRUE in v1.12.0. TODO: remove in v1.13.0 + if (is.null(vnames <- names(l))) vnames = paste0("V", seq_len(length(l))) + else if (any(tt <- vnames=="")) vnames[tt] = paste0("V", which(tt)) + names(l) = vnames + } else { + names(l) = name_dots(...) + # names(l) = names(as.list(substitute(list(...)))[-1L]) + # names(l) = as.character(substitute(list(...)))[-1L] + } emptyList = FALSE ## fix for #2511 if(any(sapply(l, length) == 0L)){ ## at least one column is empty The whole thing will be empty in the end @@ -405,16 +414,7 @@ CJ = function(..., sorted = TRUE, unique = FALSE) } } } - setattr(l, "row.names", .set_row_names(length(l[[1L]]))) - setattr(l, "class", c("data.table", "data.frame")) - if (getOption("datatable.CJ.names", TRUE)) { # added as FALSE in v1.11.6 with NEWS item saying TRUE in v1.12.0. TODO: remove in v1.13.0 - vnames = name_dots(...)$vnames - } else { - if (is.null(vnames <- names(l))) vnames = paste0("V", seq_len(length(l))) - else if (any(tt <- vnames=="")) vnames[tt] = paste0("V", which(tt)) - } - setattr(l, "names", vnames) - + l = as.data.table.list(l) l = alloc.col(l) # a tiny bit wasteful to over-allocate a fixed join table (column slots only), doing it anyway for consistency, and it's possible a user may wish to use SJ directly outside a join and would expect consistent over-allocation. if (sorted) { if (!dups) setattr(l, 'sorted', names(l)) diff --git a/R/utils.R b/R/utils.R index 9f893fda4f..e58d4d0d56 100644 --- a/R/utils.R +++ b/R/utils.R @@ -83,7 +83,7 @@ name_dots = function(...) { } still_empty = vnames=="" if (any(still_empty)) vnames[still_empty] = paste0("V", which(still_empty)) - list(vnames=vnames, novname=novname) + vnames } # convert a vector like c(1, 4, 3, 2) into a string like [1, 4, 3, 2] diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 8abedfcc16..4b5d0574dc 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -8357,12 +8357,19 @@ test(1613.561, all(all.equal(x, y, ignore.row.order = FALSE), all.equal(x, y, ig test(1613.562, all(is.character(all.equal(x, y, ignore.row.order = FALSE, tolerance = 0)), is.character(all.equal(x, y, ignore.row.order = TRUE, tolerance = 0)))) test(1613.563, all(all.equal(rbind(x,y), rbind(y,y), ignore.row.order = FALSE), all.equal(rbind(x,y), rbind(y,y), ignore.row.order = TRUE))) test(1613.564, all(is.character(all.equal(rbind(x,y), rbind(y,y), ignore.row.order = FALSE, tolerance = 0)), is.character(all.equal(rbind(x,y), rbind(y,y), ignore.row.order = TRUE, tolerance = 0)))) -test(1613.565, all(all.equal(rbind(x,x,y), rbind(y,y,x), ignore.row.order = FALSE), is.character(r<-all.equal(rbind(x,x,y), rbind(y,y,x), ignore.row.order = TRUE)) && any(grepl("force 'tolerance' argument to 0", r)))) # no-match due factor force tolerance=0 +test(1613.565, all.equal(rbind(x,x,y), rbind(y,y,x), ignore.row.order = FALSE) && + is.character(r<-all.equal(rbind(x,x,y), rbind(y,y,x), ignore.row.order = TRUE)) && + any(grepl("force 'tolerance' argument to 0", r))) # no-match due factor force tolerance=0 test(1613.566, all(all.equal(rbind(x,y,y), rbind(x,y,y), ignore.row.order = FALSE, tolerance = 0), all.equal(rbind(x,y,y), rbind(x,y,y), ignore.row.order = TRUE, tolerance = 0))) -test(1613.567, all(is.character(all.equal(rbind(x,x,y), rbind(y,y,x), ignore.row.order = FALSE, tolerance = 0)), is.character(all.equal(rbind(x,x,y), rbind(y,y,x), ignore.row.order = TRUE, tolerance = 0)))) -test(1613.571, all(all.equal(cbind(x, list(factor(1))), cbind(y, list(factor(1))), ignore.row.order = FALSE), is.character(r<-all.equal(cbind(x, list(factor(1))), cbind(y, list(factor(1))), ignore.row.order = TRUE)) && any(grepl("force 'tolerance' argument to 0", r)))) # no-match due factor force tolerance=0 -test(1613.572, all(all.equal(cbind(x, list(factor(1))), cbind(x, list(factor(1))), ignore.row.order = FALSE), all.equal(cbind(x, list(factor(1))), cbind(x, list(factor(1))), ignore.row.order = TRUE))) # x to x with factor equality -test(1613.573, all.equal(cbind(x, list(factor(1))), cbind(x, list(factor(1))), ignore.row.order = TRUE, tolerance = 1), error = "Factor columns and ignore.row.order cannot be used with non 0 tolerance argument") # error due to provided non zero tolerance +test(1613.567, is.character(all.equal(rbind(x,x,y), rbind(y,y,x), ignore.row.order = FALSE, tolerance = 0)) && + is.character(all.equal(rbind(x,x,y), rbind(y,y,x), ignore.row.order = TRUE, tolerance = 0))) +test(1613.571, all.equal(cbind(x, factor(1)), cbind(y, factor(1)), ignore.row.order = FALSE) && + is.character(r<-all.equal(cbind(x, factor(1)), cbind(y, factor(1)), ignore.row.order = TRUE)) && + any(grepl("force 'tolerance' argument to 0", r))) # no-match due factor force tolerance=0 +test(1613.572, all.equal(cbind(x, factor(1)), cbind(x, factor(1)), ignore.row.order = FALSE) && + all.equal(cbind(x, factor(1)), cbind(x, factor(1)), ignore.row.order = TRUE)) # x to x with factor equality +test(1613.573, all.equal(cbind(x, factor(1)), cbind(x, factor(1)), ignore.row.order = TRUE, tolerance = 1), + error = "Factor columns and ignore.row.order cannot be used with non 0 tolerance argument") # error due to provided non zero tolerance test(1613.581, all(all.equal(x, y, ignore.row.order = FALSE, tolerance = 1), all.equal(x, y, ignore.row.order = TRUE, tolerance = 1))) test(1613.582, all(all.equal(x, y, ignore.row.order = FALSE, tolerance = sqrt(.Machine$double.eps)/2), all.equal(x, y, ignore.row.order = TRUE, tolerance = sqrt(.Machine$double.eps)/2)), warning = "Argument 'tolerance' was forced") @@ -11991,7 +11998,7 @@ for (i in 100:1) { # in the process of PR #2573 ## data.table cannot recycle complicated types short_s4_col = getClass("MethodDefinition") -test(1872.01, data.table(a = 1:4, short_s4_col), error = 'problem recycling.*try a simpler type') +test(1872.01, data.table(a = 1:4, short_s4_col), error="attempt to replicate an object of type 'S4'") ## i must be a data.table when on is specified DT = data.table(a = 1:3) test(1872.02, DT[c(TRUE, FALSE), on = 'coefficients'], error = "not a data.table, but 'on'") @@ -13444,7 +13451,7 @@ test(1967.34, data.table(1:5, NULL), data.table(V1=1:5)) ### if (novname[i]) vnames[[i]] = namesi ### but, on pause for now pending #3193 ### test(1967.35, data.table(1:5, matrix(6:15, nrow = 5L)) -test(1967.35, data.table(1:5, integer(0L)), error = 'Item 2 has no length') +test(1967.35, data.table(1:5, integer(0L)), data.table(1:5, NA_integer_)) test(1967.36, data.table(1:5, key = 5L), error = 'must be character') x = data.table(a = 1:5) From 03471e4290acb5184184733efb9c80d9517f0f69 Mon Sep 17 00:00:00 2001 From: mattdowle Date: Mon, 10 Jun 2019 19:08:47 -0700 Subject: [PATCH 06/15] interim. passing tests --- R/data.table.R | 3 ++- inst/tests/tests.Rraw | 13 +++++++------ 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/R/data.table.R b/R/data.table.R index a5facb12f8..27733536c4 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -2705,8 +2705,9 @@ setDT = function(x, keep.rownames=FALSE, key=NULL, check.names=FALSE) { } else if (is.list(x)) { # copied from as.data.table.list - except removed the copy for (i in seq_along(x)) { + if (is.null(x[[i]])) stop("Item ", i, " is NULL. Columns in data.table cannot be NULL.") if (inherits(x[[i]], "POSIXlt")) - stop("Column ", i, " is of POSIXlt type. Please convert it to POSIXct using as.POSIXct and run setDT again. We do not recommend use of POSIXlt at all because it uses 40 bytes to store one date.") + stop("Column ", i, " is of POSIXlt type. Please convert it to POSIXct using as.POSIXct and run setDT again. We do not recommend use of POSIXlt at all because it uses 40 bytes to store one date.") } n = vapply(x, length, 0L) n_range = range(n) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 4b5d0574dc..79321db67a 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -14001,11 +14001,12 @@ test(2002.08, rbindlist( list(list(a=1L, z=list(list("z"))), list(a=2L, z=list(l test(2002.09, rbindlist( list(list(a=1L, z=list(list("z",1))), list(a=2L, z=list(list(c("a","b"))))) ), data.table(a=1:2, z=list(list("z",1), list(c("a","b"))))) # tests from #3343 -DT1=list(a=NULL); setDT(DT1) -DT2=list(a=NULL); setDT(DT2) -test(2002.10, rbind(DT1, DT2), data.table(a=logical())) -test(2002.11, rbind(A=DT1, B=DT2, idcol='id'), data.table(id=character(), a=logical())) -test(2002.12, rbind(DT1, DT2, idcol='id'), data.table(id=integer(), a=logical())) +L1=list(a=NULL) +test(2002.10, setDT(L1), error="Item 1 is NULL. Columns in data.table cannot be NULL.") +L2=list(a=NULL) +test(2002.11, rbindlist(list(L1, L2)), data.table(a=logical())) +test(2002.12, rbindlist(list(A=L1, B=L2), idcol='id'), data.table(id=character(), a=logical())) +test(2002.13, rbindlist(list(L1, L2), idcol='id'), data.table(id=integer(), a=logical())) #rbindlist coverage test(2003.1, rbindlist(list(), use.names=1), error="use.names= should be TRUE, FALSE, or not used [(]\"check\" by default[)]") @@ -14075,7 +14076,7 @@ DT = structure(list(NULL), names="a", class=c("data.table","data.frame")) test(2009.1, DT[a>1], error="Column 1 is NULL; malformed data.table") DT = null.data.table() x = NULL -test(2009.2, DT[, .(x)], error="Column 1 of j evaluates to NULL. A NULL column is invalid.") +test(2009.2, DT[, .(x)], error="Item 1 is NULL. Columns in data.table cannot be NULL.") test(2009.3, data.table(character(0), NULL), data.table(character(0))) test(2009.4, as.data.table(list(y = character(0), x = NULL)), data.table(y=character())) From f1e849068cc3370affd0ce06d6cb839c3fd99bbb Mon Sep 17 00:00:00 2001 From: mattdowle Date: Wed, 12 Jun 2019 14:09:51 -0700 Subject: [PATCH 07/15] Strengthened test 1484 for #842 including comments --- R/as.data.table.R | 13 ++----------- R/data.table.R | 5 ++--- R/setkey.R | 2 +- inst/tests/tests.Rraw | 8 ++++++-- 4 files changed, 11 insertions(+), 17 deletions(-) diff --git a/R/as.data.table.R b/R/as.data.table.R index 619861d008..748406fd9e 100644 --- a/R/as.data.table.R +++ b/R/as.data.table.R @@ -122,7 +122,7 @@ as.data.table.list <- function(x, keep.rownames=FALSE, key=NULL, check.names=FAL missing.check.names = missing(check.names) for (i in seq_len(n)) { xi = x[[i]] - if (is.null(xi)) next # stop("column or argument ",i," is NULL") + if (is.null(xi)) next # eachncol already initialized to 0 by integer() above if (!is.null(dim(xi)) && missing.check.names) check.names=TRUE if ("POSIXlt" %chin% class(xi)) { warning("POSIXlt column type detected and converted to POSIXct. We do not recommend use of POSIXlt at all because it uses 40 bytes to store one date.") @@ -137,7 +137,7 @@ as.data.table.list <- function(x, keep.rownames=FALSE, key=NULL, check.names=FAL eachnrow[i] = NROW(xi) # for a vector (including list() columns) returns the length eachncol[i] = NCOL(xi) # for a vector returns 1 } - ncol = sum(eachncol) + ncol = sum(eachncol) # hence removes NULL items silently (no error or warning), #842. if (ncol==0L) return(null.data.table()) nrow = max(eachnrow) ans = vector("list",ncol) # always return a new VECSXP @@ -166,15 +166,6 @@ as.data.table.list <- function(x, keep.rownames=FALSE, key=NULL, check.names=FAL k = k+1L } } - - # fix for #842. Not sure what this is for. Revisit in PR in 1.12.4 if needed, else remove. - #if (mn > 0L) { - # nz = which(n > 0L) - # xx = point(vector("list", length(nz)), seq_along(nz), x, nz) - # if (!is.null(names(x))) - # setattr(xx, 'names', names(x)[nz]) - # x = xx - #} if (any(vnames==".SD")) stop("A column may not be called .SD. That has special meaning.") if (check.names) vnames = make.names(vnames, unique=TRUE) setattr(ans, "names", vnames) diff --git a/R/data.table.R b/R/data.table.R index 27733536c4..59495a08e9 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -2705,9 +2705,8 @@ setDT = function(x, keep.rownames=FALSE, key=NULL, check.names=FALSE) { } else if (is.list(x)) { # copied from as.data.table.list - except removed the copy for (i in seq_along(x)) { - if (is.null(x[[i]])) stop("Item ", i, " is NULL. Columns in data.table cannot be NULL.") - if (inherits(x[[i]], "POSIXlt")) - stop("Column ", i, " is of POSIXlt type. Please convert it to POSIXct using as.POSIXct and run setDT again. We do not recommend use of POSIXlt at all because it uses 40 bytes to store one date.") + if (is.null(x[[i]])) stop("Item ", i, " is NULL. Columns in data.table cannot be NULL.") + if (inherits(x[[i]], "POSIXlt")) stop("Column ", i, " is of POSIXlt type. Please convert it to POSIXct using as.POSIXct and run setDT again. We do not recommend use of POSIXlt at all because it uses 40 bytes to store one date.") } n = vapply(x, length, 0L) n_range = range(n) diff --git a/R/setkey.R b/R/setkey.R index c508b27a19..1a66636305 100644 --- a/R/setkey.R +++ b/R/setkey.R @@ -367,7 +367,7 @@ CJ = function(..., sorted = TRUE, unique = FALSE) # Cross Join will then produce a join table with the combination of all values (cross product). # The last vector is varied the quickest in the table, so dates should be last for roll for example l = list(...) - if (isFALSE(getOption("datatable.CJ.names", TRUE))) { # added as FALSE in v1.11.6 with NEWS item saying TRUE in v1.12.0. TODO: remove in v1.13.0 + if (isFALSE(getOption("datatable.CJ.names", TRUE))) { # default TRUE from v1.12.0, FALSE before. TODO: remove option in v1.13.0 as stated in news if (is.null(vnames <- names(l))) vnames = paste0("V", seq_len(length(l))) else if (any(tt <- vnames=="")) vnames[tt] = paste0("V", which(tt)) names(l) = vnames diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index a7e2c27d54..f7e9d82fd1 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -6743,14 +6743,18 @@ test(1483.3, merge(x,y,by="country",all=TRUE), data.table(country=factor(c("US", setkey(y) test(1483.4, y[x], data.table(country="US", key="country")) -# Fix for #842 +# NULL items should be removed when making data.table from list, #842 +# Original fix for #842 added a branch in as.data.table.list() using point() +# Then PR#3471 moved logic from data.table() into as.data.table.list() and now removes NULL items up front, so longer need for the branch +# Since the logic was changed, this test was strengthened to explicity test the result rather than compare two calls to SomeFunction() SomeFunction <- function(x, setnull=1L) { ans <- replicate(length(x), list("bla1", "bla2"), simplify=FALSE) ans[setnull] <- list(NULL) return(ans) } DT <- data.table(ID=1:3, key="ID") -test(1484, DT[, SomeFunction(ID, setnull=1L)], DT[, SomeFunction(ID, setnull=2L)]) +test(1484.1, DT[, SomeFunction(ID, setnull=1L)], ans<-data.table(V1=list("bla1","bla2"), V2=list("bla1","bla2"))) +test(1484.2, DT[, SomeFunction(ID, setnull=2L)], ans) # Fix for #868 vals = c("setosa", "versicolor", "virginica") From 737151d94db642e63667d932be02dca6bd94e547 Mon Sep 17 00:00:00 2001 From: mattdowle Date: Wed, 12 Jun 2019 16:21:26 -0700 Subject: [PATCH 08/15] removed copyNamedInList since as.data.table.list now does that, tidy, coverage and avoided 3 test changes --- R/as.data.table.R | 12 ++++++++++-- R/data.table.R | 22 +++------------------- R/setkey.R | 2 -- inst/tests/tests.Rraw | 10 ++++++---- src/init.c | 2 -- src/wrappers.c | 23 +---------------------- 6 files changed, 20 insertions(+), 51 deletions(-) diff --git a/R/as.data.table.R b/R/as.data.table.R index 748406fd9e..cb32e57e60 100644 --- a/R/as.data.table.R +++ b/R/as.data.table.R @@ -142,8 +142,16 @@ as.data.table.list <- function(x, keep.rownames=FALSE, key=NULL, check.names=FAL nrow = max(eachnrow) ans = vector("list",ncol) # always return a new VECSXP recycle = function(x, nrow) { - if (length(x)==nrow) return(copy(x)) - if (identical(x,list())) vector("list", nrow) else rep(x, length.out=nrow) + if (length(x)==nrow) { + return(copy(x)) + # This copy used to be achieved via .Call(CcopyNamedInList,x) at the top of data.table(). It maintains pre-Rv3.1.0 + # behavior, for now. See test 548.2. The copy() calls duplicate() at C level which (importantly) also expands ALTREP objects. + # TODO: port this as.data.table.list() to C and use MAYBE_REFERENCED(x) || ALTREP(x) to save some copies. + # That saving used to be done by CcopyNamedInList but the copies happened again as well, so removing CcopyNamedInList is + # not worse than before, and gets us in a better centralized place to port as.data.table.list to C and use MAYBE_REFERENCED + # again in future. + } + if (identical(x,list())) vector("list", nrow) else rep(x, length.out=nrow) # new objects don't need copy } vnames = character(ncol) k = 1L diff --git a/R/data.table.R b/R/data.table.R index 59495a08e9..b5173345a1 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -49,27 +49,11 @@ data.table = function(..., keep.rownames=FALSE, check.names=FALSE, key=NULL, str { # NOTE: It may be faster in some circumstances for users to create a data.table by creating a list l # first, and then setattr(l,"class",c("data.table","data.frame")) and forgo checking. - # TO DO: rewrite data.table(), one of the oldest functions here. Many people use data.table() to convert data.frame rather than - # as.data.table which is faster; speed could be better. Revisit how many copies are taken in for example data.table(DT1,DT2) which - # cbind directs to. And the nested loops for recycling lend themselves to being C level. - - x = list(...) # doesn't copy named inputs as from R >= 3.1.0 (a very welcome change) + x = list(...) # list() doesn't copy named inputs as from R >= 3.1.0 (a very welcome change) + names(x) = name_dots(...) if (length(x)==0L) return( null.data.table() ) if (length(x)==1L && (is.null(x[[1L]]) || (is.list(x[[1L]]) && length(x[[1L]])==0L))) return( null.data.table() ) #5377 - .Call(CcopyNamedInList,x) # to maintain pre-Rv3.1.0 behaviour, for now. See test 548.2. TODO: revist - # TODO Something strange with NAMED on components of `...` to investigate. Or, just port data.table() to C. - - vnames = if (is.null(names(x))) character(length(x)) else names(x) - if (length(w <- which(vnames==""))) { - sub = substitute(list(...)) - for (i in w) { - if (is.name(nm<-sub[[i+1L]])) vnames[i]=as.character(nm) - else if ((tmp<-deparse(nm)[1L])==make.names(tmp)) vnames[i]=tmp - } - names(x) = vnames - } - ans = as.data.table.list(x, keep.rownames=keep.rownames, check.names=check.names) - # delete ... if (check.names) setnames(ans, make.names(names(ans),unique=TRUE)) + ans = as.data.table.list(x, keep.rownames=keep.rownames, check.names=check.names) # see comments inside as.data.table.list re copies if (!is.null(key)) { if (!is.character(key)) stop("key argument of data.table() must be character") if (length(key)==1L) { diff --git a/R/setkey.R b/R/setkey.R index 1a66636305..19c84b24ca 100644 --- a/R/setkey.R +++ b/R/setkey.R @@ -373,8 +373,6 @@ CJ = function(..., sorted = TRUE, unique = FALSE) names(l) = vnames } else { names(l) = name_dots(...) - # names(l) = names(as.list(substitute(list(...)))[-1L]) - # names(l) = as.character(substitute(list(...)))[-1L] } emptyList = FALSE ## fix for #2511 if(any(sapply(l, length) == 0L)){ diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index f7e9d82fd1..258a5693d5 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -382,12 +382,12 @@ test(133, TESTDT[0][J(3)], data.table(a=3L,v=NA_integer_,key="a")) # These need # tests on data table names, make.names is now FALSE by default from v1.8.0 x = 2L; `1x` = 4L dt = data.table(a.1 = 1L, b_1 = 2L, "1b" = 3L, `a 1` = 4L, x, `1x`, 2*x) -test(134, names(dt), c("a.1", "b_1", "1b", "a 1", "x", "1x", "V7")) +test(134, names(dt), c("a.1", "b_1", "1b", "a 1", "x", "V6", "V7")) dt = data.table(a.1 = 1L, b_1 = 2L, "1b" = 3L, `a 1` = 4L, x, `1x`, 2*x, check.names=TRUE) -test(134.5, names(dt), c("a.1", "b_1", "X1b", "a.1.1", "x", "X1x", "V7")) +test(134.5, names(dt), c("a.1", "b_1", "X1b", "a.1.1", "x", "V6", "V7")) dt = data.table(a.1 = 1L, b_1 = 2L, "1b" = 3L, `a 1` = 4L, x, `1x`, 2*x, check.names = FALSE) -test(135, names(dt), c("a.1", "b_1", "1b", "a 1", "x", "1x", "V7")) # the last two terms differ from data.frame() +test(135, names(dt), c("a.1", "b_1", "1b", "a 1", "x", "V6", "V7")) # the last two terms differ from data.frame() test(136, dt[,b_1, by="a.1"], data.table(a.1=1L,"b_1"=2L)) test(137, dt[,`a 1`, by="a.1"], data.table(a.1=1L,"a 1"=4L, check.names=FALSE)) @@ -7100,8 +7100,10 @@ x = c(1, 2, 1) y = c(5, 8, 8, 4) options(datatable.CJ.names=FALSE) test(1525.1, CJ(x, y, unique=TRUE), CJ(V1=c(1,2), V2=c(4,5,8))) +test(1525.2, CJ(x, z=y, unique=TRUE), ans<-data.table(V1=rep(c(1,2), each=3), z=c(4,5,8), key="V1,z")) # naming of one but not both, too options(datatable.CJ.names=TRUE) -test(1525.2, CJ(x, y, unique=TRUE), CJ( x=c(1,2), y=c(4,5,8))) +test(1525.3, CJ(x, y, unique=TRUE), CJ( x=c(1,2), y=c(4,5,8))) +test(1525.4, CJ(x, z=y, unique=TRUE), setnames(ans,c("x","z"))) # `key` argument fix for `setDT` when input is already a `data.table`, #1169 DT <- data.table(A = 1:4, B = 5:8) diff --git a/src/init.c b/src/init.c index 757ff61040..d3d2e6f398 100644 --- a/src/init.c +++ b/src/init.c @@ -26,7 +26,6 @@ SEXP vecseq(); SEXP setlistelt(); SEXP setmutable(); SEXP address(); -SEXP copyNamedInList(); SEXP expandAltRep(); SEXP fmelt(); SEXP fcast(); @@ -111,7 +110,6 @@ R_CallMethodDef callMethods[] = { {"Csetlistelt", (DL_FUNC) &setlistelt, -1}, {"Csetmutable", (DL_FUNC) &setmutable, -1}, {"Caddress", (DL_FUNC) &address, -1}, -{"CcopyNamedInList", (DL_FUNC) ©NamedInList, -1}, {"CexpandAltRep", (DL_FUNC) &expandAltRep, -1}, {"Cfmelt", (DL_FUNC) &fmelt, -1}, {"Cfcast", (DL_FUNC) &fcast, -1}, diff --git a/src/wrappers.c b/src/wrappers.c index 95f2bcd0a6..2dc98264ac 100644 --- a/src/wrappers.c +++ b/src/wrappers.c @@ -80,32 +80,11 @@ SEXP address(SEXP x) return(mkString(buffer)); } -SEXP copyNamedInList(SEXP x) -{ - // As from R 3.1.0 list() no longer copies NAMED inputs - // Since data.table allows subassignment by reference, we need a way to copy NAMED inputs, still. - // But for many other applications (such as in j and elsewhere internally) the new non-copying list() in R 3.1.0 is very welcome. - - // This is intended to be called just after list(...) in data.table(). It isn't for use on a single data.table, as - // member columns of a list aren't marked as NAMED when the VECSXP is. - - // For now, this makes the old behaviour of list() in R<3.1.0 available for use, where we need it. - - if (TYPEOF(x) != VECSXP) error("x isn't a VECSXP"); - for (int i=0; i Date: Wed, 12 Jun 2019 17:29:43 -0700 Subject: [PATCH 09/15] coverage; check of column name .SD was in two places so removed the first to reach the second via test 1940 --- R/data.table.R | 4 +++- R/utils.R | 10 +++------- inst/tests/tests.Rraw | 4 +++- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/R/data.table.R b/R/data.table.R index b5173345a1..92607838e5 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -2107,7 +2107,9 @@ transform.data.table = function (`_data`, ...) inx = chmatch(tags, names(`_data`)) matched = !is.na(inx) if (any(matched)) { - if (isTRUE(attr(`_data`, ".data.table.locked", TRUE))) setattr(`_data`, ".data.table.locked", NULL) # fix for #1641 + if (isTRUE(attr(`_data`, ".data.table.locked", TRUE))) { + setattr(`_data`, ".data.table.locked", NULL) # fix for #1641, now covered by test 104.2 + } `_data`[,inx[matched]] = e[matched] `_data` = as.data.table(`_data`) } diff --git a/R/utils.R b/R/utils.R index e58d4d0d56..7d5e8361e3 100644 --- a/R/utils.R +++ b/R/utils.R @@ -70,19 +70,15 @@ name_dots = function(...) { dot_sub = as.list(substitute(list(...)))[-1L] vnames = names(dot_sub) if (is.null(vnames)) { - vnames = rep.int("", length(dot_sub)) - novname = rep.int(TRUE, length(dot_sub)) + vnames = character(length(dot_sub)) } else { vnames[is.na(vnames)] = "" - if (any(vnames==".SD")) stop("A column may not be called .SD. That has special meaning.") - novname = vnames=="" } - for (i in which(novname)) { + for (i in which(vnames=="")) { if ((tmp <- deparse(dot_sub[[i]])[1L]) == make.names(tmp)) vnames[i] = tmp } - still_empty = vnames=="" - if (any(still_empty)) vnames[still_empty] = paste0("V", which(still_empty)) + if (length(w<-which(vnames==""))) vnames[w] = paste0("V", w) vnames } diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 258a5693d5..4b1810f3cb 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -319,7 +319,9 @@ test(102, within(dt, {A <- B^2}), transform(dt, A = B^2)) # test .SD object test(103, dt[, sum(.SD$B), by = "A"], dt[, sum(B), by = "A"]) -test(104, dt[, transform(.SD, D = min(B)), by = "A"], dt[, list(B,C,D=min(B)), by = "A"]) +test(104.1, dt[, transform(.SD, D=min(B)), by="A"], dt[, list(B,C,D=min(B)), by="A"]) +# and test transform of existing column to mimic .SD example in transform.Rd mentioned in commit to #1641 +test(104.2, dt[, transform(.SD, B=sum(C)), by="A"], data.table(A=rep(1:3,each=4), B=86L, C=21:22, key="A")) # test numeric and comparison operations on a data table test(105, all(dt + dt > dt)) From 946ba2be01bf1c3693382e650a216adadc68eb2a Mon Sep 17 00:00:00 2001 From: mattdowle Date: Wed, 12 Jun 2019 18:33:56 -0700 Subject: [PATCH 10/15] two more test changes avoided --- R/as.data.table.R | 2 ++ inst/tests/tests.Rraw | 4 ++-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/R/as.data.table.R b/R/as.data.table.R index cb32e57e60..c0041b830c 100644 --- a/R/as.data.table.R +++ b/R/as.data.table.R @@ -160,6 +160,8 @@ as.data.table.list <- function(x, keep.rownames=FALSE, key=NULL, check.names=FAL if (is.null(xi)) next if (eachnrow[i]>1L && nrow%%eachnrow[i]!=0L) # in future: eachnrow[i]!=nrow warning("Item ", i, " has ", eachnrow[i], " rows but longest item has ", nrow, "; recycled with remainder.") + if (eachnrow[i]==0L && nrow>0L && is.atomic(xi)) # is.atomic to ignore list() since list() is a common way to initialize; let's not insist on list(NULL) + warning("Item ", i, " has 0 rows but longest item has ", nrow, "; filled with NA") # the rep() in recycle() above creates the NA vector if (is.data.table(xi)) { # matrix and data.frame were coerced to data.table above # vnames[[i]] = names(xi) #if (nm!="" && n>1L) paste(nm, names(xi), sep=".") else names(xi) for (j in seq_along(xi)) { diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 4b1810f3cb..4c0f2d3fb0 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -5731,7 +5731,7 @@ test(1380, DT[a==TRUE], DT[3:4]) # Fix #847, as.data.table.list and character(0) issue x <- data.table(a=character(0), b=character(0), c=numeric(0)) setkey(x, a, b) -test(1381, x[J("foo", character(0)), nomatch=0L], x) +test(1381, x[J("foo", character(0)), nomatch=0L], x, warning="Item 2 has 0 rows but longest item has 1; filled with NA") # Fix for #813 and #758 DT = data.table(x = 1:2) @@ -13459,7 +13459,7 @@ test(1967.34, data.table(1:5, NULL), data.table(V1=1:5)) ### if (novname[i]) vnames[[i]] = namesi ### but, on pause for now pending #3193 ### test(1967.35, data.table(1:5, matrix(6:15, nrow = 5L)) -test(1967.35, data.table(1:5, integer(0L)), data.table(1:5, NA_integer_)) +test(1967.35, data.table(1:5, integer(0L)), data.table(1:5, NA_integer_), warning="Item 2 has 0 rows but longest item has 5; filled with NA") test(1967.36, data.table(1:5, key = 5L), error = 'must be character') x = data.table(a = 1:5) From d917679fdba71a5369ef7f0c64c9d244a9223f37 Mon Sep 17 00:00:00 2001 From: mattdowle Date: Wed, 12 Jun 2019 19:30:26 -0700 Subject: [PATCH 11/15] tests 1613.571-3 reduced to their list() wrapper difference only --- R/as.data.table.R | 7 +++++-- inst/tests/tests.Rraw | 17 +++++------------ 2 files changed, 10 insertions(+), 14 deletions(-) diff --git a/R/as.data.table.R b/R/as.data.table.R index c0041b830c..ca7be9251e 100644 --- a/R/as.data.table.R +++ b/R/as.data.table.R @@ -127,8 +127,11 @@ as.data.table.list <- function(x, keep.rownames=FALSE, key=NULL, check.names=FAL if ("POSIXlt" %chin% class(xi)) { warning("POSIXlt column type detected and converted to POSIXct. We do not recommend use of POSIXlt at all because it uses 40 bytes to store one date.") xi = x[[i]] = as.POSIXct(xi) - } else if (is.matrix(xi) || is.data.frame(xi)) { # including data.table (a data.frame, too) - xi = x[[i]] = as.data.table(xi, keep.rownames=keep.rownames) # we will never allow a matrix to be a column; always unpack the columns + } else if (is.matrix(xi) || is.data.frame(xi)) { + if (!is.data.table(xi)) { + xi = x[[i]] = as.data.table(xi, keep.rownames=keep.rownames) # we will never allow a matrix to be a column; always unpack the columns + } + # else avoid dispatching to as.data.table.data.table (which exists and copies) } else if (is.table(xi)) { xi = x[[i]] = as.data.table.table(xi, keep.rownames=keep.rownames) } else if (is.function(xi)) { diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 4c0f2d3fb0..7c550b0c6e 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -8365,19 +8365,12 @@ test(1613.561, all(all.equal(x, y, ignore.row.order = FALSE), all.equal(x, y, ig test(1613.562, all(is.character(all.equal(x, y, ignore.row.order = FALSE, tolerance = 0)), is.character(all.equal(x, y, ignore.row.order = TRUE, tolerance = 0)))) test(1613.563, all(all.equal(rbind(x,y), rbind(y,y), ignore.row.order = FALSE), all.equal(rbind(x,y), rbind(y,y), ignore.row.order = TRUE))) test(1613.564, all(is.character(all.equal(rbind(x,y), rbind(y,y), ignore.row.order = FALSE, tolerance = 0)), is.character(all.equal(rbind(x,y), rbind(y,y), ignore.row.order = TRUE, tolerance = 0)))) -test(1613.565, all.equal(rbind(x,x,y), rbind(y,y,x), ignore.row.order = FALSE) && - is.character(r<-all.equal(rbind(x,x,y), rbind(y,y,x), ignore.row.order = TRUE)) && - any(grepl("force 'tolerance' argument to 0", r))) # no-match due factor force tolerance=0 +test(1613.565, all(all.equal(rbind(x,x,y), rbind(y,y,x), ignore.row.order = FALSE), is.character(r<-all.equal(rbind(x,x,y), rbind(y,y,x), ignore.row.order = TRUE)) && any(grepl("force 'tolerance' argument to 0", r)))) # no-match due factor force tolerance=0 test(1613.566, all(all.equal(rbind(x,y,y), rbind(x,y,y), ignore.row.order = FALSE, tolerance = 0), all.equal(rbind(x,y,y), rbind(x,y,y), ignore.row.order = TRUE, tolerance = 0))) -test(1613.567, is.character(all.equal(rbind(x,x,y), rbind(y,y,x), ignore.row.order = FALSE, tolerance = 0)) && - is.character(all.equal(rbind(x,x,y), rbind(y,y,x), ignore.row.order = TRUE, tolerance = 0))) -test(1613.571, all.equal(cbind(x, factor(1)), cbind(y, factor(1)), ignore.row.order = FALSE) && - is.character(r<-all.equal(cbind(x, factor(1)), cbind(y, factor(1)), ignore.row.order = TRUE)) && - any(grepl("force 'tolerance' argument to 0", r))) # no-match due factor force tolerance=0 -test(1613.572, all.equal(cbind(x, factor(1)), cbind(x, factor(1)), ignore.row.order = FALSE) && - all.equal(cbind(x, factor(1)), cbind(x, factor(1)), ignore.row.order = TRUE)) # x to x with factor equality -test(1613.573, all.equal(cbind(x, factor(1)), cbind(x, factor(1)), ignore.row.order = TRUE, tolerance = 1), - error = "Factor columns and ignore.row.order cannot be used with non 0 tolerance argument") # error due to provided non zero tolerance +test(1613.567, all(is.character(all.equal(rbind(x,x,y), rbind(y,y,x), ignore.row.order = FALSE, tolerance = 0)), is.character(all.equal(rbind(x,x,y), rbind(y,y,x), ignore.row.order = TRUE, tolerance = 0)))) +test(1613.571, all(all.equal(cbind(x, factor(1)), cbind(y, factor(1)), ignore.row.order = FALSE), is.character(r<-all.equal(cbind(x, factor(1)), cbind(y, factor(1)), ignore.row.order = TRUE)) && any(grepl("force 'tolerance' argument to 0", r)))) # no-match due factor force tolerance=0 +test(1613.572, all(all.equal(cbind(x, factor(1)), cbind(x, factor(1)), ignore.row.order = FALSE), all.equal(cbind(x, factor(1)), cbind(x, factor(1)), ignore.row.order = TRUE))) # x to x with factor equality +test(1613.573, all.equal(cbind(x, factor(1)), cbind(x, factor(1)), ignore.row.order = TRUE, tolerance = 1), error = "Factor columns and ignore.row.order cannot be used with non 0 tolerance argument") # error due to provided non zero tolerance test(1613.581, all(all.equal(x, y, ignore.row.order = FALSE, tolerance = 1), all.equal(x, y, ignore.row.order = TRUE, tolerance = 1))) test(1613.582, all(all.equal(x, y, ignore.row.order = FALSE, tolerance = sqrt(.Machine$double.eps)/2), all.equal(x, y, ignore.row.order = TRUE, tolerance = sqrt(.Machine$double.eps)/2)), warning = "Argument 'tolerance' was forced") From 55da6e53f2cbdd18f45406b5358f491eba6c5ca8 Mon Sep 17 00:00:00 2001 From: mattdowle Date: Wed, 12 Jun 2019 20:15:04 -0700 Subject: [PATCH 12/15] resolved tests 2009.* --- R/data.table.R | 7 +++++-- inst/tests/tests.Rraw | 10 +++++++--- 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/R/data.table.R b/R/data.table.R index 92607838e5..6b35bab0e7 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -1304,9 +1304,12 @@ replace_order = function(isub, verbose, env) { # avoid copy if all vectors are already of same lengths, use setDT lenjval = vapply(jval, length, 0L) if (any(lenjval != lenjval[1L])) { - jval = as.data.table.list(jval) # does the vector expansion to create equal length vectors + jval = as.data.table.list(jval) # does the vector expansion to create equal length vectors, and drops any NULL items jvnames = jvnames[lenjval != 0L] # fix for #1477 - } else setDT(jval) + } else { + if (identical(jval, list(NULL))) return(null.data.table()) # test 2009.2 & 2009.3, otherwise setDT correctly errors that a column can't be NULL + setDT(jval) + } } if (is.null(jvnames)) jvnames = character(length(jval)-length(bynames)) ww = which(jvnames=="") diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 7c550b0c6e..f74d8bb7c8 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -14077,9 +14077,13 @@ DT = structure(list(NULL), names="a", class=c("data.table","data.frame")) test(2009.1, DT[a>1], error="Column 1 is NULL; malformed data.table") DT = null.data.table() x = NULL -test(2009.2, DT[, .(x)], error="Item 1 is NULL. Columns in data.table cannot be NULL.") -test(2009.3, data.table(character(0), NULL), data.table(character(0))) -test(2009.4, as.data.table(list(y = character(0), x = NULL)), data.table(y=character())) +test(2009.2, DT[, .(x)], null.data.table()) # because .(x) evaluated to .(NULL); NULL columns in results removed +DT = data.table(A=1:3) +test(2009.3, DT[, .(x)], null.data.table()) +test(2009.4, DT[, .(x, sum(A))], data.table(V1=6L)) +test(2009.5, DT[, .(sum(A), x)], data.table(V1=6L)) +test(2009.6, data.table(character(0), NULL), data.table(V1=character())) +test(2009.7, as.data.table(list(y = character(0), x = NULL)), data.table(y=character())) # use.names="check" message|warning for out-of-order; https://github.com/Rdatatable/data.table/pull/3455#issuecomment-472744347 DT1 = data.table(a=1:2, b=5:6) From a882a99fb50366dabaf4c4c6e529dc9dce248edd Mon Sep 17 00:00:00 2001 From: mattdowle Date: Thu, 13 Jun 2019 15:06:51 -0700 Subject: [PATCH 13/15] two news items and tests --- NEWS.md | 22 ++++++++++++++++++++-- inst/tests/tests.Rraw | 33 +++++++++++++++++++++++++++++++++ 2 files changed, 53 insertions(+), 2 deletions(-) diff --git a/NEWS.md b/NEWS.md index 9e8c5a76c1..0daf8b4824 100644 --- a/NEWS.md +++ b/NEWS.md @@ -92,6 +92,8 @@ 15. `DT[order(col)[1:5], ...]` (i.e. where `i` is a compound expression involving `order()`) is now optimized to use `data.table`'s multithreaded `forder`, [#1921](https://github.com/Rdatatable/data.table/issues/1921). This example is not a fully optimal top-N query since the full ordering is still computed. The improvement is that the call to `order()` is computed faster for any `i` expression using `order`. +16. `as.data.table` now unpacks columns in a `data.frame` which are themselves a `data.frame`. This need arises when parsing JSON, a corollary in [#3369](https://github.com/Rdatatable/data.table/issues/3369#issuecomment-462662752). `data.table` does not allow columns to be objects which themselves have columns (such as `matrix` and `data.frame`), unlike `data.frame` which does. Bug fix 19 in v1.12.2 (see below) added a helpful error (rather than segfault) to detect such invalid `data.table`, and promised that `as.data.table()` would unpack these columns in the next release (i.e. this release) so that the invalid `data.table` is not created in the first place. + #### BUG FIXES 1. `first`, `last`, `head` and `tail` by group no longer error in some cases, [#2030](https://github.com/Rdatatable/data.table/issues/2030) [#3462](https://github.com/Rdatatable/data.table/issues/3462). Thanks to @franknarf1 for reporting. @@ -134,7 +136,23 @@ 20. `c`, `seq` and `mean` of `ITime` objects now retain the `ITime` class via new `ITime` methods, [#3628](https://github.com/Rdatatable/data.table/issues/3628). Thanks @UweBlock for reporting. The `cut` and `split` methods for `ITime` have been removed since the default methods work, [#3630](https://github.com/Rdatatable/data.table/pull/3630). -20. `as.data.table.array` now handles the case when some of the array's dimension names are `NULL`, [#3636](https://github.com/Rdatatable/data.table/issues/3636). +21. `as.data.table.array` now handles the case when some of the array's dimension names are `NULL`, [#3636](https://github.com/Rdatatable/data.table/issues/3636). + +22. Adding a `list` column using `cbind`, `as.data.table`, or `data.table` now works rather than treating the `list` as if it were a set of columns, [#3471](https://github.com/Rdatatable/data.table/pull/3471). However, please note that using `:=` to add columns is preferred. + + ```R + # cbind( data.table(1:2), list(c("a","b"),"a") ) + # V1 V2 NA # v1.12.2 and before + # # introduced invalid NA column name too + # 1: 1 a a + # 2: 2 b a + # + # V1 V2 # v1.12.4+ + # + # 1: 1 a,b + # 2: 2 a + ``` + #### NOTES @@ -231,7 +249,7 @@ 18. `cbind` with a null (0-column) `data.table` now works as expected, [#3445](https://github.com/Rdatatable/data.table/issues/3445). Thanks to @mb706 for reporting. -19. Subsetting does a better job of catching a malformed `data.table` with error rather than segfault. A column may not be NULL, nor may a column be an object such as a data.frame or matrix which have columns. Thanks to a comment and reproducible example in [#3369](https://github.com/Rdatatable/data.table/issues/3369) from Drew Abbot which demonstrated the issue which arose from parsing JSON. The next release will enable `as.data.table` to unpack columns which are data.frame to support this use case. +19. Subsetting does a better job of catching a malformed `data.table` with error rather than segfault. A column may not be NULL, nor may a column be an object which has columns (such as a `data.frame` or `matrix`). Thanks to a comment and reproducible example in [#3369](https://github.com/Rdatatable/data.table/issues/3369) from Drew Abbot which demonstrated the issue which arose from parsing JSON. The next release will enable `as.data.table` to unpack columns which are `data.frame` to support this use case. #### NOTES diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index f74d8bb7c8..8ab05b376a 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -15009,6 +15009,39 @@ DF = structure(list(a=list(c("a","b"), c("a","b"), c("a","b")), b=data.frame(foo DT = as.data.table(DF) test(2057.3, DT, data.table(a=list(c("a","b"), c("a","b"), c("a","b")), foo=1:3, bar=4:6)) +# one and two+ row cases of data.table, as.data.table and cbind involving list columns, given +# the change to tests 1613.571-3 in PR#3471 in v1.12.4 +# in v1.12.2 and before : +# data.table( data.table(1:2), list(c("a","b"),"a") ) +# V1 V2 NA +# +# 1: 1 a a +# 2: 2 b a +# i.e. passing a data.table() to data.table() changed the meaning of list() which was inconsistent, +# and an NA column name was introduced too (a bug in itself) +# from v1.12.4 : +# V1 V2 +# +# 1: 1 a,b +# 2: 2 a +# i.e. now easier to add the list column as intended, and it's consistent with +# basic (i.e. not cbind-like) usage of data.table() +# # changed in v1.12.4 ? +ans = data.table(V1=1, V2=2) # -------------------- +test(2058.01, data.table( data.table(1), 2), ans) # no +test(2058.02, as.data.table(list(data.table(1), 2)), ans) # no +test(2058.03, cbind(data.table(1), 2), ans) # no +ans = data.table(V1=1, V2=list(2)) # 'basic' usage; i.e. not cbind-like +test(2058.04, sapply(ans, class), c(V1="numeric", V2="list")) # no +test(2058.05, data.table( data.table(1), list(2) ), ans) # yes +test(2058.06, as.data.table(list(data.table(1), list(2))), ans) # yes +test(2058.07, cbind(data.table(1), list(2)), ans) # yes +ans = data.table(V1=1:2, V2=list(c("a","b"),"a")) +test(2058.08, sapply(ans, class), c(V1="integer", V2="list")) # no +test(2058.09, data.table( data.table(1:2), list(c("a","b"),"a") ), ans) # yes +test(2058.10, as.data.table(list(data.table(1:2), list(c("a","b"),"a"))), ans) # yes +test(2058.11, cbind(data.table(1:2), list(c("a","b"),"a")), ans) # yes + ################################### # Add new tests above this line # From 3775fc3312dfd4e51b3cb391d8e21c98b6c2cc1e Mon Sep 17 00:00:00 2001 From: mattdowle Date: Thu, 13 Jun 2019 17:05:53 -0700 Subject: [PATCH 14/15] setDT(list) allowed to create NULL columns, changes to tests 2002 no longer needed, passes eplusr --- R/data.table.R | 24 +++++++++++------------- inst/tests/tests.Rraw | 11 +++++------ 2 files changed, 16 insertions(+), 19 deletions(-) diff --git a/R/data.table.R b/R/data.table.R index 6b35bab0e7..fc80068ebc 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -1301,13 +1301,13 @@ replace_order = function(isub, verbose, env) { jval = data.table(jval) # TO DO: should this be setDT(list(jval)) instead? } else { if (is.null(jvnames)) jvnames=names(jval) - # avoid copy if all vectors are already of same lengths, use setDT lenjval = vapply(jval, length, 0L) - if (any(lenjval != lenjval[1L])) { + nulljval = vapply(jval, is.null, FALSE) + if (lenjval[1L]==0L || any(lenjval != lenjval[1L])) { jval = as.data.table.list(jval) # does the vector expansion to create equal length vectors, and drops any NULL items - jvnames = jvnames[lenjval != 0L] # fix for #1477 + jvnames = jvnames[!nulljval] # fix for #1477 } else { - if (identical(jval, list(NULL))) return(null.data.table()) # test 2009.2 & 2009.3, otherwise setDT correctly errors that a column can't be NULL + # all columns same length and at least 1 row; avoid copy. TODO: remove when as.data.table.list is ported to C setDT(jval) } } @@ -1325,9 +1325,7 @@ replace_order = function(isub, verbose, env) { setattr(jval, 'class', class(x)) # fix for #5296 if (haskey(x) && all(key(x) %chin% names(jval)) && suppressWarnings(is.sorted(jval, by=key(x)))) # TO DO: perhaps this usage of is.sorted should be allowed internally then (tidy up and make efficient) setattr(jval, 'sorted', key(x)) - # postponed to v1.12.4 because package eplusr creates a NULL column and then runs setcolorder on the result which fails if there are fewer columns - # w = sapply(jval, is.null) - # if (any(w)) jval = jval[,!w,with=FALSE] # no !..w due to 'Undefined global functions or variables' note from R CMD check + if (any(sapply(jval, is.null))) stop("Internal error: j has created a data.table result containing a NULL column") # nocov } return(jval) } @@ -2694,18 +2692,18 @@ setDT = function(x, keep.rownames=FALSE, key=NULL, check.names=FALSE) { } else if (is.list(x)) { # copied from as.data.table.list - except removed the copy for (i in seq_along(x)) { - if (is.null(x[[i]])) stop("Item ", i, " is NULL. Columns in data.table cannot be NULL.") + if (is.null(x[[i]])) next # allow NULL columns to be created by setDT(list) even though they are not really allowed + # many operations still work in the presence of NULL columns and it might be convenient + # e.g. in package eplusr which calls setDT on a list when parsing JSON. Operations which + # fail for NULL columns will give helpful error at that point, #3480 and #3471 if (inherits(x[[i]], "POSIXlt")) stop("Column ", i, " is of POSIXlt type. Please convert it to POSIXct using as.POSIXct and run setDT again. We do not recommend use of POSIXlt at all because it uses 40 bytes to store one date.") } n = vapply(x, length, 0L) n_range = range(n) if (n_range[1L] != n_range[2L]) { tbl = sort(table(n)) - stop("All elements in argument 'x' to 'setDT' must be of same length, ", - "but the profile of input lengths (length:frequency) is: ", - brackify(sprintf('%s:%d', names(tbl), tbl)), - "\nThe first entry with fewer than ", n_range[2L], - " entries is ", which.max(n Date: Thu, 13 Jun 2019 17:23:21 -0700 Subject: [PATCH 15/15] <- -> = --- R/as.data.table.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/as.data.table.R b/R/as.data.table.R index ca7be9251e..8aac617e9b 100644 --- a/R/as.data.table.R +++ b/R/as.data.table.R @@ -115,7 +115,7 @@ as.data.table.array = function(x, keep.rownames=FALSE, key=NULL, sorted=TRUE, va ans[] } -as.data.table.list <- function(x, keep.rownames=FALSE, key=NULL, check.names=FALSE, ...) { +as.data.table.list = function(x, keep.rownames=FALSE, key=NULL, check.names=FALSE, ...) { n = length(x) eachnrow = integer(n) # vector of lengths of each column. may not be equal if silent repetition is required. eachncol = integer(n)