diff --git a/NEWS.md b/NEWS.md index 372dfcde64..c9a8280c3f 100644 --- a/NEWS.md +++ b/NEWS.md @@ -12,6 +12,37 @@ 3. `fwrite()` now writes UTF-8 or native csv files by specifying the `encoding=` argument, [#1770](https://github.com/Rdatatable/data.table/pull/1770). Thanks to @shrektan for the request and the PR. +4. `data.table()` no longer fills empty vectors with `NA` (accompanied by a warning). Instead, a 0-row `data.table` is returned, [#3727](https://github.com/Rdatatable/data.table/issues/3727). Since `data.table()` is used internally by `.()`, this brings the following examples in line with expectations in most cases. Thanks to @shrektan for the suggestion and PR. + + ```R + DT = data.table(A=1:3, B=letters[1:3]) + DT[A>3, .(ITEM='A>3', A, B)] # (1) + DT[A>3][, .(ITEM='A>3', A, B)] # (2) + # the above are now equivalent as expected and return: + Empty data.table (0 rows and 3 cols): ITEM,A,B + # Previously, (2) returned : + ITEM A B + <char> <int> <char> + 1: A>3 NA <NA> + Warning messages: + 1: In as.data.table.list(jval, .named = NULL) : + Item 2 has 0 rows but longest item has 1; filled with NA + 2: In as.data.table.list(jval, .named = NULL) : + Item 3 has 0 rows but longest item has 1; filled with NA + ``` + + ```R + DT = data.table(A=1:3, B=letters[1:3], key="A") + DT[.(1:3, double()), B] + # new result : + character(0) + # old result : + [1] "a" "b" "c" + Warning message: + In as.data.table.list(i) : + Item 2 has 0 rows but longest item has 3; filled with NA + ``` + ## BUG FIXES 1. `by=.EACHI` when `i` is keyed but `on=` different columns than `i`'s key could create an invalidly keyed result, [#4603](https://github.com/Rdatatable/data.table/issues/4603) [#4911](https://github.com/Rdatatable/data.table/issues/4911). Thanks to @myoung3 and @adamaltmejd for reporting, and @ColeMiller1 for the PR. An invalid key is where a `data.table` is marked as sorted by the key columns but the data is not sorted by those columns, leading to incorrect results from subsequent queries. 
diff --git a/R/as.data.table.R b/R/as.data.table.R index 308a7b2ffe..47219206a2 100644 --- a/R/as.data.table.R +++ b/R/as.data.table.R @@ -129,6 +129,7 @@ as.data.table.list = function(x, eachncol = integer(n) missing.check.names = missing(check.names) origListNames = if (missing(.named)) names(x) else NULL # as.data.table called directly, not from inside data.table() which provides .named, #3854 + empty_atomic = FALSE for (i in seq_len(n)) { xi = x[[i]] if (is.null(xi)) next # eachncol already initialized to 0 by integer() above @@ -148,10 +149,13 @@ as.data.table.list = function(x, } eachnrow[i] = NROW(xi) # for a vector (including list() columns) returns the length eachncol[i] = NCOL(xi) # for a vector returns 1 + if (is.atomic(xi) && length(xi)==0L && !is.null(xi)) { + empty_atomic = TRUE # any empty atomic (not empty list()) should result in nrows=0L, #3727 + } } ncol = sum(eachncol) # hence removes NULL items silently (no error or warning), #842. if (ncol==0L) return(null.data.table()) - nrow = max(eachnrow) + nrow = if (empty_atomic) 0L else max(eachnrow) ans = vector("list",ncol) # always return a new VECSXP recycle = function(x, nrow) { if (length(x)==nrow) { @@ -173,8 +177,6 @@ as.data.table.list = function(x, if (is.null(xi)) { n_null = n_null+1L; next } if (eachnrow[i]>1L && nrow%%eachnrow[i]!=0L) # in future: eachnrow[i]!=nrow warning("Item ", i, " has ", eachnrow[i], " rows but longest item has ", nrow, "; recycled with remainder.") - if (eachnrow[i]==0L && nrow>0L && is.atomic(xi)) # is.atomic to ignore list() since list() is a common way to initialize; let's not insist on list(NULL) - warning("Item ", i, " has 0 rows but longest item has ", nrow, "; filled with NA") # the rep() in recycle() above creates the NA vector if (is.data.table(xi)) { # matrix and data.frame were coerced to data.table above prefix = if (!isFALSE(.named[i]) && isTRUE(nchar(names(x)[i])>0L)) paste0(names(x)[i],".") else "" # test 2058.12 for (j in seq_along(xi)) { diff --git 
a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 9ae4864fe2..ba7cb0579e 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -5839,7 +5839,7 @@ test(1380, DT[a==TRUE], DT[3:4]) # Fix #847, as.data.table.list and character(0) issue x <- data.table(a=character(0), b=character(0), c=numeric(0)) setkey(x, a, b) -test(1381, x[J("foo", character(0)), nomatch=0L], x, warning="Item 2 has 0 rows but longest item has 1; filled with NA") +test(1381, x[J("foo", character(0)), nomatch=0L], x) # Fix for #813 and #758 DT = data.table(x = 1:2) @@ -13754,7 +13754,7 @@ test(1967.34, data.table(1:5, NULL), data.table(V1=1:5)) ### if (novname[i]) vnames[[i]] = namesi ### but, on pause for now pending #3193 ### test(1967.35, data.table(1:5, matrix(6:15, nrow = 5L)) -test(1967.35, data.table(1:5, integer(0L)), data.table(1:5, NA_integer_), warning="Item 2 has 0 rows but longest item has 5; filled with NA") +test(1967.35, data.table(1:5, integer(0L)), data.table(integer(0L), integer(0L))) # no longer NA-fill zero-length, PR#4262 test(1967.36, data.table(1:5, key = 5L), error = 'must be character') x = data.table(a = 1:5) @@ -17346,3 +17346,14 @@ test(2170.2, DT[A > -1, which = NA], 1L) test(2170.3, DT[A > -1 | is.na(A), which = NA], integer()) test(2170.4, DT[A > 10, which = NA], seq_len(nrow(DT))) test(2170.5, DT[!(A > 1), which = NA], c(1:3,6L)) # matches DT[A <= 1, which = NA] + +# data.table() zero-nrow result if any non-null & atomic element is length 0, #3727 +test(2171.1, data.table(A=double(), B=1:2), data.table(A=double(), B=integer())) +DT = data.table(CODE=c('a','b'), DATE=1:2, VALUE=c(1.3, 1.5), key=c('CODE','DATE')) +test(2171.2, DT[J(character(), 1), VALUE], double()) # because "J" is a wrapper of list() +test(2171.3, data.table(A=NULL, B=1.0), data.table(B=1.0)) # NULL is omitted +test(2171.4, NROW(data.table(A=list(), B=1.0)), 1L) # empty list() regarded as `list(list())` which is length 1, and recycled +DT = data.table(A=1:3, B=letters[1:3]) 
+test(2171.5, ans <- DT[A>3, .(ITEM='A>3', A, B)], # now identical as expected + DT[A>3][, .(ITEM='A>3', A, B)]) +test(2171.6, ans, data.table(ITEM=character(), A=integer(), B=character())) # not just identical to each other, but correct too