diff --git a/NEWS.md b/NEWS.md index adc1c48362..52b5d76768 100644 --- a/NEWS.md +++ b/NEWS.md @@ -6,6 +6,8 @@ ## NEW FEATURES +1. `DT[, {...; .(A,B)}]` (when `.()` is the final item of a multi-statement `{...}`) now auto-names the columns `A` and `B` (just like `DT[, .(A,B)]`) rather than `V1` and `V2`, [#2478](https://github.com/Rdatatable/data.table/issues/2478) [#609](https://github.com/Rdatatable/data.table/issues/609). Similarly, `DT[, if (.N>1) .(B), by=A]` now auto-names the column `B` rather than `V1`. Explicit names are unaffected; e.g. `DT[, {... y= ...; .(A=C+y)}, by=...]` named the column `A` before, and still does. + ## BUG FIXES ## NOTES diff --git a/R/data.table.R b/R/data.table.R index e774a7f4a7..08ef6872b1 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -81,7 +81,7 @@ data.table = function(..., keep.rownames=FALSE, check.names=FALSE, key=NULL, str } } if (isTRUE(stringsAsFactors)) { - for (j in which(vapply(ans, is.character, TRUE))) set(ans, NULL, j, as_factor(.subset2(ans, j))) + for (j in which(vapply_1b(ans, is.character))) set(ans, NULL, j, as_factor(.subset2(ans, j))) # as_factor is internal function in fread.R currently } setalloccol(ans) # returns a NAMED==0 object, unlike data.frame() @@ -854,29 +854,54 @@ replace_dot_alias = function(e) { } jvnames = NULL - if (is.name(jsub)) { - # j is a single unquoted column name - if (jsub!=".SD") { - jvnames = gsub("^[.](N|I|GRP|BY)$","\\1",as.character(jsub)) - # jsub is list()ed after it's eval'd inside dogroups. - } - } else if (is.call(jsub) && as.character(jsub[[1L]])[[1L]] %chin% c("list",".")) { - jsub[[1L]] = quote(list) - jsubl = as.list.default(jsub) # TO DO: names(jsub) and names(jsub)="" seem to work so make use of that - if (length(jsubl)>1L) { - jvnames = names(jsubl)[-1L] # check list(a=sum(v),v) - if (is.null(jvnames)) jvnames = rep.int("", length(jsubl)-1L) - for (jj in seq.int(2L,length(jsubl))) { - if (jvnames[jj-1L] == "" && mode(jsubl[[jj]])=="name") { - if (jsubl[[jj]]=="") stop("Item ", jj-1L, " of the .() or list() passed to j is missing") #3507 - jvnames[jj-1L] = gsub("^[.](N|I|GRP|BY)$", "\\1", deparse(jsubl[[jj]])) + drop_dot = function(x) { + tt = x %chin% c(".N",".I",".GRP",".BY") + if (any(tt)) x[tt] = substring(x[tt], 2L) + x + } + # handle auto-naming of last item of j (e.g. within {} or if/else, #2478) + # e.g. DT[, .(a=sum(v), v, .N), by=] should create columns named a, v, N + do_j_names = function(q) { + if (!is.call(q) || !is.name(q[[1L]])) return(q) + if (as.character(q[[1L]]) %chin% c('list', '.')) { + q[[1L]] = quote(list) + qlen = length(q) + if (qlen>1L) { + nm = names(q[-1L]) # check list(a=sum(v),v) + if (is.null(nm)) nm = rep.int("", qlen-1L) + # attempt to auto-name unnamed columns + idx = which(!nzchar(nm)) + for (jj in idx) { + thisq = q[[jj + 1L]] + if (missing(thisq)) stop("Item ", jj, " of the .() or list() passed to j is missing") #3507 + if (is.name(thisq)) nm[jj] = as.character(thisq) + # TO DO: if call to a[1] for example, then call it 'a' too } - # TO DO: if call to a[1] for example, then call it 'a' too + nm[idx] = drop_dot(nm[idx]) + if (!is.null(jvnames) && any(idx <- nm != jvnames)) + warning("Different branches of j expression produced different auto-named columns: ", brackify(sprintf('%s!=%s', nm[idx], jvnames[idx])), '; using the most "last" names', call. = FALSE) + jvnames <<- nm # TODO: handle if() list(a, b) else list(b, a) better + setattr(q, "names", NULL) # drops the names from the list so it's faster to eval the j for each group. We'll put them back afterwards on the result. } - setattr(jsubl, "names", NULL) # drops the names from the list so it's faster to eval the j for each group. We'll put them back afterwards on the result. - jsub = as.call(jsubl) - } # else empty list is needed for test 468: adding an empty list column - } # else maybe a call to transform or something which returns a list. + return(q) # else empty list is needed for test 468: adding an empty list column + } + if (q[[1L]] == '{') { + q[[length(q)]] = do_j_names(q[[length(q)]]) + return(q) + } + if (q[[1L]] == 'if') { + #explicit NULL would return NULL, assigning NULL would delete that from the expression + if (!is.null(q[[3L]])) q[[3L]] = do_j_names(q[[3L]]) + if (length(q) == 4L && !is.null(q[[4L]])) q[[4L]] = do_j_names(q[[4L]]) + return(q) + } + return(q) + } + if (is.name(jsub)) { + # j is a single unquoted column name + if (jsub!=".SD") jvnames = drop_dot(as.character(jsub)) + # jsub is list()ed after it's eval'd inside dogroups. + } else jsub = do_j_names(jsub) # else maybe a call to transform or something which returns a list. av = all.vars(jsub,TRUE) # TRUE fixes bug #1294 which didn't see b in j=fns[[b]](c) use.I = ".I" %chin% av if (any(c(".SD","eval","get","mget") %chin% av)) { @@ -1441,7 +1466,7 @@ replace_dot_alias = function(e) { lockBinding(".iSD",SDenv) GForce = FALSE - if ( getOption("datatable.optimize")>=1L && (is.call(jsub) || (is.name(jsub) && as.character(jsub)[[1L]] %chin% c(".SD",".N"))) ) { # Ability to turn off if problems or to benchmark the benefit + if ( getOption("datatable.optimize")>=1L && (is.call(jsub) || (is.name(jsub) && as.character(jsub)[1L] %chin% c(".SD", ".N"))) ) { # Ability to turn off if problems or to benchmark the benefit # Optimization to reduce overhead of calling lapply over and over for each group oldjsub = jsub funi = 1L # Fix for #985 @@ -1600,7 +1625,7 @@ replace_dot_alias = function(e) { if (getOption("datatable.optimize")>=2L && !is.data.table(i) && !byjoin && length(f__) && !length(lhs)) { if (!length(ansvars) && !use.I) { GForce = FALSE - if ( (is.name(jsub) && jsub == ".N") || (is.call(jsub) && length(jsub)==2L && length(as.character(jsub[[1L]])) && as.character(jsub[[1L]])[1L] == "list" && length(as.character(jsub[[2L]])) && as.character(jsub[[2L]])[1L] == ".N") ) { + if ( (is.name(jsub) && jsub == ".N") || (is.call(jsub) && length(jsub)==2L && jsub[[1L]]== "list" && jsub[[2L]] == ".N") ) { GForce = TRUE if (verbose) cat("GForce optimized j to '",deparse(jsub, width.cutoff=200L, nlines=1L),"'\n",sep="") } @@ -2293,7 +2318,7 @@ copy = function(x) { if (!is.data.table(x)) { # fix for #1476. TODO: find if a cleaner fix is possible.. if (is.list(x)) { - anydt = vapply(x, is.data.table, TRUE, USE.NAMES=FALSE) + anydt = vapply_1b(x, is.data.table, use.names=FALSE) if (sum(anydt)) { newx[anydt] = lapply(newx[anydt], function(x) { .Call(C_unlock, x) @@ -2591,7 +2616,7 @@ setDF = function(x, rownames=NULL) { } x } else { - n = vapply(x, length, 0L) + n = vapply_1i(x, length) mn = max(n) if (any(n 1L && prod(vapply(i, length, integer(1L))) > 1e4){ + if(length(i) > 1L && prod(vapply_1i(i, length)) > 1e4){ ## CJ would result in more than 1e4 rows. This would be inefficient, especially memory-wise #2635 if (verbose) {cat("Subsetting optimization disabled because the cross-product of RHS values exceeds 1e4, causing memory problems.\n");flush.console()} return(NULL) diff --git a/R/fread.R b/R/fread.R index d7ea4de61b..b337743934 100644 --- a/R/fread.R +++ b/R/fread.R @@ -312,9 +312,9 @@ yaml=FALSE, autostart=NA, tmpdir=tempdir()) if (stringsAsFactors) { if (is.double(stringsAsFactors)) { #2025 should_be_factor = function(v) is.character(v) && uniqueN(v) < nr * stringsAsFactors - cols_to_factor = which(vapply(ans, should_be_factor, logical(1L))) + cols_to_factor = which(vapply_1b(ans, should_be_factor)) } else { - cols_to_factor = which(vapply(ans, is.character, logical(1L))) + cols_to_factor = which(vapply_1b(ans, is.character)) } if (verbose) cat("stringsAsFactors=", stringsAsFactors, " converted ", length(cols_to_factor), " column(s): ", brackify(names(ans)[cols_to_factor]), "\n", sep="") for (j in cols_to_factor) set(ans, j=j, value=as_factor(.subset2(ans, j))) diff --git a/R/groupingsets.R b/R/groupingsets.R index dadcbb0fe8..c39c6b2096 100644 --- a/R/groupingsets.R +++ b/R/groupingsets.R @@ -83,7 +83,7 @@ groupingsets.data.table = function(x, j, by, sets, .SDcols, id = FALSE, jj, ...) setcolorder(empty, c("grouping", by, setdiff(names(empty), c("grouping", by)))) } # workaround for rbindlist fill=TRUE on integer64 #1459 - int64.cols = vapply(empty, inherits, logical(1L), "integer64") + int64.cols = vapply_1b(empty, inherits, "integer64") int64.cols = names(int64.cols)[int64.cols] if (length(int64.cols) && !requireNamespace("bit64", quietly=TRUE)) stop("Using integer64 class columns require to have 'bit64' package installed.") # nocov diff --git a/R/print.data.table.R b/R/print.data.table.R index ac608133f2..455204abb8 100644 --- a/R/print.data.table.R +++ b/R/print.data.table.R @@ -81,7 +81,7 @@ print.data.table = function(x, topn=getOption("datatable.print.topn"), factor = "", POSIXct = "", logical = "", IDate = "", integer64 = "", raw = "", expression = "", ordered = "") - classes = vapply(x, function(col) class(col)[1L], "", USE.NAMES=FALSE) + classes = vapply_1c(x, function(col) class(col)[1L], use.names=FALSE) abbs = unname(class_abb[classes]) if ( length(idx <- which(is.na(abbs))) ) abbs[idx] = paste0("<", classes[idx], ">") toprint = rbind(abbs, toprint) diff --git a/R/setops.R b/R/setops.R index 56ad2803e7..62368b79ab 100644 --- a/R/setops.R +++ b/R/setops.R @@ -41,8 +41,7 @@ funique = function(x) { if (!identical(sort(names(x)), sort(names(y)))) stop("x and y must have the same column names") if (!identical(names(x), names(y))) stop("x and y must have the same column order") bad_types = c("raw", "complex", if (block_list) "list") - found = bad_types %chin% c(vapply(x, typeof, FUN.VALUE = ""), - vapply(y, typeof, FUN.VALUE = "")) + found = bad_types %chin% c(vapply_1c(x, typeof), vapply_1c(y, typeof)) if (any(found)) stop("unsupported column type", if (sum(found) > 1L) "s" else "", " found in x or y: ", brackify(bad_types[found])) super = function(x) { @@ -176,7 +175,7 @@ all.equal.data.table = function(target, current, trim.levels=TRUE, check.attribu if (ignore.row.order) { if (".seqn" %chin% names(target)) stop("None of the datasets to compare should contain a column named '.seqn'") - bad.type = setNames(c("raw","complex","list") %chin% c(vapply(current, typeof, FUN.VALUE = ""), vapply(target, typeof, FUN.VALUE = "")), c("raw","complex","list")) + bad.type = setNames(c("raw","complex","list") %chin% c(vapply_1c(current, typeof), vapply_1c(target, typeof)), c("raw","complex","list")) if (any(bad.type)) stop("Datasets to compare with 'ignore.row.order' must not have unsupported column types: ", brackify(names(bad.type)[bad.type])) if (between(tolerance, 0, sqrt(.Machine$double.eps), incbounds=FALSE)) { diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 81d9fcaebd..0c6494d594 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -10271,11 +10271,11 @@ test(1725, DT, data.table(a=1:3, add0=NA_real_, add1=c(NA,NA,1.1), add2=c(NA,NA, DT = data.table(grp=rep(3:1,each=3), val=1:9) lastGrp = 0L test(1726.1, DT[, {ans=mean(val)+lastGrp; lastGrp<<-min(val); .(ans, .GRP)}, keyby=grp], - data.table(grp=1:3, V1=c(8,12,6), V2=1:3, key="grp") ) + data.table(grp=1:3, ans=c(8,12,6), GRP=1:3, key="grp") ) test(1726.2, lastGrp, 1L) lastGrp = -1L test(1726.3, DT[, {ans=mean(val)+lastGrp; lastGrp<<-min(val); .(ans, .GRP)}, by=grp], - data.table(grp=3:1, V1=c(1,6,12), V2=1:3) ) + data.table(grp=3:1, ans=c(1,6,12), GRP=1:3) ) test(1726.4, lastGrp, 7L) rm(lastGrp) @@ -16370,6 +16370,13 @@ test(2120.07, iDT[(i_id), order(e_date, e_time)], c(3L,4L,1L,2L)) # wrapping wi test(2120.08, tmp[iDT[(i_id), order(e_date, e_time)]], # different result with the NA data.table(i_id=c("A",NA,"B","C"), N=c(5L,NA,5L,5L))) +# auto-name .() when it's the last item of {...} or wrapped with if(), #2478 #609 +DT = data.table(a = c(1, 1, 2), b = 4:6) +test(2121.1, DT[ , {b = b; .(a, b = b + 1)}], DT[ , .(a, b=b+1)]) +test(2121.2, DT[ , {{{b = b; .(a, b = b + 1)}}}], DT[ , .(a, b=b+1)]) +test(2121.3, DT[ , if (.N > 1L) .(b), by=a], DT[1:2]) +test(2121.4, DT[ , if (.N > 1L) .(b) else .(c=b), by=a], DT[ , .(a, c=b)], + warning="Different branches of j expression produced different auto-named columns") ################################### # Add new tests above this line #