From 0b2e058e0f05e4d7fb51223b8e912266a2d95260 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Fri, 30 Aug 2019 02:55:43 +0800 Subject: [PATCH 01/13] clean up use of vapply --- R/data.table.R | 21 +++++++++++---------- R/fread.R | 4 ++-- R/groupingsets.R | 2 +- R/print.data.table.R | 2 +- R/setops.R | 5 ++--- 5 files changed, 17 insertions(+), 17 deletions(-) diff --git a/R/data.table.R b/R/data.table.R index 43541e9afb..9e8868028e 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -76,7 +76,7 @@ data.table = function(..., keep.rownames=FALSE, check.names=FALSE, key=NULL, str } } if (isTRUE(stringsAsFactors)) { - for (j in which(vapply(ans, is.character, TRUE))) set(ans, NULL, j, as_factor(.subset2(ans, j))) + for (j in which(vapply_1b(ans, is.character))) set(ans, NULL, j, as_factor(.subset2(ans, j))) # as_factor is internal function in fread.R currently } alloc.col(ans) # returns a NAMED==0 object, unlike data.frame() @@ -881,7 +881,7 @@ replace_order = function(isub, verbose, env) { jvnames = gsub("^[.](N|I|GRP|BY)$","\\1",as.character(jsub)) # jsub is list()ed after it's eval'd inside dogroups. } - } else if (is.call(jsub) && as.character(jsub[[1L]])[[1L]] %chin% c("list",".")) { + } else if (is.call(jsub) && as.character(jsub[[1L]])[1L] %chin% c("list", ".")) { jsub[[1L]] = quote(list) jsubl = as.list.default(jsub) # TO DO: names(jsub) and names(jsub)="" seem to work so make use of that if (length(jsubl)>1L) { @@ -1177,6 +1177,7 @@ replace_order = function(isub, verbose, env) { assign(sym, get(getName, parent.frame()), SDenv) } # hash=TRUE (the default) does seem better as expected using e.g. test 645. TO DO experiment with 'size' argument + #browser() if (missingby || bynull || (!byjoin && !length(byval))) { # No grouping: 'by' = missing | NULL | character() | "" | list() # Considered passing a one-group to dogroups but it doesn't do the recycling of i within group, that's done here @@ -1310,8 +1311,8 @@ replace_order = function(isub, verbose, env) { jval = data.table(jval) # TO DO: should this be setDT(list(jval)) instead? } else { if (is.null(jvnames)) jvnames=names(jval) - lenjval = vapply(jval, length, 0L) - nulljval = vapply(jval, is.null, FALSE) + lenjval = vapply_1i(jval, length) + nulljval = vapply_1b(jval, is.null) if (lenjval[1L]==0L || any(lenjval != lenjval[1L])) { jval = as.data.table.list(jval) # does the vector expansion to create equal length vectors, and drops any NULL items jvnames = jvnames[!nulljval] # fix for #1477 @@ -1462,7 +1463,7 @@ replace_order = function(isub, verbose, env) { lockBinding(".iSD",SDenv) GForce = FALSE - if ( getOption("datatable.optimize")>=1 && (is.call(jsub) || (is.name(jsub) && as.character(jsub)[[1L]] %chin% c(".SD",".N"))) ) { # Ability to turn off if problems or to benchmark the benefit + if ( getOption("datatable.optimize")>=1 && (is.call(jsub) || (is.name(jsub) && as.character(jsub)[1L] %chin% c(".SD", ".N"))) ) { # Ability to turn off if problems or to benchmark the benefit # Optimization to reduce overhead of calling lapply over and over for each group oldjsub = jsub funi = 1L # Fix for #985 @@ -1621,7 +1622,7 @@ replace_order = function(isub, verbose, env) { if (getOption("datatable.optimize")>=2 && !is.data.table(i) && !byjoin && length(f__) && !length(lhs)) { if (!length(ansvars) && !use.I) { GForce = FALSE - if ( (is.name(jsub) && jsub == ".N") || (is.call(jsub) && length(jsub)==2L && length(as.character(jsub[[1L]])) && as.character(jsub[[1L]])[1L] == "list" && length(as.character(jsub[[2L]])) && as.character(jsub[[2L]])[1L] == ".N") ) { + if ( (is.name(jsub) && jsub == ".N") || (is.call(jsub) && length(jsub)==2L && jsub[[1L]]== "list" && jsub[[2L]] == ".N") ) { GForce = TRUE if (verbose) cat("GForce optimized j to '",deparse(jsub,width.cutoff=200L),"'\n",sep="") } @@ -2315,7 +2316,7 @@ copy = function(x) { if (!is.data.table(x)) { # fix for #1476. TODO: find if a cleaner fix is possible.. if (is.list(x)) { - anydt = vapply(x, is.data.table, TRUE, USE.NAMES=FALSE) + anydt = vapply_1b(x, is.data.table, use.names=FALSE) if (sum(anydt)) { newx[anydt] = lapply(newx[anydt], function(x) { .Call(C_unlock, x) @@ -2618,7 +2619,7 @@ setDF = function(x, rownames=NULL) { } x } else { - n = vapply(x, length, 0L) + n = vapply_1i(x, length) mn = max(n) if (any(n 1L && prod(vapply(i, length, integer(1L))) > 1e4){ + if(length(i) > 1L && prod(vapply_1i(i, length)) > 1e4){ ## CJ would result in more than 1e4 rows. This would be inefficient, especially memory-wise #2635 if (verbose) {cat("Subsetting optimization disabled because the cross-product of RHS values exceeds 1e4, causing memory problems.\n");flush.console()} return(NULL) diff --git a/R/fread.R b/R/fread.R index 46fa812e7c..707697bfb6 100644 --- a/R/fread.R +++ b/R/fread.R @@ -312,9 +312,9 @@ yaml=FALSE, autostart=NA, tmpdir=tempdir()) if (stringsAsFactors) { if (is.double(stringsAsFactors)) { #2025 should_be_factor = function(v) is.character(v) && uniqueN(v) < nr * stringsAsFactors - cols_to_factor = which(vapply(ans, should_be_factor, logical(1L))) + cols_to_factor = which(vapply_1b(ans, should_be_factor)) } else { - cols_to_factor = which(vapply(ans, is.character, logical(1L))) + cols_to_factor = which(vapply_1b(ans, is.character)) } if (verbose) cat("stringsAsFactors=", stringsAsFactors, " converted ", length(cols_to_factor), " column(s): ", brackify(names(ans)[cols_to_factor]), "\n", sep="") for (j in cols_to_factor) set(ans, j=j, value=as_factor(.subset2(ans, j))) diff --git a/R/groupingsets.R b/R/groupingsets.R index dadcbb0fe8..c39c6b2096 100644 --- a/R/groupingsets.R +++ b/R/groupingsets.R @@ -83,7 +83,7 @@ groupingsets.data.table = function(x, j, by, sets, .SDcols, id = FALSE, jj, ...) setcolorder(empty, c("grouping", by, setdiff(names(empty), c("grouping", by)))) } # workaround for rbindlist fill=TRUE on integer64 #1459 - int64.cols = vapply(empty, inherits, logical(1L), "integer64") + int64.cols = vapply_1b(empty, inherits, "integer64") int64.cols = names(int64.cols)[int64.cols] if (length(int64.cols) && !requireNamespace("bit64", quietly=TRUE)) stop("Using integer64 class columns require to have 'bit64' package installed.") # nocov diff --git a/R/print.data.table.R b/R/print.data.table.R index ac82dcfafa..37c43f5e95 100644 --- a/R/print.data.table.R +++ b/R/print.data.table.R @@ -81,7 +81,7 @@ print.data.table = function(x, topn=getOption("datatable.print.topn"), factor = "", POSIXct = "", logical = "", IDate = "", integer64 = "", raw = "", expression = "", ordered = "") - classes = vapply(x, function(col) class(col)[1L], "", USE.NAMES=FALSE) + classes = vapply_1c(x, function(col) class(col)[1L], use.names=FALSE) abbs = unname(class_abb[classes]) if ( length(idx <- which(is.na(abbs))) ) abbs[idx] = paste0("<", classes[idx], ">") toprint = rbind(abbs, toprint) diff --git a/R/setops.R b/R/setops.R index 1dce93702a..0c11032589 100644 --- a/R/setops.R +++ b/R/setops.R @@ -41,8 +41,7 @@ funique = function(x) { if (!identical(sort(names(x)), sort(names(y)))) stop("x and y must have the same column names") if (!identical(names(x), names(y))) stop("x and y must have the same column order") bad_types = c("raw", "complex", if (block_list) "list") - found = bad_types %chin% c(vapply(x, typeof, FUN.VALUE = ""), - vapply(y, typeof, FUN.VALUE = "")) + found = bad_types %chin% c(vapply_1c(x, typeof), vapply_1c(y, typeof)) if (any(found)) stop("unsupported column type", if (sum(found) > 1L) "s" else "", " found in x or y: ", brackify(bad_types[found])) if (!identical(lapply(x, class), lapply(y, class))) stop("x and y must have the same column classes") @@ -168,7 +167,7 @@ all.equal.data.table = function(target, current, trim.levels=TRUE, check.attribu if (ignore.row.order) { if (".seqn" %chin% names(target)) stop("None of the datasets to compare should contain a column named '.seqn'") - bad.type = setNames(c("raw","complex","list") %chin% c(vapply(current, typeof, FUN.VALUE = ""), vapply(target, typeof, FUN.VALUE = "")), c("raw","complex","list")) + bad.type = setNames(c("raw","complex","list") %chin% c(vapply_1c(current, typeof), vapply_1c(target, typeof)), c("raw","complex","list")) if (any(bad.type)) stop("Datasets to compare with 'ignore.row.order' must not have unsupported column types: ", brackify(names(bad.type)[bad.type])) if (between(tolerance, 0, sqrt(.Machine$double.eps), incbounds=FALSE)) { From b5b014fd24cbc8357b0e149d044a1b12c588aecc Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Fri, 30 Aug 2019 04:05:48 +0800 Subject: [PATCH 02/13] Automatic naming of j output is more powerful --- NEWS.md | 2 ++ R/data.table.R | 54 +++++++++++++++++++++++++++++-------------- inst/tests/tests.Rraw | 11 +++++++-- 3 files changed, 48 insertions(+), 19 deletions(-) diff --git a/NEWS.md b/NEWS.md index 0b762ca3db..a278b82948 100644 --- a/NEWS.md +++ b/NEWS.md @@ -164,6 +164,8 @@ 24. More efficient optimization of many columns in `j` (e.g. from `.SD`), [#1470](https://github.com/Rdatatable/data.table/issues/1470). Thanks @Jorges1000 for the report. +25. Auto-naming in `j` now handles more complicated expressions (e.g. in `{` or `if`) and is hence more consistent, [#2478](https://github.com/Rdatatable/data.table/issues/2478). + #### BUG FIXES 1. `first`, `last`, `head` and `tail` by group no longer error in some cases, [#2030](https://github.com/Rdatatable/data.table/issues/2030) [#3462](https://github.com/Rdatatable/data.table/issues/3462). Thanks to @franknarf1 for reporting. diff --git a/R/data.table.R b/R/data.table.R index 9e8868028e..e1679cf83c 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -874,30 +874,16 @@ replace_order = function(isub, verbose, env) { setattr(byval, "names", bynames) # byval is just a list not a data.table hence setattr not setnames } + #browser() jvnames = NULL + env = environment() if (is.name(jsub)) { # j is a single unquoted column name if (jsub!=".SD") { jvnames = gsub("^[.](N|I|GRP|BY)$","\\1",as.character(jsub)) # jsub is list()ed after it's eval'd inside dogroups. } - } else if (is.call(jsub) && as.character(jsub[[1L]])[1L] %chin% c("list", ".")) { - jsub[[1L]] = quote(list) - jsubl = as.list.default(jsub) # TO DO: names(jsub) and names(jsub)="" seem to work so make use of that - if (length(jsubl)>1L) { - jvnames = names(jsubl)[-1L] # check list(a=sum(v),v) - if (is.null(jvnames)) jvnames = rep.int("", length(jsubl)-1L) - for (jj in seq.int(2L,length(jsubl))) { - if (jvnames[jj-1L] == "" && mode(jsubl[[jj]])=="name") { - if (jsubl[[jj]]=="") stop("Item ", jj-1L, " of the .() or list() passed to j is missing") #3507 - jvnames[jj-1L] = gsub("^[.](N|I|GRP|BY)$", "\\1", deparse(jsubl[[jj]])) - } - # TO DO: if call to a[1] for example, then call it 'a' too - } - setattr(jsubl, "names", NULL) # drops the names from the list so it's faster to eval the j for each group. We'll put them back afterwards on the result. - jsub = as.call(jsubl) - } # else empty list is needed for test 468: adding an empty list column - } # else maybe a call to transform or something which returns a list. + } else jsub = do_j_names(jsub, env) # else maybe a call to transform or something which returns a list. av = all.vars(jsub,TRUE) # TRUE fixes bug #1294 which didn't see b in j=fns[[b]](c) use.I = ".I" %chin% av if (any(c(".SD","eval","get","mget") %chin% av)) { @@ -3071,3 +3057,37 @@ isReallyReal = function(x) { names(on) = xCols return(list(on = on, ops = idx_op)) } + +# function to handle auto-naming of j for potentially complicated expressions, #2478 +do_j_names = function(q, env) { + if (!is.call(q) || !is.name(q[[1L]])) return(q) + if (as.character(q[[1L]]) %chin% c('list', '.')) { + q[[1L]] = quote(list) + qlen = length(q) + if (qlen>1L) { + nm = names(q[-1L]) # check list(a=sum(v),v) + if (is.null(nm)) nm = rep.int("", qlen-1L) + for (jj in seq.int(2L, qlen)) { + if (nm[jj-1L] == "" && is.name(q[[jj]])) { + if (q[[jj]] == "") stop("Item ", jj-1L, " of the .() or list() passed to j is missing") #3507 + nm[jj-1L] = gsub("^[.](N|I|GRP|BY)$", "\\1", deparse(q[[jj]])) + } + # TO DO: if call to a[1] for example, then call it 'a' too + } + assign('jvnames', nm, envir=env) # TODO: handle if() list(a, b) else list(b, a) better + setattr(q, "names", NULL) # drops the names from the list so it's faster to eval the j for each group. We'll put them back afterwards on the result. + } + return(q) # else empty list is needed for test 468: adding an empty list column + } + if (q[[1L]] == '{') { + q[[length(q)]] = do_j_names(q[[length(q)]], env) + return(q) + } + if (q[[1L]] == 'if') { + #explicit NULL would return NULL, assigning NULL would delete that from the expression + if (!is.null(q[[3L]])) q[[3L]] = do_j_names(q[[3L]], env) + if (length(q) == 4L && !is.null(q[[4L]])) q[[4L]] = do_j_names(q[[4L]], env) + return(q) + } + return(q) +} diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 8a308c19c8..e5e0c870a8 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -10218,11 +10218,11 @@ test(1725, DT, data.table(a=1:3, add0=NA_real_, add1=c(NA,NA,1.1), add2=c(NA,NA, DT = data.table(grp=rep(3:1,each=3), val=1:9) lastGrp = 0L test(1726.1, DT[, {ans=mean(val)+lastGrp; lastGrp<<-min(val); .(ans, .GRP)}, keyby=grp], - data.table(grp=1:3, V1=c(8,12,6), V2=1:3, key="grp") ) + data.table(grp=1:3, ans=c(8,12,6), GRP=1:3, key="grp") ) test(1726.2, lastGrp, 1L) lastGrp = -1L test(1726.3, DT[, {ans=mean(val)+lastGrp; lastGrp<<-min(val); .(ans, .GRP)}, by=grp], - data.table(grp=3:1, V1=c(1,6,12), V2=1:3) ) + data.table(grp=3:1, ans=c(1,6,12), GRP=1:3) ) test(1726.4, lastGrp, 7L) rm(lastGrp) @@ -15828,6 +15828,13 @@ df2 = data.table(c=c(1, 1, 2, 2, 3), d=c(3, 4, 3, 5, 4)) test(2092.3, copy(df2)[ , s := df1[.SD, on=.(a >= c, a <= d), sum(b), by=.EACHI]$V1], df2[ , s := c(1, 1, 1, 3, 1)]) +# #2478 auto-name j even in complicated expressions +DT = data.table(a = c(1, 1, 2), b = 4:6) +test(2093.1, DT[ , {b = b; .(a, b = b + 1)}], DT[ , .(a, b=b+1)]) +test(2093.2, DT[ , {{{b = b; .(a, b = b + 1)}}}], DT[ , .(a, b=b+1)]) +test(2093.3, DT[ , if (.N > 1L) .(b), by=a], DT[1:2]) + + ################################### # Add new tests above this line # ################################### From 247cbcf522ba4ccfc7689f7197c7a748c569fea3 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Fri, 30 Aug 2019 04:06:56 +0800 Subject: [PATCH 03/13] remove browser() --- R/data.table.R | 1 - 1 file changed, 1 deletion(-) diff --git a/R/data.table.R b/R/data.table.R index e1679cf83c..0c530effb1 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -874,7 +874,6 @@ replace_order = function(isub, verbose, env) { setattr(byval, "names", bynames) # byval is just a list not a data.table hence setattr not setnames } - #browser() jvnames = NULL env = environment() if (is.name(jsub)) { From b5c736afcf65d279177be5d770f643019715bd95 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Fri, 30 Aug 2019 04:26:29 +0800 Subject: [PATCH 04/13] merge master & increase test #s --- inst/tests/tests.Rraw | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 8d1373ca65..e61663143d 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -15827,7 +15827,7 @@ df1 = data.table(a=1:5, b=c(0, 0, 1, 0, 2)) df2 = data.table(c=c(1, 1, 2, 2, 3), d=c(3, 4, 3, 5, 4)) test(2092.3, copy(df2)[ , s := df1[.SD, on=.(a >= c, a <= d), sum(b), by=.EACHI]$V1], df2[ , s := c(1, 1, 1, 3, 1)]) - + # POSIXct overflow to NA before 1901 and after 2038, #3780 date=as.POSIXct("1900-01-01", tz="UTC") test(2093.1, as.IDate(date), as.IDate(-25567L)) @@ -15835,12 +15835,11 @@ test(2093.2, hour(date), 0L) test(2093.3, minute(date), 0L) test(2093.4, second(date), 0L) - # #2478 auto-name j even in complicated expressions DT = data.table(a = c(1, 1, 2), b = 4:6) -test(2093.1, DT[ , {b = b; .(a, b = b + 1)}], DT[ , .(a, b=b+1)]) -test(2093.2, DT[ , {{{b = b; .(a, b = b + 1)}}}], DT[ , .(a, b=b+1)]) -test(2093.3, DT[ , if (.N > 1L) .(b), by=a], DT[1:2]) +test(2094.1, DT[ , {b = b; .(a, b = b + 1)}], DT[ , .(a, b=b+1)]) +test(2094.2, DT[ , {{{b = b; .(a, b = b + 1)}}}], DT[ , .(a, b=b+1)]) +test(2094.3, DT[ , if (.N > 1L) .(b), by=a], DT[1:2]) ################################### From 3e29d6158b878d446e79d0d2d0de9e05448a4741 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Sat, 31 Aug 2019 02:37:48 +0800 Subject: [PATCH 05/13] another refactor for readability&efficieny, also handle if/else mismatch --- R/data.table.R | 86 +++++++++++++++++++++++-------------------- inst/tests/tests.Rraw | 2 + 2 files changed, 48 insertions(+), 40 deletions(-) diff --git a/R/data.table.R b/R/data.table.R index 0c530effb1..4fba57da8b 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -875,14 +875,54 @@ replace_order = function(isub, verbose, env) { } jvnames = NULL - env = environment() + DOT_REGEX = "^[.](N|I|GRP|BY)$" + # function to handle auto-naming of j for potentially complicated expressions, #2478 + # defined here to have [.data.table as a parent + # j-ending (ends of if/else or {}) for expression like + # list(a=sum(v), v) + # should create columns named a, v + do_j_names = function(q) { + if (!is.call(q) || !is.name(q[[1L]])) return(q) + if (as.character(q[[1L]]) %chin% c('list', '.')) { + q[[1L]] = quote(list) + qlen = length(q) + if (qlen>1L) { + nm = names(q[-1L]) # check list(a=sum(v),v) + if (is.null(nm)) nm = rep.int("", qlen-1L) + # attempt to auto-name unnamed columns + idx = which(!nzchar(nm)) + for (jj in idx) { + thisq = q[[jj + 1L]] + if (missing(thisq)) stop("Item ", jj, " of the .() or list() passed to j is missing") #3507 + if (is.name(thisq)) nm[jj] = deparse(thisq, width.cutoff = 50L, backtick = FALSE, nlines = 1L) + # TO DO: if call to a[1] for example, then call it 'a' too + } + nm[idx] = gsub(DOT_REGEX, "\\1", nm[idx]) + if (!is.null(jvnames) && any(idx <- nm != jvnames)) + warning("Different branches of j expression produced different auto-named columns: ", brackify(sprintf('%s!=%s', nm[idx], jvnames[idx])), '; using the most "last" names', call. = FALSE) + jvnames <<- nm # TODO: handle if() list(a, b) else list(b, a) better + setattr(q, "names", NULL) # drops the names from the list so it's faster to eval the j for each group. We'll put them back afterwards on the result. + } + return(q) # else empty list is needed for test 468: adding an empty list column + } + if (q[[1L]] == '{') { + q[[length(q)]] = do_j_names(q[[length(q)]]) + return(q) + } + if (q[[1L]] == 'if') { + #explicit NULL would return NULL, assigning NULL would delete that from the expression + if (!is.null(q[[3L]])) q[[3L]] = do_j_names(q[[3L]]) + if (length(q) == 4L && !is.null(q[[4L]])) q[[4L]] = do_j_names(q[[4L]]) + return(q) + } + return(q) + } + if (is.name(jsub)) { # j is a single unquoted column name - if (jsub!=".SD") { - jvnames = gsub("^[.](N|I|GRP|BY)$","\\1",as.character(jsub)) - # jsub is list()ed after it's eval'd inside dogroups. - } - } else jsub = do_j_names(jsub, env) # else maybe a call to transform or something which returns a list. + if (jsub!=".SD") jvnames = gsub(DOT_REGEX, "\\1", jsub) + # jsub is list()ed after it's eval'd inside dogroups. + } else jsub = do_j_names(jsub) # else maybe a call to transform or something which returns a list. av = all.vars(jsub,TRUE) # TRUE fixes bug #1294 which didn't see b in j=fns[[b]](c) use.I = ".I" %chin% av if (any(c(".SD","eval","get","mget") %chin% av)) { @@ -3056,37 +3096,3 @@ isReallyReal = function(x) { names(on) = xCols return(list(on = on, ops = idx_op)) } - -# function to handle auto-naming of j for potentially complicated expressions, #2478 -do_j_names = function(q, env) { - if (!is.call(q) || !is.name(q[[1L]])) return(q) - if (as.character(q[[1L]]) %chin% c('list', '.')) { - q[[1L]] = quote(list) - qlen = length(q) - if (qlen>1L) { - nm = names(q[-1L]) # check list(a=sum(v),v) - if (is.null(nm)) nm = rep.int("", qlen-1L) - for (jj in seq.int(2L, qlen)) { - if (nm[jj-1L] == "" && is.name(q[[jj]])) { - if (q[[jj]] == "") stop("Item ", jj-1L, " of the .() or list() passed to j is missing") #3507 - nm[jj-1L] = gsub("^[.](N|I|GRP|BY)$", "\\1", deparse(q[[jj]])) - } - # TO DO: if call to a[1] for example, then call it 'a' too - } - assign('jvnames', nm, envir=env) # TODO: handle if() list(a, b) else list(b, a) better - setattr(q, "names", NULL) # drops the names from the list so it's faster to eval the j for each group. We'll put them back afterwards on the result. - } - return(q) # else empty list is needed for test 468: adding an empty list column - } - if (q[[1L]] == '{') { - q[[length(q)]] = do_j_names(q[[length(q)]], env) - return(q) - } - if (q[[1L]] == 'if') { - #explicit NULL would return NULL, assigning NULL would delete that from the expression - if (!is.null(q[[3L]])) q[[3L]] = do_j_names(q[[3L]], env) - if (length(q) == 4L && !is.null(q[[4L]])) q[[4L]] = do_j_names(q[[4L]], env) - return(q) - } - return(q) -} diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index e61663143d..24af96ddfe 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -15840,6 +15840,8 @@ DT = data.table(a = c(1, 1, 2), b = 4:6) test(2094.1, DT[ , {b = b; .(a, b = b + 1)}], DT[ , .(a, b=b+1)]) test(2094.2, DT[ , {{{b = b; .(a, b = b + 1)}}}], DT[ , .(a, b=b+1)]) test(2094.3, DT[ , if (.N > 1L) .(b), by=a], DT[1:2]) +test(2094.4, DT[ , if (.N > 1L) .(b) else .(c=b), by=a], DT[ , .(a, c=b)], + warning="Different branches of j expression produced different auto-named columns") ################################### From 80adb9a89ea5f84a82ed7725a4e068d3da23a319 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Sat, 31 Aug 2019 06:11:25 +0800 Subject: [PATCH 06/13] order->forder replacement suffered the same issue with needing env, may as well solve here --- R/data.table.R | 47 ++++++++++++++++++++++------------------------- 1 file changed, 22 insertions(+), 25 deletions(-) diff --git a/R/data.table.R b/R/data.table.R index 4fba57da8b..1c2dec953d 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -120,26 +120,6 @@ replace_dot_alias = function(e) { } } -# replace order -> forder wherever it appears in i -replace_order = function(isub, verbose, env) { - if (length(isub) == 1L) return(isub) - for (ii in seq_along(isub)) { - isub_el = isub[[ii]] - if (missing(isub_el)) break - if (is.name(isub_el)) { - # stop base::order from becoming forder(x, base, order) - if (isub_el == '::') break - if (isub_el == 'order') { - if (verbose) cat("order optimisation is on, changed 'order(...)' in i to 'forder(x, ...)'.\n") - env$eval_forder = TRUE - return(as.call(c(list(quote(forder), quote(x)), as.list(isub)[-1L]))) - } - } - if (is.call(isub_el)) isub[[ii]] = replace_order(isub_el, verbose, env) - } - return(isub) -} - "[.data.table" = function (x, i, j, by, keyby, with=TRUE, nomatch=getOption("datatable.nomatch", NA), mult="all", roll=FALSE, rollends=if (roll=="nearest") c(TRUE,TRUE) else if (roll>=0) c(FALSE,TRUE) else c(TRUE,FALSE), which=FALSE, .SDcols, verbose=getOption("datatable.verbose"), allow.cartesian=getOption("datatable.allow.cartesian"), drop=NULL, on=NULL) { # ..selfcount <<- ..selfcount+1 # in dev, we check no self calls, each of which doubles overhead, or could @@ -351,12 +331,29 @@ replace_order = function(isub, verbose, env) { if (is.null(isub)) return( null.data.table() ) # optimize here so that we can switch it off if needed - check_eval_env = environment() - check_eval_env$eval_forder = FALSE - if (getOption("datatable.optimize") >= 1) { - isub = replace_order(isub, verbose, check_eval_env) + # replace order -> forder wherever it appears in i + # replace_order defined here to have right env inheritance + replace_order = function(isub, verbose) { + if (length(isub) == 1L) return(isub) + for (ii in seq_along(isub)) { + isub_el = isub[[ii]] + if (missing(isub_el)) break + if (is.name(isub_el)) { + # stop base::order from becoming forder(x, base, order) + if (isub_el == '::') break + if (isub_el == 'order') { + if (verbose) cat("order optimisation is on, changed 'order(...)' in i to 'forder(x, ...)'.\n") + eval_forder = TRUE + return(as.call(c(list(quote(forder), quote(x)), as.list(isub)[-1L]))) + } + } + if (is.call(isub_el)) isub[[ii]] = replace_order(isub_el, verbose) + } + return(isub) } - if (check_eval_env$eval_forder) { + eval_forder = FALSE + if (getOption("datatable.optimize") >= 1) isub = replace_order(isub, verbose) + if (eval_forder) { order_env = new.env(parent=parent.frame()) # until 'forder' is exported assign("forder", forder, order_env) assign("x", x, order_env) From a7e1c4bd5bbb2385ef9b86acba8248d495efa461 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Sat, 31 Aug 2019 22:53:06 +0800 Subject: [PATCH 07/13] remove <<< --- R/data.table.R | 1 - 1 file changed, 1 deletion(-) diff --git a/R/data.table.R b/R/data.table.R index 78194253ad..7f1145c247 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -331,7 +331,6 @@ replace_dot_alias = function(e) { if (is.null(isub)) return( null.data.table() ) # optimize here so that we can switch it off if needed -<<<<<<< HEAD # replace order -> forder wherever it appears in i # replace_order defined here to have right env inheritance replace_order = function(isub, verbose) { From ed77e1c4ec5dde152101a7f0c40f5547b87693cd Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Sat, 31 Aug 2019 22:53:46 +0800 Subject: [PATCH 08/13] remove browser() --- R/data.table.R | 1 - 1 file changed, 1 deletion(-) diff --git a/R/data.table.R b/R/data.table.R index 7f1145c247..b88c3891a9 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -1199,7 +1199,6 @@ replace_dot_alias = function(e) { assign(sym, get(getName, parent.frame()), SDenv) } # hash=TRUE (the default) does seem better as expected using e.g. test 645. TO DO experiment with 'size' argument - #browser() if (missingby || bynull || (!byjoin && !length(byval))) { # No grouping: 'by' = missing | NULL | character() | "" | list() # Considered passing a one-group to dogroups but it doesn't do the recycling of i within group, that's done here From 1609991b7a1b2f4555a015ff15f9d0cd968b52bd Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Sat, 31 Aug 2019 23:04:33 +0800 Subject: [PATCH 09/13] need <<- not = --- R/data.table.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/data.table.R b/R/data.table.R index b88c3891a9..291c2d368f 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -343,7 +343,7 @@ replace_dot_alias = function(e) { if (isub_el == '::') break if (isub_el == 'order') { if (verbose) cat("order optimisation is on, changed 'order(...)' in i to 'forder(x, ...)'.\n") - eval_forder = TRUE + eval_forder <<- TRUE return(as.call(c(list(quote(forder), quote(x)), as.list(isub)[-1L]))) } } From b4b8d3add10d07ff377272cbad1b09994167c096 Mon Sep 17 00:00:00 2001 From: mattdowle Date: Thu, 7 Nov 2019 17:48:19 -0800 Subject: [PATCH 10/13] further merge resolution --- R/data.table.R | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) diff --git a/R/data.table.R b/R/data.table.R index 26d1110d2f..68e8bb76ef 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -338,6 +338,7 @@ replace_dot_alias = function(e) { } if (is.null(isub)) return( null.data.table() ) + if (length(o <- .prepareFastSubset(isub = isub, x = x, enclos = parent.frame(), notjoin = notjoin, verbose = verbose))){ @@ -1309,19 +1310,8 @@ replace_dot_alias = function(e) { )) jval = lapply(jval, `[`, 0L) if (is.atomic(jval)) { - setattr(jval,"names",NULL) # discard names of named vectors otherwise each cell in the column would have a name + setattr(jval,"names",NULL) # discard names of named vectors otherwise each cell in the column would have a name jval = list(jval) - } else { - if (is.null(jvnames)) jvnames=names(jval) - lenjval = vapply_1i(jval, length) - nulljval = vapply_1b(jval, is.null) - if (lenjval[1L]==0L || any(lenjval != lenjval[1L])) { - jval = as.data.table.list(jval) # does the vector expansion to create equal length vectors, and drops any NULL items - jvnames = jvnames[!nulljval] # fix for #1477 - } else { - # all columns same length and at least 1 row; avoid copy. TODO: remove when as.data.table.list is ported to C - # setDT(jval) - } } if (!is.null(jvnames) && !all(jvnames=="")) setattr(jval, 'names', jvnames) # e.g. jvnames=="N" for DT[,.N,] jval = as.data.table.list(jval, .named=NULL) From e825d28e875158c533f70111c42f4e6745f53f36 Mon Sep 17 00:00:00 2001 From: mattdowle Date: Thu, 7 Nov 2019 17:58:13 -0800 Subject: [PATCH 11/13] issue number at the end --- inst/tests/tests.Rraw | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 21e51bb76e..62eecec926 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -16370,7 +16370,7 @@ test(2120.07, iDT[(i_id), order(e_date, e_time)], c(3L,4L,1L,2L)) # wrapping wi test(2120.08, tmp[iDT[(i_id), order(e_date, e_time)]], # different result with the NA data.table(i_id=c("A",NA,"B","C"), N=c(5L,NA,5L,5L))) -# #2478 auto-name j even in complicated expressions +# auto-name j even in complicated expressions, #2478 DT = data.table(a = c(1, 1, 2), b = 4:6) test(2121.1, DT[ , {b = b; .(a, b = b + 1)}], DT[ , .(a, b=b+1)]) test(2121.2, DT[ , {{{b = b; .(a, b = b + 1)}}}], DT[ , .(a, b=b+1)]) From fbe3f0a78c07c44adda7a0df40b529b77361a0d3 Mon Sep 17 00:00:00 2001 From: mattdowle Date: Fri, 8 Nov 2019 14:00:46 -0800 Subject: [PATCH 12/13] news item refined --- NEWS.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/NEWS.md b/NEWS.md index dbb45df4cf..54359e2e0e 100644 --- a/NEWS.md +++ b/NEWS.md @@ -6,7 +6,7 @@ ## NEW FEATURES -1. Auto-naming in `j` now handles more complicated expressions (e.g. in `{` or `if`), [#2478](https://github.com/Rdatatable/data.table/issues/2478). For example, both `DT[ , .(x, y)]` and `DT[ , {x = 4+x; y=4+y; .(x, y)}]` now return columns named `x` and `y`. +1. `DT[, {...; .(A,B)}]` (when `.()` is the final item of a multi-statement `{...}`) now auto-names the columns `A` and `B` (just like `DT[, .(A,B)]`) rather than `V1` and `V2`, [#2478](https://github.com/Rdatatable/data.table/issues/2478) [#609](https://github.com/Rdatatable/data.table/issues/609). Similarly, `DT[, if (.N>1) .(B), by=A]` now auto-names the column `B` rather than `V1`. Explicit names are unaffected; e.g. `DT[, {... y= ...; .(A=C+y)}, by=...]` named the result column `A` before, and still does. ## BUG FIXES From 86c444aa7ad6a0fa76e219b1d851b7189d22fa42 Mon Sep 17 00:00:00 2001 From: mattdowle Date: Wed, 13 Nov 2019 18:39:53 -0800 Subject: [PATCH 13/13] avoid gsub and deparse for a tiny bit of efficiency --- NEWS.md | 2 +- R/data.table.R | 20 ++++++++++---------- inst/tests/tests.Rraw | 2 +- 3 files changed, 12 insertions(+), 12 deletions(-) diff --git a/NEWS.md b/NEWS.md index 54359e2e0e..52b5d76768 100644 --- a/NEWS.md +++ b/NEWS.md @@ -6,7 +6,7 @@ ## NEW FEATURES -1. `DT[, {...; .(A,B)}]` (when `.()` is the final item of a multi-statement `{...}`) now auto-names the columns `A` and `B` (just like `DT[, .(A,B)]`) rather than `V1` and `V2`, [#2478](https://github.com/Rdatatable/data.table/issues/2478) [#609](https://github.com/Rdatatable/data.table/issues/609). Similarly, `DT[, if (.N>1) .(B), by=A]` now auto-names the column `B` rather than `V1`. Explicit names are unaffected; e.g. `DT[, {... y= ...; .(A=C+y)}, by=...]` named the result column `A` before, and still does. +1. `DT[, {...; .(A,B)}]` (when `.()` is the final item of a multi-statement `{...}`) now auto-names the columns `A` and `B` (just like `DT[, .(A,B)]`) rather than `V1` and `V2`, [#2478](https://github.com/Rdatatable/data.table/issues/2478) [#609](https://github.com/Rdatatable/data.table/issues/609). Similarly, `DT[, if (.N>1) .(B), by=A]` now auto-names the column `B` rather than `V1`. Explicit names are unaffected; e.g. `DT[, {... y= ...; .(A=C+y)}, by=...]` named the column `A` before, and still does. ## BUG FIXES diff --git a/R/data.table.R b/R/data.table.R index 68e8bb76ef..08ef6872b1 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -854,12 +854,13 @@ replace_dot_alias = function(e) { } jvnames = NULL - DOT_REGEX = "^[.](N|I|GRP|BY)$" - # function to handle auto-naming of j for potentially complicated expressions, #2478 - # defined here to have [.data.table as a parent - # j-ending (ends of if/else or {}) for expression like - # list(a=sum(v), v) - # should create columns named a, v + drop_dot = function(x) { + tt = x %chin% c(".N",".I",".GRP",".BY") + if (any(tt)) x[tt] = substring(x[tt], 2L) + x + } + # handle auto-naming of last item of j (e.g. within {} or if/else, #2478) + # e.g. DT[, .(a=sum(v), v, .N), by=] should create columns named a, v, N do_j_names = function(q) { if (!is.call(q) || !is.name(q[[1L]])) return(q) if (as.character(q[[1L]]) %chin% c('list', '.')) { @@ -873,10 +874,10 @@ replace_dot_alias = function(e) { for (jj in idx) { thisq = q[[jj + 1L]] if (missing(thisq)) stop("Item ", jj, " of the .() or list() passed to j is missing") #3507 - if (is.name(thisq)) nm[jj] = deparse(thisq, width.cutoff = 50L, backtick = FALSE, nlines = 1L) + if (is.name(thisq)) nm[jj] = as.character(thisq) # TO DO: if call to a[1] for example, then call it 'a' too } - nm[idx] = gsub(DOT_REGEX, "\\1", nm[idx]) + nm[idx] = drop_dot(nm[idx]) if (!is.null(jvnames) && any(idx <- nm != jvnames)) warning("Different branches of j expression produced different auto-named columns: ", brackify(sprintf('%s!=%s', nm[idx], jvnames[idx])), '; using the most "last" names', call. = FALSE) jvnames <<- nm # TODO: handle if() list(a, b) else list(b, a) better @@ -896,10 +897,9 @@ replace_dot_alias = function(e) { } return(q) } - if (is.name(jsub)) { # j is a single unquoted column name - if (jsub!=".SD") jvnames = gsub(DOT_REGEX, "\\1", jsub) + if (jsub!=".SD") jvnames = drop_dot(as.character(jsub)) # jsub is list()ed after it's eval'd inside dogroups. } else jsub = do_j_names(jsub) # else maybe a call to transform or something which returns a list. av = all.vars(jsub,TRUE) # TRUE fixes bug #1294 which didn't see b in j=fns[[b]](c) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 62eecec926..0c6494d594 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -16370,7 +16370,7 @@ test(2120.07, iDT[(i_id), order(e_date, e_time)], c(3L,4L,1L,2L)) # wrapping wi test(2120.08, tmp[iDT[(i_id), order(e_date, e_time)]], # different result with the NA data.table(i_id=c("A",NA,"B","C"), N=c(5L,NA,5L,5L))) -# auto-name j even in complicated expressions, #2478 +# auto-name .() when it's the last item of {...} or wrapped with if(), #2478 #609 DT = data.table(a = c(1, 1, 2), b = 4:6) test(2121.1, DT[ , {b = b; .(a, b = b + 1)}], DT[ , .(a, b=b+1)]) test(2121.2, DT[ , {{{b = b; .(a, b = b + 1)}}}], DT[ , .(a, b=b+1)])