From 8947a0a50321909effef9f5dc111a9e65cb54daf Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Tue, 3 Sep 2019 10:20:09 +0800 Subject: [PATCH 1/3] Correctly deal with {}-wrapped by --- NEWS.md | 2 ++ R/data.table.R | 44 +++++++++++++++++++++---------------------- inst/tests/tests.Rraw | 4 ++++ 3 files changed, 28 insertions(+), 22 deletions(-) diff --git a/NEWS.md b/NEWS.md index 65dfb8f3a8..ea8ce30e58 100644 --- a/NEWS.md +++ b/NEWS.md @@ -259,6 +259,8 @@ 38. `rbindlist` now returns correct idcols for lists with different length vectors, [#3785](https://github.com/Rdatatable/data.table/issues/3785), [#3786](https://github.com/Rdatatable/data.table/pull/3786). Thanks to @shrektan for the report and fix. +39. `by` correctly handles extended expressions in `{`, [#3156](https://github.com/Rdatatable/data.table/issues/3156). Thanks to @tdhock for the report. + #### NOTES 1. `rbindlist`'s `use.names="check"` now emits its message for automatic column names (`"V[0-9]+"`) too, [#3484](https://github.com/Rdatatable/data.table/pull/3484). See news item 5 of v1.12.2 below. diff --git a/R/data.table.R b/R/data.table.R index eb9bc40379..ab02def4b2 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -842,29 +842,29 @@ replace_order = function(isub, verbose, env) { } tt = vapply_1i(byval,length) if (any(tt!=xnrow)) stop("The items in the 'by' or 'keyby' list are length (",paste(tt,collapse=","),"). Each must be length ", xnrow, "; the same length as there are rows in x (after subsetting if i is provided).") - if (is.null(bynames)) bynames = rep.int("",length(byval)) - if (any(bynames=="") && !bynull) { - for (jj in seq_along(bynames)) { - if (bynames[jj]=="") { - # Best guess. Use "month" in the case of by=month(date), use "a" in the case of by=a%%2 - byvars = all.vars(bysubl[[jj+1L]], functions = TRUE) - if (length(byvars) == 1L) tt = byvars - else { - # take the first variable that is (1) not eval (#3758) and (2) starts with a character that can't start a variable name - tt = grep("^eval$|^[^[:alpha:]. ]", byvars, invert=TRUE, value=TRUE) - # byvars but exclude functions or `0`+`1` becomes `+` - tt = if (length(tt)) tt[1L] else all.vars(bysubl[[jj+1L]])[1L] - } - # fix for #497 - if (length(byvars) > 1L && tt %chin% all.vars(jsub, FALSE)) { - bynames[jj] = deparse(bysubl[[jj+1L]]) - if (verbose) - cat("by-expression '", bynames[jj], "' is not named, and the auto-generated name '", tt, - "' clashed with variable(s) in j. Therefore assigning the entire by-expression as name.\n", sep="") - } - else bynames[jj] = tt - # if user doesn't like this inferred name, user has to use by=list() to name the column + if (is.null(bynames)) bynames = rep.int("", length(byval)) + if (length(idx <- which(!nzchar(bynames))) && !bynull) { + # TODO: improve this and unify auto-naming of jsub and bysub + if (is.name(bysubl[[1L]]) && bysubl[[1L]] == '{') bysubl = bysubl[[length(bysubl)]] # fix for #3156 + for (jj in idx) { + # Best guess. Use "month" in the case of by=month(date), use "a" in the case of by=a%%2 + byvars = all.vars(bysubl[[jj+1L]], functions = TRUE) + if (length(byvars) == 1L) tt = byvars + else { + # take the first variable that is (1) not eval (#3758) and (2) starts with a character that can't start a variable name + tt = grep("^eval$|^[^[:alpha:]. ]", byvars, invert=TRUE, value=TRUE) + # byvars but exclude functions or `0`+`1` becomes `+` + tt = if (length(tt)) tt[1L] else all.vars(bysubl[[jj+1L]])[1L] + } + # fix for #497 + if (length(byvars) > 1L && tt %chin% all.vars(jsub, FALSE)) { + bynames[jj] = deparse(bysubl[[jj+1L]]) + if (verbose) + cat("by-expression '", bynames[jj], "' is not named, and the auto-generated name '", tt, + "' clashed with variable(s) in j. Therefore assigning the entire by-expression as name.\n", sep="") } + else bynames[jj] = tt + # if user doesn't like this inferred name, user has to use by=list() to name the column } # Fix for #1334 if (any(duplicated(bynames))) { diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 78514cae0f..cd680cd6f6 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -15870,6 +15870,10 @@ test(2097.1, DT[.(NA_character_)], data.table(fac=NA_character_, v=2L, key="fac" test(2097.2, DT[J(NA)], error="Factor columns must join to factor or character columns") test(2097.3, DT[J(NA_integer_)], error="Factor columns must join to factor or character columns") +# bug in autonaming by #3156 +library(data.table) +DT = data.table(State=c("ERROR", "COMPLETED", "ERROR"), ExitCode=c(1, 0, 2)) +test(2098, DT[ , list(count=.N), by={list(State, ExitCode)}], DT[ , count := 1L]) ################################### # Add new tests above this line # From 1730b2d1fc0f904d698b66920e3a5542804cc6ff Mon Sep 17 00:00:00 2001 From: mattdowle Date: Wed, 11 Sep 2019 18:53:15 -0700 Subject: [PATCH 2/3] news item tweak, and dummy comment to help visual diff show indent-only changes --- NEWS.md | 2 +- R/data.table.R | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/NEWS.md b/NEWS.md index 491c514f24..7a559dc5f8 100644 --- a/NEWS.md +++ b/NEWS.md @@ -302,7 +302,7 @@ 41. Using `get`/`mget` in `j` could cause `.SDcols` to be ignored or reordered, [#1744](https://github.com/Rdatatable/data.table/issues/1744), [#1965](https://github.com/Rdatatable/data.table/issues/1965), [#2036](https://github.com/Rdatatable/data.table/issues/2036), and [#2946](https://github.com/Rdatatable/data.table/issues/2946). Thanks @franknarf1, @MichaelChirico, @TonyBonen, and Steffen J. (StackOverflow) for the reports. -39. `by` correctly handles extended expressions in `{`, [#3156](https://github.com/Rdatatable/data.table/issues/3156). Thanks to @tdhock for the report. +42. `DT[...,by={...}]` now handles expressions in `{`, [#3156](https://github.com/Rdatatable/data.table/issues/3156). Thanks to @tdhock for the report. #### NOTES diff --git a/R/data.table.R b/R/data.table.R index 033602ae6f..7c87fefb4f 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -817,7 +817,7 @@ replace_dot_alias = function(e) { } tt = vapply_1i(byval,length) if (any(tt!=xnrow)) stop("The items in the 'by' or 'keyby' list are length (",paste(tt,collapse=","),"). Each must be length ", xnrow, "; the same length as there are rows in x (after subsetting if i is provided).") - if (is.null(bynames)) bynames = rep.int("", length(byval)) + if (is.null(bynames)) bynames = rep.int("",length(byval)) if (length(idx <- which(!nzchar(bynames))) && !bynull) { # TODO: improve this and unify auto-naming of jsub and bysub if (is.name(bysubl[[1L]]) && bysubl[[1L]] == '{') bysubl = bysubl[[length(bysubl)]] # fix for #3156 @@ -830,14 +830,14 @@ replace_dot_alias = function(e) { tt = grep("^eval$|^[^[:alpha:]. ]", byvars, invert=TRUE, value=TRUE) # byvars but exclude functions or `0`+`1` becomes `+` tt = if (length(tt)) tt[1L] else all.vars(bysubl[[jj+1L]])[1L] - } + } # dummy comment to help visual diff show just indent changes; can delete this dummy comment # fix for #497 if (length(byvars) > 1L && tt %chin% all.vars(jsub, FALSE)) { bynames[jj] = deparse(bysubl[[jj+1L]]) if (verbose) cat("by-expression '", bynames[jj], "' is not named, and the auto-generated name '", tt, "' clashed with variable(s) in j. Therefore assigning the entire by-expression as name.\n", sep="") - } + } # dummy comment to help visual diff show just indent changes; can delete this dummy comment else bynames[jj] = tt # if user doesn't like this inferred name, user has to use by=list() to name the column } From 22651b27dae86c09fcfe9f063dd2fd71010cc667 Mon Sep 17 00:00:00 2001 From: mattdowle Date: Wed, 11 Sep 2019 19:00:48 -0700 Subject: [PATCH 3/3] remove dummy comments --- R/data.table.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/R/data.table.R b/R/data.table.R index 7c87fefb4f..a8498899f5 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -830,14 +830,14 @@ replace_dot_alias = function(e) { tt = grep("^eval$|^[^[:alpha:]. ]", byvars, invert=TRUE, value=TRUE) # byvars but exclude functions or `0`+`1` becomes `+` tt = if (length(tt)) tt[1L] else all.vars(bysubl[[jj+1L]])[1L] - } # dummy comment to help visual diff show just indent changes; can delete this dummy comment + } # fix for #497 if (length(byvars) > 1L && tt %chin% all.vars(jsub, FALSE)) { bynames[jj] = deparse(bysubl[[jj+1L]]) if (verbose) cat("by-expression '", bynames[jj], "' is not named, and the auto-generated name '", tt, "' clashed with variable(s) in j. Therefore assigning the entire by-expression as name.\n", sep="") - } # dummy comment to help visual diff show just indent changes; can delete this dummy comment + } else bynames[jj] = tt # if user doesn't like this inferred name, user has to use by=list() to name the column }