From a4205e499c23ec09256c51ec760daf4260f1f97a Mon Sep 17 00:00:00 2001 From: jangorecki Date: Thu, 1 Aug 2019 10:21:57 +0200 Subject: [PATCH 1/5] fix autonaaming issue in groupings sets for special symbols, closes #3653 --- NEWS.md | 2 ++ R/groupingsets.R | 13 ++++++++++--- inst/tests/tests.Rraw | 10 ++++++++++ 3 files changed, 22 insertions(+), 3 deletions(-) diff --git a/NEWS.md b/NEWS.md index 411a589857..1ff2688fb6 100644 --- a/NEWS.md +++ b/NEWS.md @@ -198,6 +198,8 @@ 24. `column not found` could incorrectly occur in rare non-equi-join cases, [#3635](https://github.com/Rdatatable/data.table/issues/3635). Thanks to @UweBlock for the report. +25. `groupingsets` functions will now properly handle alone special symbols when using an empty set to group by, [#3653](https://github.com/Rdatatable/data.table/issues/3653). Thanks to @Henrik-P for the report. + #### NOTES 1. `rbindlist`'s `use.names="check"` now emits its message for automatic column names (`"V[0-9]+"`) too, [#3484](https://github.com/Rdatatable/data.table/pull/3484). See news item 5 of v1.12.2 below. diff --git a/R/groupingsets.R b/R/groupingsets.R index e1dda42654..954d007034 100644 --- a/R/groupingsets.R +++ b/R/groupingsets.R @@ -67,6 +67,7 @@ groupingsets.data.table = function(x, j, by, sets, .SDcols, id = FALSE, jj, ...) # input arguments handling jj = if (!missing(jj)) jj else substitute(j) av = all.vars(jj, TRUE) + jsym = if (length(av)==1L) {if (".N"==av) "N" else if (".I"==av) "I" else if (".GRP"==av) "GRP"} # workaround for autonamed columns in grand total #3653 if (":=" %chin% av) stop("Expression passed to grouping sets function must not update by reference. Use ':=' on results of your grouping function.") if (missing(.SDcols)) @@ -97,13 +98,19 @@ groupingsets.data.table = function(x, j, by, sets, .SDcols, id = FALSE, jj, ...) stop("Using integer64 class columns require to have 'bit64' package installed.") # nocov int64.by.cols = intersect(int64.cols, by) # aggregate function called for each grouping set - aggregate.set = function(by.set) { + aggregate.set = function(by.set, jsym) { if (length(by.set)) { r = if (length(.SDcols)) x[, eval(jj), by.set, .SDcols=.SDcols] else x[, eval(jj), by.set] } else { + ## workaround for grand total single var as data.table too, change to drop=FALSE after #648 solved r = if (length(.SDcols)) x[, eval(jj), .SDcols=.SDcols] else x[, eval(jj)] - # workaround for grand total single var as data.table too, change to drop=FALSE after #648 solved if (!is.data.table(r)) r = setDT(list(r)) + if (!is.null(jsym)) { + if (!"V1" %in% names(r)) + stop("internal error in groupingsets, V1 is not present, please report") # nocov + else + setnames(r, "V1", jsym) + } } if (id) { # integer bit mask of aggregation levels: http://www.postgresql.org/docs/9.5/static/functions-aggregate.html#FUNCTIONS-GROUPING-TABLE @@ -121,6 +128,6 @@ groupingsets.data.table = function(x, j, by, sets, .SDcols, id = FALSE, jj, ...) # actually processing everything here rbindlist(c( list(empty), # 0 rows template for colorder and type - lapply(sets, aggregate.set) # all aggregations + lapply(sets, aggregate.set, jsym) # all aggregations ), use.names=TRUE, fill=TRUE) } diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index dbfb740e69..f396f29635 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -15428,6 +15428,16 @@ test(2071.10, dcast(data.table(a=1, b=1, l=list(list(1))), a ~ b, value.var='l') test(2071.11, dcast(data.table(a = 1, b = 2, c = 3), a ~ b, value.var = 'c', fill = '2'), data.table(a=1, `2`=3, key='a')) +# groupingsets j=.N by character(0) set #3653 +d = data.table(x = c("a", "a", "b")) +test(2073.01, groupingsets(d, j = .N, by = "x", sets = list("x", character())), data.table(x=c("a","b",NA_character_), N=c(2L,1L,3L))) +test(2073.02, groupingsets(d, j = .N, by = "x", sets = list(character())), data.table(x=NA_character_, N=3L)) +test(2073.03, groupingsets(d, j = .GRP, by = "x", sets = list("x", character())), data.table(x=c("a","b",NA_character_), GRP=c(1L,2L,1L))) +test(2073.04, groupingsets(d, j = .GRP, by = "x", sets = list(character())), data.table(x=NA_character_, GRP=1L)) +test(2073.05, groupingsets(d, j = .I, by = "x", sets = list("x", character())), data.table(x=c("a","a","b",rep(NA_character_,3L)), I=c(1:3,1:3))) +test(2073.06, groupingsets(d, j = .I, by = "x", sets = list(character())), data.table(x=rep(NA_character_,3L), I=1:3)) + + ################################### # Add new tests above this line # ################################### From eb07b988ee70d0363c2dcff2a89e98c59a6ae850 Mon Sep 17 00:00:00 2001 From: jangorecki Date: Mon, 5 Aug 2019 23:26:21 +0200 Subject: [PATCH 2/5] remove workarounds thanks to #3270 --- R/groupingsets.R | 27 +++------------------------ 1 file changed, 3 insertions(+), 24 deletions(-) diff --git a/R/groupingsets.R b/R/groupingsets.R index 954d007034..3cc66bac8c 100644 --- a/R/groupingsets.R +++ b/R/groupingsets.R @@ -67,21 +67,12 @@ groupingsets.data.table = function(x, j, by, sets, .SDcols, id = FALSE, jj, ...) # input arguments handling jj = if (!missing(jj)) jj else substitute(j) av = all.vars(jj, TRUE) - jsym = if (length(av)==1L) {if (".N"==av) "N" else if (".I"==av) "I" else if (".GRP"==av) "GRP"} # workaround for autonamed columns in grand total #3653 if (":=" %chin% av) stop("Expression passed to grouping sets function must not update by reference. Use ':=' on results of your grouping function.") if (missing(.SDcols)) .SDcols = if (".SD" %chin% av) setdiff(names(x), by) else NULL # 0 rows template data.table to keep colorder and type - if (length(by)) { - empty = if (length(.SDcols)) x[0L, eval(jj), by, .SDcols=.SDcols] else x[0L, eval(jj), by] - } else { - empty = if (length(.SDcols)) x[0L, eval(jj), .SDcols=.SDcols] else x[0L, eval(jj)] - if (!is.data.table(empty)) { - if (length(empty)>0) empty = empty[0L] # fix for #3173 when no grouping and j constant - empty = setDT(list(empty)) # improve after #648, see comment in aggregate.set - } - } + empty = if (length(.SDcols)) x[0L, eval(jj), by, .SDcols=.SDcols] else x[0L, eval(jj), by] if (id && "grouping" %chin% names(empty)) # `j` could have been evaluated to `grouping` field stop("When using `id=TRUE` the 'j' expression must not evaluate to column named 'grouping'.") if (anyDuplicated(names(empty)) > 0L) @@ -99,19 +90,7 @@ groupingsets.data.table = function(x, j, by, sets, .SDcols, id = FALSE, jj, ...) int64.by.cols = intersect(int64.cols, by) # aggregate function called for each grouping set aggregate.set = function(by.set, jsym) { - if (length(by.set)) { - r = if (length(.SDcols)) x[, eval(jj), by.set, .SDcols=.SDcols] else x[, eval(jj), by.set] - } else { - ## workaround for grand total single var as data.table too, change to drop=FALSE after #648 solved - r = if (length(.SDcols)) x[, eval(jj), .SDcols=.SDcols] else x[, eval(jj)] - if (!is.data.table(r)) r = setDT(list(r)) - if (!is.null(jsym)) { - if (!"V1" %in% names(r)) - stop("internal error in groupingsets, V1 is not present, please report") # nocov - else - setnames(r, "V1", jsym) - } - } + r = if (length(.SDcols)) x[, eval(jj), by.set, .SDcols=.SDcols] else x[, eval(jj), by.set] if (id) { # integer bit mask of aggregation levels: http://www.postgresql.org/docs/9.5/static/functions-aggregate.html#FUNCTIONS-GROUPING-TABLE # 3267: strtoi("", base = 2L) output apparently unstable across platforms @@ -128,6 +107,6 @@ groupingsets.data.table = function(x, j, by, sets, .SDcols, id = FALSE, jj, ...) # actually processing everything here rbindlist(c( list(empty), # 0 rows template for colorder and type - lapply(sets, aggregate.set, jsym) # all aggregations + lapply(sets, aggregate.set) ), use.names=TRUE, fill=TRUE) } From 34be1179008bcbd10f32bb9571c8c29716b6f904 Mon Sep 17 00:00:00 2001 From: jangorecki Date: Mon, 5 Aug 2019 23:44:11 +0200 Subject: [PATCH 3/5] rm leftover --- R/groupingsets.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/groupingsets.R b/R/groupingsets.R index 3cc66bac8c..87d7ec015b 100644 --- a/R/groupingsets.R +++ b/R/groupingsets.R @@ -89,7 +89,7 @@ groupingsets.data.table = function(x, j, by, sets, .SDcols, id = FALSE, jj, ...) stop("Using integer64 class columns require to have 'bit64' package installed.") # nocov int64.by.cols = intersect(int64.cols, by) # aggregate function called for each grouping set - aggregate.set = function(by.set, jsym) { + aggregate.set = function(by.set) { r = if (length(.SDcols)) x[, eval(jj), by.set, .SDcols=.SDcols] else x[, eval(jj), by.set] if (id) { # integer bit mask of aggregation levels: http://www.postgresql.org/docs/9.5/static/functions-aggregate.html#FUNCTIONS-GROUPING-TABLE From 857c6aa06eb93ee17cf76846757f30a7f81109e3 Mon Sep 17 00:00:00 2001 From: jangorecki Date: Mon, 5 Aug 2019 23:44:40 +0200 Subject: [PATCH 4/5] keepp comment --- R/groupingsets.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/groupingsets.R b/R/groupingsets.R index 87d7ec015b..9f2a6b63fb 100644 --- a/R/groupingsets.R +++ b/R/groupingsets.R @@ -107,6 +107,6 @@ groupingsets.data.table = function(x, j, by, sets, .SDcols, id = FALSE, jj, ...) # actually processing everything here rbindlist(c( list(empty), # 0 rows template for colorder and type - lapply(sets, aggregate.set) + lapply(sets, aggregate.set) # all aggregations ), use.names=TRUE, fill=TRUE) } From b8ad229e37569ba426bb96158d87d0da1bf2a0df Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Fri, 23 Aug 2019 20:23:52 -0700 Subject: [PATCH 5/5] merge test number increasing --- inst/tests/tests.Rraw | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 43acdaf4bd..af01a1addd 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -15738,15 +15738,14 @@ if (test_bit64) { test(2083.2, rbind(data.table(a=1:2, b=as.integer64(c(1,NA))), data.table(a=3L), fill=TRUE)$b, as.integer64(c(1, NA, NA))) } - # groupingsets j=.N by character(0) set #3653 d = data.table(x = c("a", "a", "b")) -test(2073.01, groupingsets(d, j = .N, by = "x", sets = list("x", character())), data.table(x=c("a","b",NA_character_), N=c(2L,1L,3L))) -test(2073.02, groupingsets(d, j = .N, by = "x", sets = list(character())), data.table(x=NA_character_, N=3L)) -test(2073.03, groupingsets(d, j = .GRP, by = "x", sets = list("x", character())), data.table(x=c("a","b",NA_character_), GRP=c(1L,2L,1L))) -test(2073.04, groupingsets(d, j = .GRP, by = "x", sets = list(character())), data.table(x=NA_character_, GRP=1L)) -test(2073.05, groupingsets(d, j = .I, by = "x", sets = list("x", character())), data.table(x=c("a","a","b",rep(NA_character_,3L)), I=c(1:3,1:3))) -test(2073.06, groupingsets(d, j = .I, by = "x", sets = list(character())), data.table(x=rep(NA_character_,3L), I=1:3)) +test(2084.01, groupingsets(d, j = .N, by = "x", sets = list("x", character())), data.table(x=c("a","b",NA_character_), N=c(2L,1L,3L))) +test(2084.02, groupingsets(d, j = .N, by = "x", sets = list(character())), data.table(x=NA_character_, N=3L)) +test(2084.03, groupingsets(d, j = .GRP, by = "x", sets = list("x", character())), data.table(x=c("a","b",NA_character_), GRP=c(1L,2L,1L))) +test(2084.04, groupingsets(d, j = .GRP, by = "x", sets = list(character())), data.table(x=NA_character_, GRP=1L)) +test(2084.05, groupingsets(d, j = .I, by = "x", sets = list("x", character())), data.table(x=c("a","a","b",rep(NA_character_,3L)), I=c(1:3,1:3))) +test(2084.06, groupingsets(d, j = .I, by = "x", sets = list(character())), data.table(x=rep(NA_character_,3L), I=1:3)) ###################################