From dc006ba51d29ad57410e44490d6f375e4ca0b3ab Mon Sep 17 00:00:00 2001 From: jangorecki Date: Sun, 17 Mar 2019 19:26:00 +0530 Subject: [PATCH 1/8] gforce opts for first-last, head-tail, closes #2030 and #3462 --- NEWS.md | 5 +++++ R/data.table.R | 7 ++++++- inst/tests/tests.Rraw | 9 +++++++++ 3 files changed, 20 insertions(+), 1 deletion(-) diff --git a/NEWS.md b/NEWS.md index 610f678e31..d89e03f875 100644 --- a/NEWS.md +++ b/NEWS.md @@ -34,6 +34,11 @@ 11. A join's result could be incorrectly keyed when a single nomatch occurred at the very beginning while all other values matched, [#3441](https://github.com/Rdatatable/data.table/issues/3441). The incorrect key would cause incorrect results in subsequent queries. Thanks to @symbalex for reporting and @franknarf1 for pinpointing the root cause. +12. `first` and `last` methods called by group inside `[.data.table` on `.SD` are now optimized to GForce. [#2030](https://github.com/Rdatatable/data.table/issues/2030). Thanks to @franknarf1 for reporting. + +13. `head` and `tail` methods called by group inside `[.data.table` on `.SD` now use default `n` argument. [#3462](https://github.com/Rdatatable/data.table/issues/3462). + + #### NOTES 1. When upgrading to 1.12.0 some Windows users might have seen `CdllVersion not found` in some circumstances. We found a way to catch that so the [helpful message](https://twitter.com/MattDowle/status/1084528873549705217) now occurs for those upgrading from versions prior to 1.12.0 too, as well as those upgrading from 1.12.0 to a later version. See item 1 in notes section of 1.12.0 below for more background. diff --git a/R/data.table.R b/R/data.table.R index 2052d1c921..07993a26b8 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -1613,7 +1613,12 @@ chmatch2 <- function(x, table, nomatch=NA_integer_) { jvnames = ansvarsnew } } else if (length(as.character(jsub[[1L]])) == 1L) { # Else expect problems with - if (length(jsub) == 3L && (jsub[[1L]] == "[" || jsub[[1L]] == "head" || jsub[[1L]] == "tail") && jsub[[2L]] == ".SD" && (is.numeric(jsub[[3L]]) || jsub[[3L]] == ".N") ) { + subopt = length(jsub) == 3L && jsub[[1L]] == "[" && (is.numeric(jsub[[3L]]) || jsub[[3L]] == ".N") + headopt = jsub[[1L]] == "head" || jsub[[1L]] == "tail" + firstopt = jsub[[1L]] == "first" || jsub[[1L]] == "last" # fix for #2030 + if ((length(jsub) >= 2L && jsub[[2L]] == ".SD") && + (subopt || headopt || firstopt)) { + if (headopt && length(jsub)==2L) jsub[["n"]] = 6L # head-tail n=6 when missing #3462 # optimise .SD[1] or .SD[2L]. Not sure how to test .SD[a] as to whether a is numeric/integer or a data.table, yet. jsub = as.call(c(quote(list), lapply(ansvarsnew, function(x) { jsub[[2L]] = as.name(x); jsub }))) jvnames = ansvarsnew diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 2993c05a22..10281a36ce 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -13626,6 +13626,15 @@ dx = data.table(id = 1L, key = "id") di = list(z=c(2L, 1L)) test(1999.2, key(dx[di]), NULL) +# gfirst(.SD) throws an error about not using head(.SD, n), but the latter works #2030 +DT = data.table(id = c(1L,1L,2L), v = 1:3) +test(2000.1, DT[, first(.SD), by=id, .SDcols="v"], DT[, head(.SD, 1L), by=id]) +DT = data.table(id = c(1L,1L,2L), v = 1:3, y = 3:1) +test(2000.2, DT[, first(.SD), by=id, .SDcols=c("v","y")], DT[, head(.SD, 1L), by=id, .SDcols=c("v","y")]) +# ghead argument "n" is missing, with no default #3462 +DT = data.table(a=c(rep(1L, 7L), rep(2L, 5L)), b=1:12, d=12:1) +test(2001, DT[, head(.SD), a], data.table(a=c(rep(1L, 6L), rep(2L, 5L)), b=c(1:6, 8:12), d=c(12:7, 5:1))) + ################################### # Add new tests above this line # From f0a0a94e0b740474ecfae3a90e32cef6328a78c6 Mon Sep 17 00:00:00 2001 From: jangorecki Date: Sun, 17 Mar 2019 19:43:52 +0530 Subject: [PATCH 2/8] R dataptrs out of loops in gforce funs --- src/gsumm.c | 63 ++++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 45 insertions(+), 18 deletions(-) diff --git a/src/gsumm.c b/src/gsumm.c index 419d0459e2..6f8dc1f584 100644 --- a/src/gsumm.c +++ b/src/gsumm.c @@ -955,32 +955,41 @@ SEXP glast(SEXP x) { SEXP ans; if (nrow != n) error("nrow [%d] != length(x) [%d] in gtail", nrow, n); switch(TYPEOF(x)) { - case LGLSXP: + case LGLSXP: { + const int *ix = LOGICAL(x); ans = PROTECT(allocVector(LGLSXP, ngrp)); + int *ians = LOGICAL(ans); for (i=0; i grpsize[i]) { LOGICAL(ans)[i] = NA_LOGICAL; continue; } k = ff[i]+val-2; if (isunsorted) k = oo[k]-1; k = (irowslen == -1) ? k : irows[k]-1; - LOGICAL(ans)[i] = LOGICAL(x)[k]; + ians[i] = ix[k]; } + } break; - case INTSXP: + case INTSXP: { + const int *ix = LOGICAL(x); ans = PROTECT(allocVector(INTSXP, ngrp)); + int *ians = LOGICAL(ans); for (i=0; i grpsize[i]) { INTEGER(ans)[i] = NA_INTEGER; continue; } k = ff[i]+val-2; if (isunsorted) k = oo[k]-1; k = (irowslen == -1) ? k : irows[k]-1; - INTEGER(ans)[i] = INTEGER(x)[k]; + ians[i] = ix[k]; } + } break; - case REALSXP: + case REALSXP: { + const double *dx = REAL(x); ans = PROTECT(allocVector(REALSXP, ngrp)); + double *dans = REAL(ans); for (i=0; i grpsize[i]) { REAL(ans)[i] = NA_REAL; continue; } k = ff[i]+val-2; if (isunsorted) k = oo[k]-1; k = (irowslen == -1) ? k : irows[k]-1; - REAL(ans)[i] = REAL(x)[k]; + dans[i] = dx[k]; } + } break; case STRSXP: ans = PROTECT(allocVector(STRSXP, ngrp)); From 0eae0bc2e1382c14543bf3c0657c5d5f112a2f49 Mon Sep 17 00:00:00 2001 From: jangorecki Date: Mon, 18 Mar 2019 09:55:23 +0530 Subject: [PATCH 3/8] coverage gforce --- R/data.table.R | 2 +- inst/tests/tests.Rraw | 26 +++++++++++++++++++++++--- src/gsumm.c | 4 ++-- 3 files changed, 26 insertions(+), 6 deletions(-) diff --git a/R/data.table.R b/R/data.table.R index 07993a26b8..a96c0efff3 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -1717,7 +1717,7 @@ chmatch2 <- function(x, table, nomatch=NA_integer_) { else cat("lapply optimization is on, j unchanged as '",deparse(jsub,width.cutoff=200L),"'\n",sep="") } - dotN <- function(x) if (is.name(x) && x == ".N") TRUE else FALSE # For #5760 + dotN <- function(x) is.name(x) && x == ".N" # For #5760 # FR #971, GForce kicks in on all subsets, no joins yet. Although joins could work with # nomatch=0L even now.. but not switching it on yet, will deal it separately. if (getOption("datatable.optimize")>=2 && !is.data.table(i) && !byjoin && length(f__) && !length(lhs)) { diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 10281a36ce..b5659c4519 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -13629,11 +13629,31 @@ test(1999.2, key(dx[di]), NULL) # gfirst(.SD) throws an error about not using head(.SD, n), but the latter works #2030 DT = data.table(id = c(1L,1L,2L), v = 1:3) test(2000.1, DT[, first(.SD), by=id, .SDcols="v"], DT[, head(.SD, 1L), by=id]) -DT = data.table(id = c(1L,1L,2L), v = 1:3, y = 3:1) -test(2000.2, DT[, first(.SD), by=id, .SDcols=c("v","y")], DT[, head(.SD, 1L), by=id, .SDcols=c("v","y")]) +DT = data.table(id = c(1L,1L,2L), v = 1:3, y = 3:1, z = c(TRUE, TRUE, FALSE), u = c("a","b","c")) +test(2000.2, DT[, first(.SD), by=id, .SDcols=c("v","y","z","u")], DT[, head(.SD, 1L), by=id, .SDcols=c("v","y","z","u")]) +test(2000.3, DT[, last(.SD), by=id, .SDcols=c("v","y","z","u")], DT[, tail(.SD, 1L), by=id, .SDcols=c("v","y","z","u")]) # ghead argument "n" is missing, with no default #3462 DT = data.table(a=c(rep(1L, 7L), rep(2L, 5L)), b=1:12, d=12:1) -test(2001, DT[, head(.SD), a], data.table(a=c(rep(1L, 6L), rep(2L, 5L)), b=c(1:6, 8:12), d=c(12:7, 5:1))) +test(2001.1, DT[, head(.SD), a], data.table(a=c(rep(1L, 6L), rep(2L, 5L)), b=c(1:6, 8:12), d=c(12:7, 5:1))) +# test(2001.2, DT[, head(b), a], data.table(a=c(rep(1L, 6L), rep(2L, 5L)), b=c(1:6, 8:12))) # TODO, requires rework of https://github.com/Rdatatable/data.table/blob/8a355cf653977752f8d39c23cf0ea6a7714851eb/R/data.table.R#L1728 +# gforce tests coverage +if (test_bit64) { + DT = data.table(id=c(rep(1L,3), rep(2L, 3)), v=bit64::as.integer64(c(1:3, 4L, 5:6))) + test(2002.01, DT[2:6, sum(v), id], data.table(id=1:2, V1=bit64::as.integer64(c(5L,15L)))) # gather, case of int64 and irows +} +DT = data.table(id = c(1L,1L,2L), v = c(1i, 2i, 3i)) +test(2002.02, DT[, min(v), by=id], error="not supported by GForce min") +test(2002.03, DT[, max(v), by=id], error="not supported by GForce max") +test(2002.04, DT[, median(v), by=id], error="not supported by GForce median") +test(2002.05, DT[, head(v, 1), by=id], error="not supported by GForce head") +test(2002.06, DT[, tail(v, 1), by=id], error="not supported by GForce tail") +test(2002.07, DT[, v[1], by=id], error="not supported by GForce subset") +test(2002.08, DT[, sd(v), by=id], error="not supported by GForce sd") +test(2002.09, DT[, var(v), by=id], error="not supported by GForce var") +test(2002.10, DT[, prod(v), by=id], error="not supported by GForce prod") +DT = data.table(id = c(1L,1L,2L,2L), v = c(1L, 2L, NA, NA)) +test(2002.11, DT[, median(v), id], data.table(id=1:2, V1=c(1.5, NA))) # median whole group has NAs + ################################### diff --git a/src/gsumm.c b/src/gsumm.c index 6f8dc1f584..0960aea65a 100644 --- a/src/gsumm.c +++ b/src/gsumm.c @@ -496,7 +496,7 @@ SEXP gmean(SEXP x, SEXP narm) break; default: free(s); free(c); - error("Type '%s' not supported by GForce mean (gmean) na.rm=TRUE. Either add the prefix base::mean(.) or turn off GForce optimization using options(datatable.optimize=1)", type2char(TYPEOF(x))); + error("Type '%s' not supported by GForce mean (gmean) na.rm=TRUE. Either add the prefix base::mean(.) or turn off GForce optimization using options(datatable.optimize=1)", type2char(TYPEOF(x))); // # nocov because it already stops at gsum, remove nocov if gmean will support a type that gsum wont } ans = PROTECT(allocVector(REALSXP, ngrp)); for (int i=0; i Date: Mon, 18 Mar 2019 12:38:17 +0530 Subject: [PATCH 4/8] more coverage --- inst/tests/tests.Rraw | 13 ++++++++----- src/gsumm.c | 4 ++-- 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index b5659c4519..4bfe9f5d75 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -13628,14 +13628,18 @@ test(1999.2, key(dx[di]), NULL) # gfirst(.SD) throws an error about not using head(.SD, n), but the latter works #2030 DT = data.table(id = c(1L,1L,2L), v = 1:3) -test(2000.1, DT[, first(.SD), by=id, .SDcols="v"], DT[, head(.SD, 1L), by=id]) +test(2000.1, DT[, first(.SD), by=id, .SDcols="v"], data.table(id=1:2, v=c(1L,3L))) +test(2000.2, DT[, first(v), by=id], data.table(id=1:2, V1=c(1L,3L))) +test(2000.3, DT[, last(v), by=id], data.table(id=1:2, V1=c(2L,3L))) +test(2000.4, DT[, v[1L], by=id], data.table(id=1:2, V1=c(1L,3L))) DT = data.table(id = c(1L,1L,2L), v = 1:3, y = 3:1, z = c(TRUE, TRUE, FALSE), u = c("a","b","c")) -test(2000.2, DT[, first(.SD), by=id, .SDcols=c("v","y","z","u")], DT[, head(.SD, 1L), by=id, .SDcols=c("v","y","z","u")]) -test(2000.3, DT[, last(.SD), by=id, .SDcols=c("v","y","z","u")], DT[, tail(.SD, 1L), by=id, .SDcols=c("v","y","z","u")]) +test(2000.5, DT[, first(.SD), by=id, .SDcols=c("v","y","z","u")], data.table(id=1:2, v=c(1L,3L), y=c(3L,1L), z=c(TRUE,FALSE), u=c("a","c"))) +test(2000.6, DT[, last(.SD), by=id, .SDcols=c("v","y","z","u")], data.table(id=1:2, v=c(2L,3L), y=c(2L,1L), z=c(TRUE,FALSE), u=c("b","c"))) +test(2000.7, DT[, .SD[1L], by=id, .SDcols=c("v","y","z","u")], data.table(id=1:2, v=c(1L,3L), y=c(3L,1L), z=c(TRUE,FALSE), u=c("a","c"))) # ghead argument "n" is missing, with no default #3462 DT = data.table(a=c(rep(1L, 7L), rep(2L, 5L)), b=1:12, d=12:1) test(2001.1, DT[, head(.SD), a], data.table(a=c(rep(1L, 6L), rep(2L, 5L)), b=c(1:6, 8:12), d=c(12:7, 5:1))) -# test(2001.2, DT[, head(b), a], data.table(a=c(rep(1L, 6L), rep(2L, 5L)), b=c(1:6, 8:12))) # TODO, requires rework of https://github.com/Rdatatable/data.table/blob/8a355cf653977752f8d39c23cf0ea6a7714851eb/R/data.table.R#L1728 +# test(2001.2, DT[, head(b), a], data.table(a=c(rep(1L, 6L), rep(2L, 5L)), b=c(1:6, 8:12))) # TODO, requires rework of 8a355cf/R/data.table.R#L1728 # gforce tests coverage if (test_bit64) { DT = data.table(id=c(rep(1L,3), rep(2L, 3)), v=bit64::as.integer64(c(1:3, 4L, 5:6))) @@ -13655,7 +13659,6 @@ DT = data.table(id = c(1L,1L,2L,2L), v = c(1L, 2L, NA, NA)) test(2002.11, DT[, median(v), id], data.table(id=1:2, V1=c(1.5, NA))) # median whole group has NAs - ################################### # Add new tests above this line # ################################### diff --git a/src/gsumm.c b/src/gsumm.c index 0960aea65a..725f08978b 100644 --- a/src/gsumm.c +++ b/src/gsumm.c @@ -495,8 +495,8 @@ SEXP gmean(SEXP x, SEXP narm) } break; default: - free(s); free(c); - error("Type '%s' not supported by GForce mean (gmean) na.rm=TRUE. Either add the prefix base::mean(.) or turn off GForce optimization using options(datatable.optimize=1)", type2char(TYPEOF(x))); // # nocov because it already stops at gsum, remove nocov if gmean will support a type that gsum wont + free(s); free(c); // # nocov because it already stops at gsum, remove nocov if gmean will support a type that gsum wont + error("Type '%s' not supported by GForce mean (gmean) na.rm=TRUE. Either add the prefix base::mean(.) or turn off GForce optimization using options(datatable.optimize=1)", type2char(TYPEOF(x))); // # nocov } ans = PROTECT(allocVector(REALSXP, ngrp)); for (int i=0; i Date: Mon, 18 Mar 2019 21:18:42 +0530 Subject: [PATCH 5/8] missing n, handle non-SD head-tail input #3462 --- R/data.table.R | 2 +- inst/tests/tests.Rraw | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/R/data.table.R b/R/data.table.R index a96c0efff3..8cc1e86a20 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -1735,7 +1735,7 @@ chmatch2 <- function(x, table, nomatch=NA_integer_) { # Need is.symbol() check. See #1369, #1974 or #2949 issues and explanation below by searching for one of these issues. cond = is.call(q) && is.symbol(q[[1]]) && (q1c <- as.character(q[[1]])) %chin% gfuns && !is.call(q[[2L]]) # run GForce for simple f(x) calls and f(x, na.rm = TRUE)-like calls - ans = cond && (length(q)==2L || identical("na",substring(names(q)[3L], 1L, 2L))) + ans = cond && (length(q)==2L || identical("na",substring(names(q)[3L], 1L, 2L))) && (!q1c %chin% c("head","tail")) # head-tail uses default value n=6 which as of now should not go gforce if (identical(ans, TRUE)) return(ans) # otherwise there must be three arguments, and only in two cases -- # 1) head/tail(x, 1) or 2) x[n], n>0 diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 4bfe9f5d75..6e7a42f965 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -13639,7 +13639,9 @@ test(2000.7, DT[, .SD[1L], by=id, .SDcols=c("v","y","z","u")], data.table(id=1:2 # ghead argument "n" is missing, with no default #3462 DT = data.table(a=c(rep(1L, 7L), rep(2L, 5L)), b=1:12, d=12:1) test(2001.1, DT[, head(.SD), a], data.table(a=c(rep(1L, 6L), rep(2L, 5L)), b=c(1:6, 8:12), d=c(12:7, 5:1))) -# test(2001.2, DT[, head(b), a], data.table(a=c(rep(1L, 6L), rep(2L, 5L)), b=c(1:6, 8:12))) # TODO, requires rework of 8a355cf/R/data.table.R#L1728 +test(2001.2, DT[, head(b), a], data.table(a=c(rep(1L, 6L), rep(2L, 5L)), V1=c(1:6, 8:12))) +test(2001.3, DT[, tail(.SD), a], data.table(a=c(rep(1L, 6L), rep(2L, 5L)), b=c(2:7, 8:12), d=c(11:6, 5:1))) +test(2001.4, DT[, tail(b), a], data.table(a=c(rep(1L, 6L), rep(2L, 5L)), V1=c(2:7, 8:12))) # gforce tests coverage if (test_bit64) { DT = data.table(id=c(rep(1L,3), rep(2L, 3)), v=bit64::as.integer64(c(1:3, 4L, 5:6))) From 3389fd2441aad6075cf3de598fb1e4e8f90521c6 Mon Sep 17 00:00:00 2001 From: mattdowle Date: Mon, 1 Apr 2019 14:29:11 -0700 Subject: [PATCH 6/8] Added 'complex' to the error strings to test that part of error message construction --- inst/tests/tests.Rraw | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index f84b45d726..f51c6046c0 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -13947,15 +13947,15 @@ if (test_bit64) { test(2019.01, DT[2:6, sum(v), id], data.table(id=1:2, V1=bit64::as.integer64(c(5L,15L)))) # gather, case of int64 and irows } DT = data.table(id = c(1L,1L,2L), v = c(1i, 2i, 3i)) -test(2019.02, DT[, min(v), by=id], error="not supported by GForce min") -test(2019.03, DT[, max(v), by=id], error="not supported by GForce max") -test(2019.04, DT[, median(v), by=id], error="not supported by GForce median") -test(2019.05, DT[, head(v, 1), by=id], error="not supported by GForce head") -test(2019.06, DT[, tail(v, 1), by=id], error="not supported by GForce tail") -test(2019.07, DT[, v[1], by=id], error="not supported by GForce subset") -test(2019.08, DT[, sd(v), by=id], error="not supported by GForce sd") -test(2019.09, DT[, var(v), by=id], error="not supported by GForce var") -test(2019.10, DT[, prod(v), by=id], error="not supported by GForce prod") +test(2019.02, DT[, min(v), by=id], error="'complex' not supported by GForce min") +test(2019.03, DT[, max(v), by=id], error="'complex' not supported by GForce max") +test(2019.04, DT[, median(v), by=id], error="'complex' not supported by GForce median") +test(2019.05, DT[, head(v, 1), by=id], error="'complex' not supported by GForce head") +test(2019.06, DT[, tail(v, 1), by=id], error="'complex' not supported by GForce tail") +test(2019.07, DT[, v[1], by=id], error="'complex' not supported by GForce subset") +test(2019.08, DT[, sd(v), by=id], error="'complex' not supported by GForce sd") +test(2019.09, DT[, var(v), by=id], error="'complex' not supported by GForce var") +test(2019.10, DT[, prod(v), by=id], error="'complex' not supported by GForce prod") DT = data.table(id = c(1L,1L,2L,2L), v = c(1L, 2L, NA, NA)) test(2019.11, DT[, median(v), id], data.table(id=1:2, V1=c(1.5, NA))) # median whole group has NAs From 42d0fa72615fc8c5fbd6b7dc888092eab2f43d51 Mon Sep 17 00:00:00 2001 From: mattdowle Date: Mon, 1 Apr 2019 15:18:13 -0700 Subject: [PATCH 7/8] added checks on optimization messages, to act as documentation/reminders in the tests.Raw file too --- NEWS.md | 2 +- inst/tests/tests.Rraw | 52 ++++++++++++++++++++++++++----------------- 2 files changed, 33 insertions(+), 21 deletions(-) diff --git a/NEWS.md b/NEWS.md index a128cb717d..c73c869c32 100644 --- a/NEWS.md +++ b/NEWS.md @@ -8,7 +8,7 @@ #### BUG FIXES -1. `head` and `tail` by group were optimized but no longer require `n` to be provided (the default is n=6L), [#3462](https://github.com/Rdatatable/data.table/issues/3462). +1. `head` and `tail` by group no longer error when `n=` is not provided (i.e. the default `n=6L` now works), [#3462](https://github.com/Rdatatable/data.table/issues/3462). #### NOTES diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index f51c6046c0..5e08636260 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -13927,37 +13927,49 @@ unlink(tt) # gfirst(.SD) throws an error about not using head(.SD, n), but the latter works #2030 DT = data.table(id = c(1L,1L,2L), v = 1:3) -test(2017.1, DT[, first(.SD), by=id, .SDcols="v"], data.table(id=1:2, v=c(1L,3L))) -test(2017.2, DT[, first(v), by=id], data.table(id=1:2, V1=c(1L,3L))) -test(2017.3, DT[, last(v), by=id], data.table(id=1:2, V1=c(2L,3L))) -test(2017.4, DT[, v[1L], by=id], data.table(id=1:2, V1=c(1L,3L))) +test(2017.1, DT[, first(.SD), by=id, .SDcols="v", verbose=TRUE], data.table(id=1:2, v=c(1L,3L)), output="optimized j to 'list(gfirst(v))'") +test(2017.2, DT[, first(v), by=id, verbose=TRUE], data.table(id=1:2, V1=c(1L,3L)), output="optimized j to 'gfirst(v)'") +test(2017.3, DT[, last(v), by=id, verbose=TRUE], data.table(id=1:2, V1=c(2L,3L)), output="optimized j to 'glast(v)'") +test(2017.4, DT[, v[1L], by=id, verbose=TRUE], data.table(id=1:2, V1=c(1L,3L)), output="optimized j to '`g[`(v, 1L)'") DT = data.table(id = c(1L,1L,2L), v = 1:3, y = 3:1, z = c(TRUE, TRUE, FALSE), u = c("a","b","c")) -test(2017.5, DT[, first(.SD), by=id, .SDcols=c("v","y","z","u")], data.table(id=1:2, v=c(1L,3L), y=c(3L,1L), z=c(TRUE,FALSE), u=c("a","c"))) -test(2017.6, DT[, last(.SD), by=id, .SDcols=c("v","y","z","u")], data.table(id=1:2, v=c(2L,3L), y=c(2L,1L), z=c(TRUE,FALSE), u=c("b","c"))) -test(2017.7, DT[, .SD[1L], by=id, .SDcols=c("v","y","z","u")], data.table(id=1:2, v=c(1L,3L), y=c(3L,1L), z=c(TRUE,FALSE), u=c("a","c"))) +test(2017.5, DT[, first(.SD), by=id, .SDcols=c("v","y","z","u"), verbose=TRUE], + data.table(id=1:2, v=c(1L,3L), y=c(3L,1L), z=c(TRUE,FALSE), u=c("a","c")), + output="optimized j to 'list(gfirst(v), gfirst(y), gfirst(z), gfirst(u))'") +test(2017.6, DT[, last(.SD), by=id, .SDcols=c("v","y","z","u"), verbose=TRUE], + data.table(id=1:2, v=c(2L,3L), y=c(2L,1L), z=c(TRUE,FALSE), u=c("b","c")), + output="optimized j to 'list(glast(v), glast(y), glast(z), glast(u))'") +test(2017.7, DT[, .SD[1L], by=id, .SDcols=c("v","y","z","u"), verbose=TRUE], + data.table(id=1:2, v=c(1L,3L), y=c(3L,1L), z=c(TRUE,FALSE), u=c("a","c")), + output="optimized j to 'list(`g[`(v, 1L), `g[`(y, 1L), `g[`(z, 1L), `g[`(u, 1L))'") # ghead argument "n" is missing, with no default #3462 DT = data.table(a=c(rep(1L, 7L), rep(2L, 5L)), b=1:12, d=12:1) -test(2018.1, DT[, head(.SD), a], data.table(a=c(rep(1L, 6L), rep(2L, 5L)), b=c(1:6, 8:12), d=c(12:7, 5:1))) -test(2018.2, DT[, head(b), a], data.table(a=c(rep(1L, 6L), rep(2L, 5L)), V1=c(1:6, 8:12))) +test(2018.1, DT[, head(.SD), a, verbose=TRUE], + data.table(a=c(rep(1L, 6L), rep(2L, 5L)), b=c(1:6, 8:12), d=c(12:7, 5:1)), + output=c("lapply optimization changed j from 'head(.SD)' to 'list(head(b, n = 6L), head(d, n = 6L))'", + "GForce is on, left j unchanged")) +test(2018.2, DT[, head(b), a, verbose=TRUE], + data.table(a=c(rep(1L, 6L), rep(2L, 5L)), V1=c(1:6, 8:12)), + output=c("lapply optimization is on, j unchanged as 'head(b)'", + "GForce is on, left j unchanged")) test(2018.3, DT[, tail(.SD), a], data.table(a=c(rep(1L, 6L), rep(2L, 5L)), b=c(2:7, 8:12), d=c(11:6, 5:1))) test(2018.4, DT[, tail(b), a], data.table(a=c(rep(1L, 6L), rep(2L, 5L)), V1=c(2:7, 8:12))) # gforce tests coverage if (test_bit64) { DT = data.table(id=c(rep(1L,3), rep(2L, 3)), v=bit64::as.integer64(c(1:3, 4L, 5:6))) - test(2019.01, DT[2:6, sum(v), id], data.table(id=1:2, V1=bit64::as.integer64(c(5L,15L)))) # gather, case of int64 and irows + test(2019, DT[2:6, sum(v), id], data.table(id=1:2, V1=bit64::as.integer64(c(5L,15L)))) # gather, case of int64 and irows } DT = data.table(id = c(1L,1L,2L), v = c(1i, 2i, 3i)) -test(2019.02, DT[, min(v), by=id], error="'complex' not supported by GForce min") -test(2019.03, DT[, max(v), by=id], error="'complex' not supported by GForce max") -test(2019.04, DT[, median(v), by=id], error="'complex' not supported by GForce median") -test(2019.05, DT[, head(v, 1), by=id], error="'complex' not supported by GForce head") -test(2019.06, DT[, tail(v, 1), by=id], error="'complex' not supported by GForce tail") -test(2019.07, DT[, v[1], by=id], error="'complex' not supported by GForce subset") -test(2019.08, DT[, sd(v), by=id], error="'complex' not supported by GForce sd") -test(2019.09, DT[, var(v), by=id], error="'complex' not supported by GForce var") -test(2019.10, DT[, prod(v), by=id], error="'complex' not supported by GForce prod") +test(2020.01, DT[, min(v), by=id], error="'complex' not supported by GForce min") +test(2020.02, DT[, max(v), by=id], error="'complex' not supported by GForce max") +test(2020.03, DT[, median(v), by=id], error="'complex' not supported by GForce median") +test(2020.04, DT[, head(v, 1), by=id], error="'complex' not supported by GForce head") +test(2020.05, DT[, tail(v, 1), by=id], error="'complex' not supported by GForce tail") +test(2020.06, DT[, v[1], by=id], error="'complex' not supported by GForce subset") +test(2020.07, DT[, sd(v), by=id], error="'complex' not supported by GForce sd") +test(2020.08, DT[, var(v), by=id], error="'complex' not supported by GForce var") +test(2020.09, DT[, prod(v), by=id], error="'complex' not supported by GForce prod") DT = data.table(id = c(1L,1L,2L,2L), v = c(1L, 2L, NA, NA)) -test(2019.11, DT[, median(v), id], data.table(id=1:2, V1=c(1.5, NA))) # median whole group has NAs +test(2020.10, DT[, median(v), id], data.table(id=1:2, V1=c(1.5, NA))) # median whole group has NAs ################################### From 94835225a4ebc4af738dcf50db70598af286b76e Mon Sep 17 00:00:00 2001 From: mattdowle Date: Mon, 1 Apr 2019 15:25:46 -0700 Subject: [PATCH 8/8] news item --- NEWS.md | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/NEWS.md b/NEWS.md index c73c869c32..cd462c3663 100644 --- a/NEWS.md +++ b/NEWS.md @@ -4,11 +4,9 @@ #### NEW FEATURES -1. `first` and `last` by group are now optimized, [#2030](https://github.com/Rdatatable/data.table/issues/2030). Thanks to @franknarf1 for reporting. - #### BUG FIXES -1. `head` and `tail` by group no longer error when `n=` is not provided (i.e. the default `n=6L` now works), [#3462](https://github.com/Rdatatable/data.table/issues/3462). +1. `first`, `last`, `head` and `tail` by group no longer error in some cases, [#2030](https://github.com/Rdatatable/data.table/issues/2030) [#3462](https://github.com/Rdatatable/data.table/issues/3462). Thanks to @franknarf1 for reporting. #### NOTES