From fc03664cb6edfa569f88bbd64c4b1efc226c8eee Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger Date: Fri, 30 Jul 2021 20:46:09 +0200 Subject: [PATCH 01/16] added ghead/gtail support of n>1 --- NEWS.md | 2 + R/data.table.R | 26 ++++- inst/tests/tests.Rraw | 30 ++++- src/gsumm.c | 258 +++++++++++++++++++++++++++++++++++++++++- 4 files changed, 307 insertions(+), 9 deletions(-) diff --git a/NEWS.md b/NEWS.md index 5d6a7da3ea..c31945bf4d 100644 --- a/NEWS.md +++ b/NEWS.md @@ -93,6 +93,8 @@ 13. `fifelse()` now coerces logical `NA` to other types and the `na` argument supports vectorized input, [#4277](https://github.com/Rdatatable/data.table/issues/4277) [#4286](https://github.com/Rdatatable/data.table/issues/4286) [#4287](https://github.com/Rdatatable/data.table/issues/4287). Thanks to @michaelchirico and @shrektan for reporting, and @shrektan for implementing. +14. `ghead(n)` / `gtail(n)` now also accepct values for n > 1, [#5060](https://github.com/Rdatatable/data.table/issues/5060) [#523](https://github.com/Rdatatable/data.table/issues/523#issuecomment-162934391). Thanks to @jangorecki and @myoung3 for reporting, and Benjamin Schwendinger for the PR. + ## BUG FIXES 1. `by=.EACHI` when `i` is keyed but `on=` different columns than `i`'s key could create an invalidly keyed result, [#4603](https://github.com/Rdatatable/data.table/issues/4603) [#4911](https://github.com/Rdatatable/data.table/issues/4911). Thanks to @myoung3 and @adamaltmejd for reporting, and @ColeMiller1 for the PR. An invalid key is where a `data.table` is marked as sorted by the key columns but the data is not sorted by those columns, leading to incorrect results from subsequent queries. diff --git a/R/data.table.R b/R/data.table.R index c15d65f034..f95b3b9e7a 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -1733,13 +1733,13 @@ replace_dot_alias = function(e) { # is.symbol() is for #1369, #1974 and #2949 if (!(is.call(q) && is.symbol(q[[1L]]) && is.symbol(q[[2L]]) && (q1 <- q[[1L]]) %chin% gfuns)) return(FALSE) if (!(q2 <- q[[2L]]) %chin% names(SDenv$.SDall) && q2 != ".I") return(FALSE) # 875 - if ((length(q)==2L || (!is.null(names(q)) && startsWith(names(q)[3L], "na"))) && (!q1 %chin% c("head","tail"))) return(TRUE) + if ((length(q)==2L || (!is.null(names(q)) && startsWith(names(q)[3L], "na")))) return(TRUE) # ^^ base::startWith errors on NULL unfortunately # head-tail uses default value n=6 which as of now should not go gforce ... ^^ # otherwise there must be three arguments, and only in two cases: # 1) head/tail(x, 1) or 2) x[n], n>0 length(q)==3L && length(q3 <- q[[3L]])==1L && is.numeric(q3) && - ( (q1 %chin% c("head", "tail") && q3==1L) || ((q1 == "[" || (q1 == "[[" && eval(call('is.atomic', q[[2L]]), envir=x))) && q3>0L) ) + ( (q1 %chin% c("head", "tail")) || ((q1 == "[" || (q1 == "[[" && eval(call('is.atomic', q[[2L]]), envir=x))) && q3>0L) ) } if (jsub[[1L]]=="list") { GForce = TRUE @@ -1755,6 +1755,8 @@ replace_dot_alias = function(e) { if (length(jsub[[ii]])==3L) jsub[[ii]][[3L]] = eval(jsub[[ii]][[3L]], parent.frame()) # tests 1187.2 & 1187.4 } else { + # adding argument to ghead/gtail if none is supplied to g-optimized head/tail + if (length(jsub) == 2L && jsub[[1L]] %chin% c("head", "tail")) jsub[["n"]] = 6L jsub[[1L]] = as.name(paste0("g", jsub[[1L]])) if (length(jsub)==3L) jsub[[3L]] = eval(jsub[[3L]], parent.frame()) # tests 1187.3 & 1187.5 } @@ -1834,6 +1836,26 @@ replace_dot_alias = function(e) { ans = gforce(thisEnv, jsub, o__, f__, len__, irows) # irows needed for #971. gi = if (length(o__)) o__[f__] else f__ g = lapply(grpcols, function(i) groups[[i]][gi]) + + # adding ghead/gtail(n) support for n > 1 #5060 #523 + q3 = 0 + if (!is.symbol(jsub)) { + headTail_arg = function(q) { + if (length(q)==3L && length(q3 <- q[[3L]])==1L && is.numeric(q3) && + (q1 <- q[[1L]]) %chin% c("ghead", "gtail") && q3!=1) q3 + else 0 + } + + if (jsub[[1L]] == "list"){ + q3 = max(sapply(jsub, headTail_arg)) + } else if (length(jsub)==3L) { + q3 = headTail_arg(jsub) + } + } + if (q3 > 0) { + grplens = pmin.int(q3, len__) + g = list(rep(unlist(g), grplens)) + } ans = c(g, ans) } else { ans = .Call(Cdogroups, x, xcols, groups, grpcols, jiscols, xjiscols, grporder, o__, f__, len__, jsub, SDenv, cols, newnames, !missing(on), verbose) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index aa654a236c..588395587f 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -8094,7 +8094,8 @@ test(1579.19, dt[, tail(.SD,1L), by=x], dt[, utils::tail(.SD,1L), by=x]) test(1579.20, dt[, tail(.SD,1L), keyby=x], dt[, utils::tail(.SD,1L), keyby=x]) test(1579.21, dt[, tail(.SD,1L), keyby=x], dt[, utils::tail(.SD,1L), keyby=x]) # GForce _doesn't_ work when n > 1 -test(1579.22, dt[ , tail(.SD, 2), by = x, verbose = TRUE], output = 'GForce FALSE') +# removed test since GForce now works for head/tail with n > 1 +# test(1579.22, dt[ , tail(.SD, 2), by = x, verbose = TRUE], output = 'GForce FALSE') mysub <- function(x, n) x[n] test(1579.23, dt[, .SD[2], by=x], dt[, mysub(.SD,2), by=x]) @@ -14654,11 +14655,11 @@ DT = data.table(a=c(rep(1L, 7L), rep(2L, 5L)), b=1:12, d=12:1) test(2018.1, DT[, head(.SD), a, verbose=TRUE], data.table(a=c(rep(1L, 6L), rep(2L, 5L)), b=c(1:6, 8:12), d=c(12:7, 5:1)), output=c("lapply optimization changed j from 'head(.SD)' to 'list(head(b, n = 6L), head(d, n = 6L))'", - "GForce is on, left j unchanged")) + "GForce optimized j to 'list(ghead(b, n = 6L), ghead(d, n = 6L))'")) test(2018.2, DT[, head(b), a, verbose=TRUE], data.table(a=c(rep(1L, 6L), rep(2L, 5L)), V1=c(1:6, 8:12)), output=c("lapply optimization is on, j unchanged as 'head(b)'", - "GForce is on, left j unchanged")) + "GForce optimized j to 'ghead(b, n = 6L)'")) test(2018.3, DT[, tail(.SD), a], data.table(a=c(rep(1L, 6L), rep(2L, 5L)), b=c(2:7, 8:12), d=c(11:6, 5:1))) test(2018.4, DT[, tail(b), a], data.table(a=c(rep(1L, 6L), rep(2L, 5L)), V1=c(2:7, 8:12))) # gforce tests coverage @@ -17802,3 +17803,26 @@ test(2199.1, as.data.table(as.list(1:2))[, .SD,.SDcols=(-1L)], data.table(V2=2 test(2199.2, as.data.table(as.list(1:2))[, .SD,.SDcols=(-(1L))], data.table(V2=2L)) test(2199.3, as.data.table(as.list(1:3))[, .SD,.SDcols=(-1L)], data.table(V2=2L, V3=3L)) test(2199.4, data.table(V1=-1L, V2=-2L, V3=-3L)[,.SD,.SDcols=-V2:-V1], error="not found") + +# gforce head/tail for n > 1 +dt = data.table(x = sample(letters[1:5], 20, TRUE), + i1 = sample(-10:10, 20, TRUE), + i2 = sample(c(-10:10, NA), 20, TRUE), + d1 = as.numeric(sample(-10:10, 20, TRUE)), + d2 = as.numeric(sample(c(-10:10, NA), 20, TRUE))) +test(2200.01, dt[, head(.SD,2), by=x], dt[, utils::head(.SD,2), by=x]) +test(2200.02, dt[, head(.SD,2), by=x], dt[, utils::head(.SD,2), by=x]) +test(2200.03, dt[, head(.SD,2), keyby=x], dt[, utils::head(.SD,2), keyby=x]) +test(2200.04, dt[, head(.SD,2), keyby=x], dt[, utils::head(.SD,2), keyby=x]) +test(2200.05, dt[, head(.SD,5), by=x], dt[, utils::head(.SD,5), by=x]) +test(2200.06, dt[, head(.SD,5), by=x], dt[, utils::head(.SD,5), by=x]) +test(2200.07, dt[, head(.SD,5), keyby=x], dt[, utils::head(.SD,5), keyby=x]) +test(2200.08, dt[, head(.SD,5), keyby=x], dt[, utils::head(.SD,5), keyby=x]) +test(2200.11, dt[, tail(.SD,2), by=x], dt[, utils::tail(.SD,2), by=x]) +test(2200.12, dt[, tail(.SD,2), by=x], dt[, utils::tail(.SD,2), by=x]) +test(2200.13, dt[, tail(.SD,2), keyby=x], dt[, utils::tail(.SD,2), keyby=x]) +test(2200.14, dt[, tail(.SD,2), keyby=x], dt[, utils::tail(.SD,2), keyby=x]) +test(2200.15, dt[, tail(.SD,5), by=x], dt[, utils::tail(.SD,5), by=x]) +test(2200.16, dt[, tail(.SD,5), by=x], dt[, utils::tail(.SD,5), by=x]) +test(2200.17, dt[, tail(.SD,5), keyby=x], dt[, utils::tail(.SD,5), keyby=x]) +test(2200.18, dt[, tail(.SD,5), keyby=x], dt[, utils::tail(.SD,5), keyby=x]) \ No newline at end of file diff --git a/src/gsumm.c b/src/gsumm.c index 651f1c3385..eacac95b14 100644 --- a/src/gsumm.c +++ b/src/gsumm.c @@ -1186,13 +1186,263 @@ SEXP gfirst(SEXP x) { } SEXP gtail(SEXP x, SEXP valArg) { - if (!isInteger(valArg) || LENGTH(valArg)!=1 || INTEGER(valArg)[0]!=1) error(_("Internal error, gtail is only implemented for n=1. This should have been caught before. please report to data.table issue tracker.")); // # nocov - return (glast(x)); + if (!isInteger(valArg) || LENGTH(valArg)!=1) error(_("Internal error, gtail is only implemented for n>0. This should have been caught before. please report to data.table issue tracker.")); // # nocov + const int val=INTEGER(valArg)[0]; + if (val == 1) return (glast(x)); + + const int n = (irowslen == -1) ? length(x) : irowslen; + if (nrow != n) error(_("nrow [%d] != length(x) [%d] in %s"), nrow, n, "gtail"); + + int anslen = 0; + for (int i=0; i grpsize[i]) anslen += grpsize[i]; else anslen += val; + } + SEXP ans; + switch(TYPEOF(x)) { + case LGLSXP: { + const int *ix = LOGICAL(x); + ans = PROTECT(allocVector(LGLSXP, anslen)); + int *ians = LOGICAL(ans); + int offset = 0; + for (int i=0; i thisgrpsize ? thisgrpsize : val); + for (int j=0; j thisgrpsize-1) break; + int k = ff[i]+thisgrpsize-2-j; + if (isunsorted) k = oo[k]-1; + k = (irowslen == -1) ? k : irows[k]-1; + ians[offset+(rev-1-j)] = ix[k]; + } + if (val > thisgrpsize) offset += thisgrpsize; else offset += val; + } + } + break; + case INTSXP: { + const int *ix = INTEGER(x); + ans = PROTECT(allocVector(INTSXP, anslen)); + int *ians = INTEGER(ans); + int offset = 0; + for (int i=0; i thisgrpsize ? thisgrpsize : val); + for (int j=0; j thisgrpsize-1) break; + int k = ff[i]+thisgrpsize-2-j; + if (isunsorted) k = oo[k]-1; + k = (irowslen == -1) ? k : irows[k]-1; + ians[offset+(rev-1-j)] = ix[k]; + } + if (val > thisgrpsize) offset += thisgrpsize; else offset += val; + } + } + break; + case REALSXP: { + const double *dx = REAL(x); + ans = PROTECT(allocVector(REALSXP, anslen)); + double *dans = REAL(ans); + int offset = 0; + for (int i=0; i thisgrpsize ? thisgrpsize : val); + for (int j=0; j thisgrpsize-1) break; + int k = ff[i]+thisgrpsize-2-j; + if (isunsorted) k = oo[k]-1; + k = (irowslen == -1) ? k : irows[k]-1; + dans[offset+(rev-1-j)] = dx[k]; + } + if (val > thisgrpsize) offset += thisgrpsize; else offset += val; + } + } + break; + case CPLXSXP: { + const Rcomplex *dx = COMPLEX(x); + ans = PROTECT(allocVector(CPLXSXP, anslen)); + Rcomplex *dans = COMPLEX(ans); + int offset = 0; + for (int i=0; i thisgrpsize ? thisgrpsize : val); + for (int j=0; j thisgrpsize-1) break; + int k = ff[i]+thisgrpsize-2-j; + if (isunsorted) k = oo[k]-1; + k = (irowslen == -1) ? k : irows[k]-1; + dans[offset+(rev-1-j)] = dx[k]; + } + if (val > thisgrpsize) offset += thisgrpsize; else offset += val; + } + } + break; + case STRSXP: { + ans = PROTECT(allocVector(STRSXP, anslen)); + int offset = 0; + for (int i=0; i thisgrpsize ? thisgrpsize : val); + for (int j=0; j thisgrpsize-1) break; + int k = ff[i]+thisgrpsize-2-j; + if (isunsorted) k = oo[k]-1; + k = (irowslen == -1) ? k : irows[k]-1; + SET_STRING_ELT(ans, offset+(rev-1-j), STRING_ELT(x, k)); + } + if (val > thisgrpsize) offset += thisgrpsize; else offset += val; + } + } + break; + case VECSXP: { + ans = PROTECT(allocVector(VECSXP, anslen)); + int offset = 0; + for (int i=0; i thisgrpsize ? thisgrpsize : val); + for (int j=0; j thisgrpsize-1) break; + int k = ff[i]+thisgrpsize-2-j; + if (isunsorted) k = oo[k]-1; + k = (irowslen == -1) ? k : irows[k]-1; + SET_VECTOR_ELT(ans, offset+(rev-1-j), VECTOR_ELT(x, k)); + } + if (val > thisgrpsize) offset += thisgrpsize; else offset += val; + } + } + break; + default: + error(_("Type '%s' not supported by GForce gtail. Either add the prefix utils::tail(.) or turn off GForce optimization using options(datatable.optimize=1)"), type2char(TYPEOF(x))); + } + copyMostAttrib(x, ans); + UNPROTECT(1); + return(ans); + } SEXP ghead(SEXP x, SEXP valArg) { - if (!isInteger(valArg) || LENGTH(valArg)!=1 || INTEGER(valArg)[0]!=1) error(_("Internal error, ghead is only implemented for n=1. This should have been caught before. please report to data.table issue tracker.")); // # nocov - return (gfirst(x)); + if (!isInteger(valArg) || LENGTH(valArg)!=1) error(_("Internal error, ghead is only implemented for n>0. This should have been caught before. please report to data.table issue tracker.")); // # nocov + const int val=INTEGER(valArg)[0]; + if (val == 1) return (gfirst(x)); + + const int n = (irowslen == -1) ? length(x) : irowslen; + if (nrow != n) error(_("nrow [%d] != length(x) [%d] in %s"), nrow, n, "ghead"); + + int anslen = 0; + for (int i=0; i thisgrpsize) anslen += thisgrpsize; else anslen += val; + } + SEXP ans; + switch(TYPEOF(x)) { + case LGLSXP: { + const int *ix = LOGICAL(x); + ans = PROTECT(allocVector(LGLSXP, anslen)); + int *ians = LOGICAL(ans); + int offset = 0; + for (int i=0; i thisgrpsize-1) break; + int k = ff[i]+j-1; + if (isunsorted) k = oo[k]-1; + k = (irowslen == -1) ? k : irows[k]-1; + ians[offset+j] = ix[k]; + } + if (val > thisgrpsize) offset += thisgrpsize; else offset += val; + } + } + break; + case INTSXP: { + const int *ix = INTEGER(x); + ans = PROTECT(allocVector(INTSXP, anslen)); + int *ians = INTEGER(ans); + int offset = 0; + for (int i=0; i thisgrpsize-1) break; + int k = ff[i]+j-1; + if (isunsorted) k = oo[k]-1; + k = (irowslen == -1) ? k : irows[k]-1; + ians[offset+j] = ix[k]; + } + if (val > thisgrpsize) offset += thisgrpsize; else offset += val; + } + } + break; + case REALSXP: { + const double *dx = REAL(x); + ans = PROTECT(allocVector(REALSXP, anslen)); + double *dans = REAL(ans); + int offset = 0; + for (int i=0; i thisgrpsize-1) break; + int k = ff[i]+j-1; + if (isunsorted) k = oo[k]-1; + k = (irowslen == -1) ? k : irows[k]-1; + dans[offset+j] = dx[k]; + } + if (val > thisgrpsize) offset += thisgrpsize; else offset += val; + } + } + break; + case CPLXSXP: { + const Rcomplex *dx = COMPLEX(x); + ans = PROTECT(allocVector(CPLXSXP, anslen)); + Rcomplex *dans = COMPLEX(ans); + int offset = 0; + for (int i=0; i thisgrpsize-1) break; + int k = ff[i]+j-1; + if (isunsorted) k = oo[k]-1; + k = (irowslen == -1) ? k : irows[k]-1; + dans[offset+j] = dx[k]; + } + if (val > thisgrpsize) offset += thisgrpsize; else offset += val; + } + } + break; + case STRSXP: { + ans = PROTECT(allocVector(STRSXP, anslen)); + int offset = 0; + for (int i=0; i thisgrpsize-1) break; + int k = ff[i]+j-1; + if (isunsorted) k = oo[k]-1; + k = (irowslen == -1) ? k : irows[k]-1; + SET_STRING_ELT(ans, offset+j, STRING_ELT(x, k)); + } + if (val > thisgrpsize) offset += thisgrpsize; else offset += val; + } + } + break; + case VECSXP: { + ans = PROTECT(allocVector(VECSXP, anslen)); + int offset = 0; + for (int i=0; i thisgrpsize-1) break; + int k = ff[i]+j-1; + if (isunsorted) k = oo[k]-1; + k = (irowslen == -1) ? k : irows[k]-1; + SET_VECTOR_ELT(ans, offset+j, VECTOR_ELT(x, k)); + } + if (val > thisgrpsize) offset += thisgrpsize; else offset += val; + } + } + break; + default: + error(_("Type '%s' not supported by GForce ghead. Either add the prefix utils::head(.) or turn off GForce optimization using options(datatable.optimize=1)"), type2char(TYPEOF(x))); + } + copyMostAttrib(x, ans); + UNPROTECT(1); + return(ans); } SEXP gnthvalue(SEXP x, SEXP valArg) { From 805849f3ece233e3e4a9b6adecd14b62b8577363 Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Tue, 24 Aug 2021 23:06:52 -0600 Subject: [PATCH 02/16] macro'd to avoid repeated logic and for easier maintenance/extension --- src/gsumm.c | 149 +++++++++++++++++----------------------------------- 1 file changed, 49 insertions(+), 100 deletions(-) diff --git a/src/gsumm.c b/src/gsumm.c index bee1be31b0..cd8791f625 100644 --- a/src/gsumm.c +++ b/src/gsumm.c @@ -904,11 +904,10 @@ static SEXP gfirstlast(SEXP x, const bool first, const int w, const bool headw) // w: which item (1 other than for gnthvalue when could be >1) // headw: select 1:w of each group when first=true, and (n-w+1):n when first=false (i.e. tail) const bool nosubset = irowslen == -1; + const bool issorted = !isunsorted; // make a const-bool for use inside loops const int n = nosubset ? length(x) : irowslen; - SEXP ans; if (nrow != n) error(_("nrow [%d] != length(x) [%d] in %s"), nrow, n, first?"gfirst":"glast"); - const bool gnth = w>1 && !headw; // const bool to avoid fetching grpsize[i] when not needed - if (w==1 && headw) error(_("Internal error: headw should only be true when w>1")); + if (w==1 && headw) error(_("Internal error: gfirstlast headw should only be true when w>1")); int anslen = ngrp; if (headw) { anslen = 0; @@ -916,105 +915,55 @@ static SEXP gfirstlast(SEXP x, const bool first, const int w, const bool headw) anslen += MIN(w, grpsize[i]); } } + SEXP ans = PROTECT(allocVector(TYPEOF(x), anslen)); int ansi = 0; - switch(TYPEOF(x)) { - case LGLSXP: { - const int *xd = LOGICAL(x); - ans = PROTECT(allocVector(LGLSXP, anslen)); - int *ansd = LOGICAL(ans); - for (int i=0; igrpn) { ansd[ansi++]=NA_LOGICAL; continue; } - const int thisn = headw ? MIN(w,grpn) : 1; - int jstart = ff[i]-1 + (first ? (!headw)*(w-1) : (grpn-thisn)); - const int jend = jstart+thisn; - for (int j=jstart; jgrpn) { ansd[ansi++]=NA_INTEGER; continue; } - const int thisn = headw ? MIN(w,grpn) : 1; - int jstart = ff[i]-1 + (first ? (!headw)*(w-1) : (grpn-thisn)); - const int jend = jstart+thisn; - for (int j=jstart; jgrpn) { ansd[ansi++]=NA_REAL; continue; } - const int thisn = headw ? MIN(w,grpn) : 1; - int jstart = ff[i]-1 + (first ? (!headw)*(w-1) : (grpn-thisn)); - const int jend = jstart+thisn; - for (int j=jstart; j1 */ \ + for (int i=0; i1 && first) { \ + /* gnthvalue */ \ + for (int i=0; igrpn) { const CTYPE val=RNA; ASSIGN; continue; } \ + const int j = ff[i]-1+w-1; \ + const int k = issorted ? j : oo[j]-1; \ + const CTYPE val = nosubset ? xd[k] : (irows[k]==NA_INTEGER ? RNA : xd[irows[k]-1]); \ + ASSIGN; \ + } \ + } else { \ + /* w>1 && !first not supported because -i in R means everything-but-i and gnthvalue */ \ + /* currently takes n>0 only. However, we could still support n'th from the end, somehow */ \ + error(_("Internal error: unanticipated case in gfirstlast first=%d w=%d headw=%d"), \ + first, w, headw); \ + } \ } - break; - case CPLXSXP: { - const Rcomplex *xd = COMPLEX(x); - ans = PROTECT(allocVector(CPLXSXP, anslen)); - Rcomplex *ansd = COMPLEX(ans); - for (int i=0; igrpn) { ansd[ansi++]=NA_CPLX; continue; } - const int thisn = headw ? MIN(w,grpn) : 1; - int jstart = ff[i]-1 + (first ? (!headw)*(w-1) : (grpn-thisn)); - const int jend = jstart+thisn; - for (int j=jstart; jgrpn) { SET_STRING_ELT(ans, ansi++, NA_STRING); continue; } - const int thisn = headw ? MIN(w,grpn) : 1; - int jstart = ff[i]-1 + (first ? (!headw)*(w-1) : (grpn-thisn)); - const int jend = jstart+thisn; - for (int j=jstart; jgrpn) { SET_VECTOR_ELT(ans, ansi++, ScalarLogical(NA_LOGICAL)); continue; } - const int thisn = headw ? MIN(w,grpn) : 1; - int jstart = ff[i]-1 + (first ? (!headw)*(w-1) : (grpn-thisn)); - const int jend = jstart+thisn; - for (int j=jstart; j Date: Tue, 24 Aug 2021 23:15:46 -0600 Subject: [PATCH 03/16] remove identical tests --- inst/tests/tests.Rraw | 22 +++++++--------------- 1 file changed, 7 insertions(+), 15 deletions(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index fa5a6383c2..e7a684b30f 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -18043,19 +18043,11 @@ dt = data.table(x = sample(letters[1:5], 20, TRUE), d1 = as.numeric(sample(-10:10, 20, TRUE)), d2 = as.numeric(sample(c(-10:10, NA), 20, TRUE))) test(2212.01, dt[, head(.SD,2), by=x], dt[, utils::head(.SD,2), by=x]) -test(2212.02, dt[, head(.SD,2), by=x], dt[, utils::head(.SD,2), by=x]) -test(2212.03, dt[, head(.SD,2), keyby=x], dt[, utils::head(.SD,2), keyby=x]) -test(2212.04, dt[, head(.SD,2), keyby=x], dt[, utils::head(.SD,2), keyby=x]) -test(2212.05, dt[, head(.SD,5), by=x], dt[, utils::head(.SD,5), by=x]) -test(2212.06, dt[, head(.SD,5), by=x], dt[, utils::head(.SD,5), by=x]) -test(2212.07, dt[, head(.SD,5), keyby=x], dt[, utils::head(.SD,5), keyby=x]) -test(2212.08, dt[, head(.SD,5), keyby=x], dt[, utils::head(.SD,5), keyby=x]) -test(2212.11, dt[, tail(.SD,2), by=x], dt[, utils::tail(.SD,2), by=x]) -test(2212.12, dt[, tail(.SD,2), by=x], dt[, utils::tail(.SD,2), by=x]) -test(2212.13, dt[, tail(.SD,2), keyby=x], dt[, utils::tail(.SD,2), keyby=x]) -test(2212.14, dt[, tail(.SD,2), keyby=x], dt[, utils::tail(.SD,2), keyby=x]) -test(2212.15, dt[, tail(.SD,5), by=x], dt[, utils::tail(.SD,5), by=x]) -test(2212.16, dt[, tail(.SD,5), by=x], dt[, utils::tail(.SD,5), by=x]) -test(2212.17, dt[, tail(.SD,5), keyby=x], dt[, utils::tail(.SD,5), keyby=x]) -test(2212.18, dt[, tail(.SD,5), keyby=x], dt[, utils::tail(.SD,5), keyby=x]) +test(2212.02, dt[, head(.SD,2), keyby=x], dt[, utils::head(.SD,2), keyby=x]) +test(2212.03, dt[, head(.SD,5), by=x], dt[, utils::head(.SD,5), by=x]) +test(2212.04, dt[, head(.SD,5), keyby=x], dt[, utils::head(.SD,5), keyby=x]) +test(2212.05, dt[, tail(.SD,2), by=x], dt[, utils::tail(.SD,2), by=x]) +test(2212.06, dt[, tail(.SD,2), keyby=x], dt[, utils::tail(.SD,2), keyby=x]) +test(2212.07, dt[, tail(.SD,5), by=x], dt[, utils::tail(.SD,5), by=x]) +test(2212.08, dt[, tail(.SD,5), keyby=x], dt[, utils::tail(.SD,5), keyby=x]) From 35f9f4a4aba1df73fab6745ee002f1bd293abcca Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Tue, 24 Aug 2021 23:17:01 -0600 Subject: [PATCH 04/16] dt renamed DT in tests --- inst/tests/tests.Rraw | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index e7a684b30f..8a5c28e596 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -18037,17 +18037,17 @@ for (col in c("a","b","c")) { } # gforce head/tail for n>1, #5060 -dt = data.table(x = sample(letters[1:5], 20, TRUE), +DT = data.table(x = sample(letters[1:5], 20, TRUE), i1 = sample(-10:10, 20, TRUE), i2 = sample(c(-10:10, NA), 20, TRUE), d1 = as.numeric(sample(-10:10, 20, TRUE)), d2 = as.numeric(sample(c(-10:10, NA), 20, TRUE))) -test(2212.01, dt[, head(.SD,2), by=x], dt[, utils::head(.SD,2), by=x]) -test(2212.02, dt[, head(.SD,2), keyby=x], dt[, utils::head(.SD,2), keyby=x]) -test(2212.03, dt[, head(.SD,5), by=x], dt[, utils::head(.SD,5), by=x]) -test(2212.04, dt[, head(.SD,5), keyby=x], dt[, utils::head(.SD,5), keyby=x]) -test(2212.05, dt[, tail(.SD,2), by=x], dt[, utils::tail(.SD,2), by=x]) -test(2212.06, dt[, tail(.SD,2), keyby=x], dt[, utils::tail(.SD,2), keyby=x]) -test(2212.07, dt[, tail(.SD,5), by=x], dt[, utils::tail(.SD,5), by=x]) -test(2212.08, dt[, tail(.SD,5), keyby=x], dt[, utils::tail(.SD,5), keyby=x]) +test(2212.01, DT[, head(.SD,2), by=x], DT[, utils::head(.SD,2), by=x]) +test(2212.02, DT[, head(.SD,2), keyby=x], DT[, utils::head(.SD,2), keyby=x]) +test(2212.03, DT[, head(.SD,5), by=x], DT[, utils::head(.SD,5), by=x]) +test(2212.04, DT[, head(.SD,5), keyby=x], DT[, utils::head(.SD,5), keyby=x]) +test(2212.05, DT[, tail(.SD,2), by=x], DT[, utils::tail(.SD,2), by=x]) +test(2212.06, DT[, tail(.SD,2), keyby=x], DT[, utils::tail(.SD,2), keyby=x]) +test(2212.07, DT[, tail(.SD,5), by=x], DT[, utils::tail(.SD,5), by=x]) +test(2212.08, DT[, tail(.SD,5), keyby=x], DT[, utils::tail(.SD,5), keyby=x]) From 4fa5fe7edb631d0d30f67873ebd570f5d96c2302 Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Tue, 24 Aug 2021 23:39:22 -0600 Subject: [PATCH 05/16] more NA in tests and added integer64 --- inst/tests/tests.Rraw | 29 ++++++++++++++++------------- 1 file changed, 16 insertions(+), 13 deletions(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 8a5c28e596..cbb04763f5 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -18037,17 +18037,20 @@ for (col in c("a","b","c")) { } # gforce head/tail for n>1, #5060 -DT = data.table(x = sample(letters[1:5], 20, TRUE), - i1 = sample(-10:10, 20, TRUE), - i2 = sample(c(-10:10, NA), 20, TRUE), - d1 = as.numeric(sample(-10:10, 20, TRUE)), - d2 = as.numeric(sample(c(-10:10, NA), 20, TRUE))) -test(2212.01, DT[, head(.SD,2), by=x], DT[, utils::head(.SD,2), by=x]) -test(2212.02, DT[, head(.SD,2), keyby=x], DT[, utils::head(.SD,2), keyby=x]) -test(2212.03, DT[, head(.SD,5), by=x], DT[, utils::head(.SD,5), by=x]) -test(2212.04, DT[, head(.SD,5), keyby=x], DT[, utils::head(.SD,5), keyby=x]) -test(2212.05, DT[, tail(.SD,2), by=x], DT[, utils::tail(.SD,2), by=x]) -test(2212.06, DT[, tail(.SD,2), keyby=x], DT[, utils::tail(.SD,2), keyby=x]) -test(2212.07, DT[, tail(.SD,5), by=x], DT[, utils::tail(.SD,5), by=x]) -test(2212.08, DT[, tail(.SD,5), keyby=x], DT[, utils::tail(.SD,5), keyby=x]) +set.seed(99) +DT = data.table(x = sample(letters[1:5], 20, TRUE), + i = sample(c(-2L,0L,3L,NA), 20, TRUE), + d = sample(c(1.2,-3.4,5.6,NA), 20, TRUE)) +if (test_bit64) DT[, i64:=sample(as.integer64(c(-2200000000,+2400000000,NA)), 20, TRUE)] +old = options(datatable.optimize=2L) +test(2212.01, DT[, .N, by=x]$N, INT(4,6,5,2,3)) # the smallest group is 2, so n=5 tests n constrained to grpsize +test(2212.02, DT[, head(.SD,2), by=x, verbose=TRUE], DT[, utils::head(.SD,2), by=x], output="optimized.*ghead") +test(2212.03, DT[, head(.SD,2), keyby=x, verbose=TRUE], DT[, utils::head(.SD,2), keyby=x], output="optimized.*ghead") +test(2212.04, DT[, head(.SD,5), by=x, verbose=TRUE], DT[, utils::head(.SD,5), by=x], output="optimized.*ghead") +test(2212.05, DT[, head(.SD,5), keyby=x, verbose=TRUE], DT[, utils::head(.SD,5), keyby=x], output="optimized.*ghead") +test(2212.06, DT[, tail(.SD,2), by=x, verbose=TRUE], DT[, utils::tail(.SD,2), by=x], output="optimized.*gtail") +test(2212.07, DT[, tail(.SD,2), keyby=x, verbose=TRUE], DT[, utils::tail(.SD,2), keyby=x], output="optimized.*gtail") +test(2212.08, DT[, tail(.SD,5), by=x, verbose=TRUE], DT[, utils::tail(.SD,5), by=x], output="optimized.*gtail") +test(2212.09, DT[, tail(.SD,5), keyby=x, verbose=TRUE], DT[, utils::tail(.SD,5), keyby=x], output="optimized.*gtail") +options(old) From 64f019240f7071bd56548381b244d1b00dca6f6e Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Tue, 24 Aug 2021 23:46:39 -0600 Subject: [PATCH 06/16] added char and vec cols to test --- inst/tests/tests.Rraw | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index cbb04763f5..9da737cf5e 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -18040,7 +18040,9 @@ for (col in c("a","b","c")) { set.seed(99) DT = data.table(x = sample(letters[1:5], 20, TRUE), i = sample(c(-2L,0L,3L,NA), 20, TRUE), - d = sample(c(1.2,-3.4,5.6,NA), 20, TRUE)) + d = sample(c(1.2,-3.4,5.6,NA), 20, TRUE), + s = sample(c("foo","bar",NA), 20, TRUE), + l = sample(list(1:3, mean, letters[4:5], NULL), 20, replace=TRUE)) if (test_bit64) DT[, i64:=sample(as.integer64(c(-2200000000,+2400000000,NA)), 20, TRUE)] old = options(datatable.optimize=2L) test(2212.01, DT[, .N, by=x]$N, INT(4,6,5,2,3)) # the smallest group is 2, so n=5 tests n constrained to grpsize From b6cd87f9ddf979e1ea7c9eaa512668908410db4b Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Wed, 25 Aug 2021 00:00:45 -0600 Subject: [PATCH 07/16] more dup tests elsewhere --- inst/tests/tests.Rraw | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 9da737cf5e..b0b39120a4 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -8121,17 +8121,10 @@ test(1579.21, dt[, tail(.SD,1L), keyby=x], dt[, utils::tail(.SD,1L), keyby=x]) # test(1579.22, dt[ , tail(.SD, 2), by = x, verbose = TRUE], output = 'GForce FALSE') mysub <- function(x, n) x[n] -test(1579.23, dt[, .SD[2], by=x], dt[, mysub(.SD,2), by=x]) -test(1579.24, dt[, .SD[2], by=x], dt[, mysub(.SD,2), by=x]) -test(1579.25, dt[, .SD[2], keyby=x], dt[, mysub(.SD,2), keyby=x]) -test(1579.26, dt[, .SD[2], keyby=x], dt[, mysub(.SD,2), keyby=x]) -test(1579.27, dt[, .SD[2L], by=x], dt[, mysub(.SD,2L), by=x]) -test(1579.28, dt[, .SD[2L], by=x], dt[, mysub(.SD,2L), by=x]) -test(1579.29, dt[, .SD[2L], keyby=x], dt[, mysub(.SD,2L), keyby=x]) -test(1579.30, dt[, .SD[2L], keyby=x], dt[, mysub(.SD,2L), keyby=x]) - -ans = capture.output(dt[, .SD[2], by=x, verbose=TRUE]) -test(1579.31, any(grepl("GForce optimized", ans)), TRUE) +test(1579.23, dt[, .SD[2], by=x, verbose=TRUE], dt[, mysub(.SD,2), by=x], output="GForce optimized.*g[[]") +test(1579.24, dt[, .SD[2], keyby=x], dt[, mysub(.SD,2), keyby=x]) +test(1579.25, dt[, .SD[2L], by=x], dt[, mysub(.SD,2L), by=x]) +test(1579.26, dt[, .SD[2L], keyby=x], dt[, mysub(.SD,2L), keyby=x]) options(datatable.optimize = Inf) From 4d718825a4e139d787c4cbc2c4f588910b9b0350 Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Wed, 25 Aug 2021 00:04:29 -0600 Subject: [PATCH 08/16] new test shows gnth on int64 is wrong NA when n>grpsize --- inst/tests/tests.Rraw | 2 ++ 1 file changed, 2 insertions(+) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index b0b39120a4..520b9fdcd9 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -8125,6 +8125,8 @@ test(1579.23, dt[, .SD[2], by=x, verbose=TRUE], dt[, mysub(.SD,2), by=x], outp test(1579.24, dt[, .SD[2], keyby=x], dt[, mysub(.SD,2), keyby=x]) test(1579.25, dt[, .SD[2L], by=x], dt[, mysub(.SD,2L), by=x]) test(1579.26, dt[, .SD[2L], keyby=x], dt[, mysub(.SD,2L), keyby=x]) +test(1579.27, dt[, .SD[15], by=x], dt[, mysub(.SD,2L), by=x]) # 15 tests > grpsize and that NA is correct including for integer64 +test(1579.28, dt[, .SD[15L], keyby=x],dt[, mysub(.SD,2L), keyby=x]) options(datatable.optimize = Inf) From 47cb68a0496f25892401ebe45e6553abce5d4db5 Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Wed, 25 Aug 2021 00:19:11 -0600 Subject: [PATCH 09/16] now tests 1579.27 & 1579.28 do show the wrong NA for int64 (y still had 2 not 15 in prev commit) --- inst/tests/tests.Rraw | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 520b9fdcd9..3b16efa04e 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -8125,8 +8125,8 @@ test(1579.23, dt[, .SD[2], by=x, verbose=TRUE], dt[, mysub(.SD,2), by=x], outp test(1579.24, dt[, .SD[2], keyby=x], dt[, mysub(.SD,2), keyby=x]) test(1579.25, dt[, .SD[2L], by=x], dt[, mysub(.SD,2L), by=x]) test(1579.26, dt[, .SD[2L], keyby=x], dt[, mysub(.SD,2L), keyby=x]) -test(1579.27, dt[, .SD[15], by=x], dt[, mysub(.SD,2L), by=x]) # 15 tests > grpsize and that NA is correct including for integer64 -test(1579.28, dt[, .SD[15L], keyby=x],dt[, mysub(.SD,2L), keyby=x]) +test(1579.27, dt[, .SD[15], by=x], dt[, mysub(.SD,15), by=x]) # 15 tests > grpsize and that NA is correct including for integer64 +test(1579.28, dt[, .SD[15], keyby=x], dt[, mysub(.SD,15), keyby=x]) options(datatable.optimize = Inf) From 7009d1dc74fba92caec72aaf686e21a39f51644e Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Wed, 25 Aug 2021 00:37:40 -0600 Subject: [PATCH 10/16] added int64 branch (easier now macro'd) so new tests 1579.27 & 1579.28 now pass --- src/gsumm.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/src/gsumm.c b/src/gsumm.c index cd8791f625..7470f9f527 100644 --- a/src/gsumm.c +++ b/src/gsumm.c @@ -918,7 +918,7 @@ static SEXP gfirstlast(SEXP x, const bool first, const int w, const bool headw) SEXP ans = PROTECT(allocVector(TYPEOF(x), anslen)); int ansi = 0; #define DO(CTYPE, RTYPE, RNA, ASSIGN) { \ - const CTYPE *xd = RTYPE(x); \ + const CTYPE *xd = (const CTYPE *)RTYPE(x); \ if (headw) { \ /* returning more than 1 per group; w>1 */ \ for (int i=0; i Date: Wed, 25 Aug 2021 00:59:46 -0600 Subject: [PATCH 11/16] grouping by 2 columns fails with invalid 'times' argument; extra test --- inst/tests/tests.Rraw | 2 ++ 1 file changed, 2 insertions(+) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 3b16efa04e..70cacaa5fe 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -18034,6 +18034,7 @@ for (col in c("a","b","c")) { # gforce head/tail for n>1, #5060 set.seed(99) DT = data.table(x = sample(letters[1:5], 20, TRUE), + y = rep.int(1:2, 10), # to test 2 grouping columns get rep'd properly i = sample(c(-2L,0L,3L,NA), 20, TRUE), d = sample(c(1.2,-3.4,5.6,NA), 20, TRUE), s = sample(c("foo","bar",NA), 20, TRUE), @@ -18049,5 +18050,6 @@ test(2212.06, DT[, tail(.SD,2), by=x, verbose=TRUE], DT[, utils::tail(.SD,2), test(2212.07, DT[, tail(.SD,2), keyby=x, verbose=TRUE], DT[, utils::tail(.SD,2), keyby=x], output="optimized.*gtail") test(2212.08, DT[, tail(.SD,5), by=x, verbose=TRUE], DT[, utils::tail(.SD,5), by=x], output="optimized.*gtail") test(2212.09, DT[, tail(.SD,5), keyby=x, verbose=TRUE], DT[, utils::tail(.SD,5), keyby=x], output="optimized.*gtail") +test(2212.10, DT[, tail(.SD,2), by=.(x,y), verbose=TRUE], DT[, utils::tail(.SD,2), keyby=x], output="optimized.*gtail") options(old) From 3926224c14b88f6d534f8bb5fbd0450a0a2987cd Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Wed, 25 Aug 2021 01:05:30 -0600 Subject: [PATCH 12/16] rep on 2 group columns fixed --- R/data.table.R | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/R/data.table.R b/R/data.table.R index 66f81afc71..26c01def71 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -1852,7 +1852,6 @@ replace_dot_alias = function(e) { (q1 <- q[[1L]]) %chin% c("ghead", "gtail") && q3!=1) q3 else 0 } - if (jsub[[1L]] == "list"){ q3 = max(sapply(jsub, headTail_arg)) } else if (length(jsub)==3L) { @@ -1861,7 +1860,7 @@ replace_dot_alias = function(e) { } if (q3 > 0) { grplens = pmin.int(q3, len__) - g = list(rep(unlist(g), grplens)) + g = lapply(g, rep.int, times=grplens) } ans = c(g, ans) } else { From aff4a4756878988047b0f703e90691e2391d5569 Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Wed, 25 Aug 2021 01:06:46 -0600 Subject: [PATCH 13/16] trailing whitespace --- NEWS.md | 2 +- R/data.table.R | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/NEWS.md b/NEWS.md index 8448fefac5..a17ecfd067 100644 --- a/NEWS.md +++ b/NEWS.md @@ -132,7 +132,7 @@ # 1: 1 # 2: 3 ``` - + 24. `DT[, head(.SD,n), by=grp]` (and `tail`) is now optimized when `n>1`, [#5060](https://github.com/Rdatatable/data.table/issues/5060) [#523](https://github.com/Rdatatable/data.table/issues/523#issuecomment-162934391). Thanks to Jan Gorecki and Michael Young for requesting, and Benjamin Schwendinger for the PR. ## BUG FIXES diff --git a/R/data.table.R b/R/data.table.R index 26c01def71..4dfa9c276a 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -809,8 +809,8 @@ replace_dot_alias = function(e) { # when the 'by' expression includes get/mget/eval, all.vars cannot be trusted to infer all used columns, #4981 allbyvars = NULL else - allbyvars = intersect(all.vars(bysub), names_x) - + allbyvars = intersect(all.vars(bysub), names_x) + orderedirows = .Call(CisOrderedSubset, irows, nrow(x)) # TRUE when irows is NULL (i.e. no i clause). Similar but better than is.sorted(f__) bysameorder = byindex = FALSE if (!bysub %iscall% ":" && ##Fix #4285 From b13c3859b13ae4c1c5d5afdccc3de57f1b61404f Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Wed, 25 Aug 2021 01:10:30 -0600 Subject: [PATCH 14/16] fix test --- inst/tests/tests.Rraw | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 70cacaa5fe..997326958c 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -18050,6 +18050,6 @@ test(2212.06, DT[, tail(.SD,2), by=x, verbose=TRUE], DT[, utils::tail(.SD,2), test(2212.07, DT[, tail(.SD,2), keyby=x, verbose=TRUE], DT[, utils::tail(.SD,2), keyby=x], output="optimized.*gtail") test(2212.08, DT[, tail(.SD,5), by=x, verbose=TRUE], DT[, utils::tail(.SD,5), by=x], output="optimized.*gtail") test(2212.09, DT[, tail(.SD,5), keyby=x, verbose=TRUE], DT[, utils::tail(.SD,5), keyby=x], output="optimized.*gtail") -test(2212.10, DT[, tail(.SD,2), by=.(x,y), verbose=TRUE], DT[, utils::tail(.SD,2), keyby=x], output="optimized.*gtail") +test(2212.10, DT[, tail(.SD,2), by=.(x,y), verbose=TRUE], DT[, utils::tail(.SD,2), by=.(x,y)], output="optimized.*gtail") options(old) From cd018e7565031437aca4357b3b36ed84f30b6d86 Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Wed, 25 Aug 2021 01:35:17 -0600 Subject: [PATCH 15/16] move tests up --- inst/tests/tests.Rraw | 49 ++++++++++++++++++++----------------------- 1 file changed, 23 insertions(+), 26 deletions(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 997326958c..32b16e471f 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -8116,18 +8116,37 @@ test(1579.18, dt[, tail(.SD,1L), by=x], dt[, utils::tail(.SD,1L), by=x]) test(1579.19, dt[, tail(.SD,1L), by=x], dt[, utils::tail(.SD,1L), by=x]) test(1579.20, dt[, tail(.SD,1L), keyby=x], dt[, utils::tail(.SD,1L), keyby=x]) test(1579.21, dt[, tail(.SD,1L), keyby=x], dt[, utils::tail(.SD,1L), keyby=x]) -# GForce _doesn't_ work when n > 1 -# removed test since GForce now works for head/tail with n > 1 -# test(1579.22, dt[ , tail(.SD, 2), by = x, verbose = TRUE], output = 'GForce FALSE') +# 1579.22 tested gtail with n>1; now 1579.4+ below mysub <- function(x, n) x[n] test(1579.23, dt[, .SD[2], by=x, verbose=TRUE], dt[, mysub(.SD,2), by=x], output="GForce optimized.*g[[]") test(1579.24, dt[, .SD[2], keyby=x], dt[, mysub(.SD,2), keyby=x]) test(1579.25, dt[, .SD[2L], by=x], dt[, mysub(.SD,2L), by=x]) test(1579.26, dt[, .SD[2L], keyby=x], dt[, mysub(.SD,2L), keyby=x]) -test(1579.27, dt[, .SD[15], by=x], dt[, mysub(.SD,15), by=x]) # 15 tests > grpsize and that NA is correct including for integer64 +test(1579.27, dt[, .SD[15], by=x], dt[, mysub(.SD,15), by=x]) # tests 15 > grpsize and that NA is correct including for integer64 test(1579.28, dt[, .SD[15], keyby=x], dt[, mysub(.SD,15), keyby=x]) +# gforce head/tail for n>1, #5060 +set.seed(99) +DT = data.table(x = sample(letters[1:5], 20, TRUE), + y = rep.int(1:2, 10), # to test 2 grouping columns get rep'd properly + i = sample(c(-2L,0L,3L,NA), 20, TRUE), + d = sample(c(1.2,-3.4,5.6,NA), 20, TRUE), + s = sample(c("foo","bar",NA), 20, TRUE), + l = sample(list(1:3, mean, letters[4:5], NULL), 20, replace=TRUE)) +if (test_bit64) DT[, i64:=sample(as.integer64(c(-2200000000,+2400000000,NA)), 20, TRUE)] +options(datatable.optimize=2L) +test(1579.401, DT[, .N, by=x]$N, INT(4,6,5,2,3)) # the smallest group is 2, so n=5 tests n constrained to grpsize +test(1579.402, DT[, head(.SD,2), by=x, verbose=TRUE], DT[, utils::head(.SD,2), by=x], output="optimized.*ghead") +test(1579.403, DT[, head(.SD,2), keyby=x, verbose=TRUE], DT[, utils::head(.SD,2), keyby=x], output="optimized.*ghead") +test(1579.404, DT[, head(.SD,5), by=x, verbose=TRUE], DT[, utils::head(.SD,5), by=x], output="optimized.*ghead") +test(1579.405, DT[, head(.SD,5), keyby=x, verbose=TRUE], DT[, utils::head(.SD,5), keyby=x], output="optimized.*ghead") +test(1579.406, DT[, tail(.SD,2), by=x, verbose=TRUE], DT[, utils::tail(.SD,2), by=x], output="optimized.*gtail") +test(1579.407, DT[, tail(.SD,2), keyby=x, verbose=TRUE], DT[, utils::tail(.SD,2), keyby=x], output="optimized.*gtail") +test(1579.408, DT[, tail(.SD,5), by=x, verbose=TRUE], DT[, utils::tail(.SD,5), by=x], output="optimized.*gtail") +test(1579.409, DT[, tail(.SD,5), keyby=x, verbose=TRUE], DT[, utils::tail(.SD,5), keyby=x], output="optimized.*gtail") +test(1579.410, DT[, tail(.SD,2), by=.(x,y), verbose=TRUE], DT[, utils::tail(.SD,2), by=.(x,y)], output="optimized.*gtail") + options(datatable.optimize = Inf) # test for #1419, rleid doesn't remove names attribute @@ -18031,25 +18050,3 @@ for (col in c("a","b","c")) { } } -# gforce head/tail for n>1, #5060 -set.seed(99) -DT = data.table(x = sample(letters[1:5], 20, TRUE), - y = rep.int(1:2, 10), # to test 2 grouping columns get rep'd properly - i = sample(c(-2L,0L,3L,NA), 20, TRUE), - d = sample(c(1.2,-3.4,5.6,NA), 20, TRUE), - s = sample(c("foo","bar",NA), 20, TRUE), - l = sample(list(1:3, mean, letters[4:5], NULL), 20, replace=TRUE)) -if (test_bit64) DT[, i64:=sample(as.integer64(c(-2200000000,+2400000000,NA)), 20, TRUE)] -old = options(datatable.optimize=2L) -test(2212.01, DT[, .N, by=x]$N, INT(4,6,5,2,3)) # the smallest group is 2, so n=5 tests n constrained to grpsize -test(2212.02, DT[, head(.SD,2), by=x, verbose=TRUE], DT[, utils::head(.SD,2), by=x], output="optimized.*ghead") -test(2212.03, DT[, head(.SD,2), keyby=x, verbose=TRUE], DT[, utils::head(.SD,2), keyby=x], output="optimized.*ghead") -test(2212.04, DT[, head(.SD,5), by=x, verbose=TRUE], DT[, utils::head(.SD,5), by=x], output="optimized.*ghead") -test(2212.05, DT[, head(.SD,5), keyby=x, verbose=TRUE], DT[, utils::head(.SD,5), keyby=x], output="optimized.*ghead") -test(2212.06, DT[, tail(.SD,2), by=x, verbose=TRUE], DT[, utils::tail(.SD,2), by=x], output="optimized.*gtail") -test(2212.07, DT[, tail(.SD,2), keyby=x, verbose=TRUE], DT[, utils::tail(.SD,2), keyby=x], output="optimized.*gtail") -test(2212.08, DT[, tail(.SD,5), by=x, verbose=TRUE], DT[, utils::tail(.SD,5), by=x], output="optimized.*gtail") -test(2212.09, DT[, tail(.SD,5), keyby=x, verbose=TRUE], DT[, utils::tail(.SD,5), keyby=x], output="optimized.*gtail") -test(2212.10, DT[, tail(.SD,2), by=.(x,y), verbose=TRUE], DT[, utils::tail(.SD,2), by=.(x,y)], output="optimized.*gtail") -options(old) - From fde19a9c643bbdacb91549e6ac641ec28bc3cdf3 Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Wed, 25 Aug 2021 01:55:03 -0600 Subject: [PATCH 16/16] news item tweak --- NEWS.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/NEWS.md b/NEWS.md index a17ecfd067..4c9647cdfb 100644 --- a/NEWS.md +++ b/NEWS.md @@ -133,7 +133,7 @@ # 2: 3 ``` -24. `DT[, head(.SD,n), by=grp]` (and `tail`) is now optimized when `n>1`, [#5060](https://github.com/Rdatatable/data.table/issues/5060) [#523](https://github.com/Rdatatable/data.table/issues/523#issuecomment-162934391). Thanks to Jan Gorecki and Michael Young for requesting, and Benjamin Schwendinger for the PR. +24. `DT[, head(.SD,n), by=grp]` and `tail` are now optimized when `n>1`, [#5060](https://github.com/Rdatatable/data.table/issues/5060) [#523](https://github.com/Rdatatable/data.table/issues/523#issuecomment-162934391). `n==1` was already optimized. Thanks to Jan Gorecki and Michael Young for requesting, and Benjamin Schwendinger for the PR. ## BUG FIXES