diff --git a/NEWS.md b/NEWS.md index c91e72b81c..4c9647cdfb 100644 --- a/NEWS.md +++ b/NEWS.md @@ -133,6 +133,8 @@ # 2: 3 ``` +24. `DT[, head(.SD,n), by=grp]` and `tail` are now optimized when `n>1`, [#5060](https://github.com/Rdatatable/data.table/issues/5060) [#523](https://github.com/Rdatatable/data.table/issues/523#issuecomment-162934391). `n==1` was already optimized. Thanks to Jan Gorecki and Michael Young for requesting, and Benjamin Schwendinger for the PR. + ## BUG FIXES 1. `by=.EACHI` when `i` is keyed but `on=` different columns than `i`'s key could create an invalidly keyed result, [#4603](https://github.com/Rdatatable/data.table/issues/4603) [#4911](https://github.com/Rdatatable/data.table/issues/4911). Thanks to @myoung3 and @adamaltmejd for reporting, and @ColeMiller1 for the PR. An invalid key is where a `data.table` is marked as sorted by the key columns but the data is not sorted by those columns, leading to incorrect results from subsequent queries. diff --git a/R/data.table.R b/R/data.table.R index 96e9547fc7..4dfa9c276a 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -809,8 +809,8 @@ replace_dot_alias = function(e) { # when the 'by' expression includes get/mget/eval, all.vars cannot be trusted to infer all used columns, #4981 allbyvars = NULL else - allbyvars = intersect(all.vars(bysub), names_x) - + allbyvars = intersect(all.vars(bysub), names_x) + orderedirows = .Call(CisOrderedSubset, irows, nrow(x)) # TRUE when irows is NULL (i.e. no i clause). Similar but better than is.sorted(f__) bysameorder = byindex = FALSE if (!bysub %iscall% ":" && ##Fix #4285 @@ -1740,13 +1740,13 @@ replace_dot_alias = function(e) { # is.symbol() is for #1369, #1974 and #2949 if (!(is.call(q) && is.symbol(q[[1L]]) && is.symbol(q[[2L]]) && (q1 <- q[[1L]]) %chin% gfuns)) return(FALSE) if (!(q2 <- q[[2L]]) %chin% names(SDenv$.SDall) && q2 != ".I") return(FALSE) # 875 - if ((length(q)==2L || (!is.null(names(q)) && startsWith(names(q)[3L], "na"))) && (!q1 %chin% c("head","tail"))) return(TRUE) + if ((length(q)==2L || (!is.null(names(q)) && startsWith(names(q)[3L], "na")))) return(TRUE) # ^^ base::startWith errors on NULL unfortunately # head-tail uses default value n=6 which as of now should not go gforce ... ^^ # otherwise there must be three arguments, and only in two cases: # 1) head/tail(x, 1) or 2) x[n], n>0 length(q)==3L && length(q3 <- q[[3L]])==1L && is.numeric(q3) && - ( (q1 %chin% c("head", "tail") && q3==1L) || ((q1 == "[" || (q1 == "[[" && eval(call('is.atomic', q[[2L]]), envir=x))) && q3>0L) ) + ( (q1 %chin% c("head", "tail")) || ((q1 == "[" || (q1 == "[[" && eval(call('is.atomic', q[[2L]]), envir=x))) && q3>0L) ) } if (jsub[[1L]]=="list") { GForce = TRUE @@ -1762,6 +1762,8 @@ replace_dot_alias = function(e) { if (length(jsub[[ii]])==3L) jsub[[ii]][[3L]] = eval(jsub[[ii]][[3L]], parent.frame()) # tests 1187.2 & 1187.4 } else { + # adding argument to ghead/gtail if none is supplied to g-optimized head/tail + if (length(jsub) == 2L && jsub[[1L]] %chin% c("head", "tail")) jsub[["n"]] = 6L jsub[[1L]] = as.name(paste0("g", jsub[[1L]])) if (length(jsub)==3L) jsub[[3L]] = eval(jsub[[3L]], parent.frame()) # tests 1187.3 & 1187.5 } @@ -1841,6 +1843,25 @@ replace_dot_alias = function(e) { ans = gforce(thisEnv, jsub, o__, f__, len__, irows) # irows needed for #971. gi = if (length(o__)) o__[f__] else f__ g = lapply(grpcols, function(i) groups[[i]][gi]) + + # adding ghead/gtail(n) support for n > 1 #5060 #523 + q3 = 0 + if (!is.symbol(jsub)) { + headTail_arg = function(q) { + if (length(q)==3L && length(q3 <- q[[3L]])==1L && is.numeric(q3) && + (q1 <- q[[1L]]) %chin% c("ghead", "gtail") && q3!=1) q3 + else 0 + } + if (jsub[[1L]] == "list"){ + q3 = max(sapply(jsub, headTail_arg)) + } else if (length(jsub)==3L) { + q3 = headTail_arg(jsub) + } + } + if (q3 > 0) { + grplens = pmin.int(q3, len__) + g = lapply(g, rep.int, times=grplens) + } ans = c(g, ans) } else { ans = .Call(Cdogroups, x, xcols, groups, grpcols, jiscols, xjiscols, grporder, o__, f__, len__, jsub, SDenv, cols, newnames, !missing(on), verbose) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 4d782e4e92..32b16e471f 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -8116,21 +8116,36 @@ test(1579.18, dt[, tail(.SD,1L), by=x], dt[, utils::tail(.SD,1L), by=x]) test(1579.19, dt[, tail(.SD,1L), by=x], dt[, utils::tail(.SD,1L), by=x]) test(1579.20, dt[, tail(.SD,1L), keyby=x], dt[, utils::tail(.SD,1L), keyby=x]) test(1579.21, dt[, tail(.SD,1L), keyby=x], dt[, utils::tail(.SD,1L), keyby=x]) -# GForce _doesn't_ work when n > 1 -test(1579.22, dt[ , tail(.SD, 2), by = x, verbose = TRUE], output = 'GForce FALSE') +# 1579.22 tested gtail with n>1; now 1579.4+ below mysub <- function(x, n) x[n] -test(1579.23, dt[, .SD[2], by=x], dt[, mysub(.SD,2), by=x]) -test(1579.24, dt[, .SD[2], by=x], dt[, mysub(.SD,2), by=x]) -test(1579.25, dt[, .SD[2], keyby=x], dt[, mysub(.SD,2), keyby=x]) -test(1579.26, dt[, .SD[2], keyby=x], dt[, mysub(.SD,2), keyby=x]) -test(1579.27, dt[, .SD[2L], by=x], dt[, mysub(.SD,2L), by=x]) -test(1579.28, dt[, .SD[2L], by=x], dt[, mysub(.SD,2L), by=x]) -test(1579.29, dt[, .SD[2L], keyby=x], dt[, mysub(.SD,2L), keyby=x]) -test(1579.30, dt[, .SD[2L], keyby=x], dt[, mysub(.SD,2L), keyby=x]) - -ans = capture.output(dt[, .SD[2], by=x, verbose=TRUE]) -test(1579.31, any(grepl("GForce optimized", ans)), TRUE) +test(1579.23, dt[, .SD[2], by=x, verbose=TRUE], dt[, mysub(.SD,2), by=x], output="GForce optimized.*g[[]") +test(1579.24, dt[, .SD[2], keyby=x], dt[, mysub(.SD,2), keyby=x]) +test(1579.25, dt[, .SD[2L], by=x], dt[, mysub(.SD,2L), by=x]) +test(1579.26, dt[, .SD[2L], keyby=x], dt[, mysub(.SD,2L), keyby=x]) +test(1579.27, dt[, .SD[15], by=x], dt[, mysub(.SD,15), by=x]) # tests 15 > grpsize and that NA is correct including for integer64 +test(1579.28, dt[, .SD[15], keyby=x], dt[, mysub(.SD,15), keyby=x]) + +# gforce head/tail for n>1, #5060 +set.seed(99) +DT = data.table(x = sample(letters[1:5], 20, TRUE), + y = rep.int(1:2, 10), # to test 2 grouping columns get rep'd properly + i = sample(c(-2L,0L,3L,NA), 20, TRUE), + d = sample(c(1.2,-3.4,5.6,NA), 20, TRUE), + s = sample(c("foo","bar",NA), 20, TRUE), + l = sample(list(1:3, mean, letters[4:5], NULL), 20, replace=TRUE)) +if (test_bit64) DT[, i64:=sample(as.integer64(c(-2200000000,+2400000000,NA)), 20, TRUE)] +options(datatable.optimize=2L) +test(1579.401, DT[, .N, by=x]$N, INT(4,6,5,2,3)) # the smallest group is 2, so n=5 tests n constrained to grpsize +test(1579.402, DT[, head(.SD,2), by=x, verbose=TRUE], DT[, utils::head(.SD,2), by=x], output="optimized.*ghead") +test(1579.403, DT[, head(.SD,2), keyby=x, verbose=TRUE], DT[, utils::head(.SD,2), keyby=x], output="optimized.*ghead") +test(1579.404, DT[, head(.SD,5), by=x, verbose=TRUE], DT[, utils::head(.SD,5), by=x], output="optimized.*ghead") +test(1579.405, DT[, head(.SD,5), keyby=x, verbose=TRUE], DT[, utils::head(.SD,5), keyby=x], output="optimized.*ghead") +test(1579.406, DT[, tail(.SD,2), by=x, verbose=TRUE], DT[, utils::tail(.SD,2), by=x], output="optimized.*gtail") +test(1579.407, DT[, tail(.SD,2), keyby=x, verbose=TRUE], DT[, utils::tail(.SD,2), keyby=x], output="optimized.*gtail") +test(1579.408, DT[, tail(.SD,5), by=x, verbose=TRUE], DT[, utils::tail(.SD,5), by=x], output="optimized.*gtail") +test(1579.409, DT[, tail(.SD,5), keyby=x, verbose=TRUE], DT[, utils::tail(.SD,5), keyby=x], output="optimized.*gtail") +test(1579.410, DT[, tail(.SD,2), by=.(x,y), verbose=TRUE], DT[, utils::tail(.SD,2), by=.(x,y)], output="optimized.*gtail") options(datatable.optimize = Inf) @@ -14695,11 +14710,11 @@ DT = data.table(a=c(rep(1L, 7L), rep(2L, 5L)), b=1:12, d=12:1) test(2018.1, DT[, head(.SD), a, verbose=TRUE], data.table(a=c(rep(1L, 6L), rep(2L, 5L)), b=c(1:6, 8:12), d=c(12:7, 5:1)), output=c("lapply optimization changed j from 'head(.SD)' to 'list(head(b, n = 6L), head(d, n = 6L))'", - "GForce is on, left j unchanged")) + "GForce optimized j to 'list(ghead(b, n = 6L), ghead(d, n = 6L))'")) test(2018.2, DT[, head(b), a, verbose=TRUE], data.table(a=c(rep(1L, 6L), rep(2L, 5L)), V1=c(1:6, 8:12)), output=c("lapply optimization is on, j unchanged as 'head(b)'", - "GForce is on, left j unchanged")) + "GForce optimized j to 'ghead(b, n = 6L)'")) test(2018.3, DT[, tail(.SD), a], data.table(a=c(rep(1L, 6L), rep(2L, 5L)), b=c(2:7, 8:12), d=c(11:6, 5:1))) test(2018.4, DT[, tail(b), a], data.table(a=c(rep(1L, 6L), rep(2L, 5L)), V1=c(2:7, 8:12))) # gforce tests coverage diff --git a/src/gsumm.c b/src/gsumm.c index f806b1e3c8..7470f9f527 100644 --- a/src/gsumm.c +++ b/src/gsumm.c @@ -900,81 +900,72 @@ SEXP gmedian(SEXP x, SEXP narmArg) { return ans; } -static SEXP gfirstlast(SEXP x, const bool first, const int w) { +static SEXP gfirstlast(SEXP x, const bool first, const int w, const bool headw) { // w: which item (1 other than for gnthvalue when could be >1) + // headw: select 1:w of each group when first=true, and (n-w+1):n when first=false (i.e. tail) const bool nosubset = irowslen == -1; + const bool issorted = !isunsorted; // make a const-bool for use inside loops const int n = nosubset ? length(x) : irowslen; - SEXP ans; if (nrow != n) error(_("nrow [%d] != length(x) [%d] in %s"), nrow, n, first?"gfirst":"glast"); - const bool gnth = w>1; // const bool to avoid fetching grpsize[i] when not needed - switch(TYPEOF(x)) { - case LGLSXP: { - const int *ix = LOGICAL(x); - ans = PROTECT(allocVector(LGLSXP, ngrp)); - int *ians = LOGICAL(ans); - for (int i=0; igrpsize[i]) { ians[i]=NA_LOGICAL; continue; } - int k = first ? ff[i]+w-2 : ff[i]+grpsize[i]-w-1; - if (isunsorted) k = oo[k]-1; - ians[i] = nosubset ? ix[k] : (irows[k]==NA_INTEGER ? NA_LOGICAL : ix[irows[k]-1]); - } - } - break; - case INTSXP: { - const int *ix = INTEGER(x); - ans = PROTECT(allocVector(INTSXP, ngrp)); - int *ians = INTEGER(ans); + if (w==1 && headw) error(_("Internal error: gfirstlast headw should only be true when w>1")); + int anslen = ngrp; + if (headw) { + anslen = 0; for (int i=0; igrpsize[i]) { ians[i]=NA_INTEGER; continue; } - int k = first ? ff[i]+w-2 : ff[i]+grpsize[i]-w-1; - if (isunsorted) k = oo[k]-1; - ians[i] = nosubset ? ix[k] : (irows[k]==NA_INTEGER ? NA_INTEGER : ix[irows[k]-1]); + anslen += MIN(w, grpsize[i]); } } - break; - case REALSXP: { - const double *dx = REAL(x); - ans = PROTECT(allocVector(REALSXP, ngrp)); - double *dans = REAL(ans); - for (int i=0; igrpsize[i]) { dans[i]=NA_REAL; continue; } - int k = first ? ff[i]+w-2 : ff[i]+grpsize[i]-w-1; - if (isunsorted) k = oo[k]-1; - dans[i] = nosubset ? dx[k] : (irows[k]==NA_INTEGER ? NA_REAL : dx[irows[k]-1]); - } + SEXP ans = PROTECT(allocVector(TYPEOF(x), anslen)); + int ansi = 0; + #define DO(CTYPE, RTYPE, RNA, ASSIGN) { \ + const CTYPE *xd = (const CTYPE *)RTYPE(x); \ + if (headw) { \ + /* returning more than 1 per group; w>1 */ \ + for (int i=0; i1 && first) { \ + /* gnthvalue */ \ + for (int i=0; igrpn) { const CTYPE val=RNA; ASSIGN; continue; } \ + const int j = ff[i]-1+w-1; \ + const int k = issorted ? j : oo[j]-1; \ + const CTYPE val = nosubset ? xd[k] : (irows[k]==NA_INTEGER ? RNA : xd[irows[k]-1]); \ + ASSIGN; \ + } \ + } else { \ + /* w>1 && !first not supported because -i in R means everything-but-i and gnthvalue */ \ + /* currently takes n>0 only. However, we could still support n'th from the end, somehow */ \ + error(_("Internal error: unanticipated case in gfirstlast first=%d w=%d headw=%d"), \ + first, w, headw); \ + } \ } - break; - case CPLXSXP: { - const Rcomplex *dx = COMPLEX(x); - ans = PROTECT(allocVector(CPLXSXP, ngrp)); - Rcomplex *dans = COMPLEX(ans); - for (int i=0; igrpsize[i]) { dans[i]=NA_CPLX; continue; } - int k = first ? ff[i]+w-2 : ff[i]+grpsize[i]-w-1; - if (isunsorted) k = oo[k]-1; - dans[i] = nosubset ? dx[k] : (irows[k]==NA_INTEGER ? NA_CPLX : dx[irows[k]-1]); - } - } break; - case STRSXP: { - const SEXP *sx = STRING_PTR(x); - ans = PROTECT(allocVector(STRSXP, ngrp)); - for (int i=0; igrpsize[i]) { SET_STRING_ELT(ans, i, NA_STRING); continue; } - int k = first ? ff[i]+w-2 : ff[i]+grpsize[i]-w-1; - if (isunsorted) k = oo[k]-1; - SET_STRING_ELT(ans, i, nosubset ? sx[k] : (irows[k]==NA_INTEGER ? NA_STRING : sx[irows[k]-1])); - } - } break; - case VECSXP: { - const SEXP *vx = SEXPPTR_RO(x); - ans = PROTECT(allocVector(VECSXP, ngrp)); - for (int i=0; igrpsize[i]) { SET_VECTOR_ELT(ans, i, ScalarLogical(NA_LOGICAL)); continue; } - int k = first ? ff[i]+w-2 : ff[i]+grpsize[i]-w-1; - if (isunsorted) k = oo[k]-1; - SET_VECTOR_ELT(ans, i, nosubset ? vx[k] : (irows[k]==NA_INTEGER ? ScalarLogical(NA_LOGICAL) : vx[irows[k]-1])); - } - } break; + switch(TYPEOF(x)) { + case LGLSXP: { int *ansd=LOGICAL(ans); DO(int, LOGICAL, NA_LOGICAL, ansd[ansi++]=val) } break; + case INTSXP: { int *ansd=INTEGER(ans); DO(int, INTEGER, NA_INTEGER, ansd[ansi++]=val) } break; + case REALSXP: if (INHERITS(x, char_integer64)) { + int64_t *ansd=(int64_t *)REAL(ans); DO(int64_t, REAL, NA_INTEGER64, ansd[ansi++]=val) } + else { double *ansd=REAL(ans); DO(double, REAL, NA_REAL, ansd[ansi++]=val) } break; + case CPLXSXP: { Rcomplex *ansd=COMPLEX(ans); DO(Rcomplex, COMPLEX, NA_CPLX, ansd[ansi++]=val) } break; + case STRSXP: DO(SEXP, STRING_PTR, NA_STRING, SET_STRING_ELT(ans,ansi++,val)) break; + case VECSXP: DO(SEXP, SEXPPTR_RO, ScalarLogical(NA_LOGICAL), SET_VECTOR_ELT(ans,ansi++,val)) break; default: error(_("Type '%s' not supported by GForce head/tail/first/last/`[`. Either add the prefix utils::head(.) or turn off GForce optimization using options(datatable.optimize=1)"), type2char(TYPEOF(x))); } @@ -984,26 +975,28 @@ static SEXP gfirstlast(SEXP x, const bool first, const int w) { } SEXP glast(SEXP x) { - return gfirstlast(x, false, 1); + return gfirstlast(x, false, 1, false); } SEXP gfirst(SEXP x) { - return gfirstlast(x, true, 1); + return gfirstlast(x, true, 1, false); } -SEXP gtail(SEXP x, SEXP valArg) { - if (!isInteger(valArg) || LENGTH(valArg)!=1 || INTEGER(valArg)[0]!=1) error(_("Internal error, gtail is only implemented for n=1. This should have been caught before. please report to data.table issue tracker.")); // # nocov - return gfirstlast(x, false, 1); +SEXP gtail(SEXP x, SEXP nArg) { + if (!isInteger(nArg) || LENGTH(nArg)!=1 || INTEGER(nArg)[0]<1) error(_("Internal error, gtail is only implemented for n>0. This should have been caught before. please report to data.table issue tracker.")); // # nocov + const int n=INTEGER(nArg)[0]; + return n==1 ? glast(x) : gfirstlast(x, false, n, true); } -SEXP ghead(SEXP x, SEXP valArg) { - if (!isInteger(valArg) || LENGTH(valArg)!=1 || INTEGER(valArg)[0]!=1) error(_("Internal error, ghead is only implemented for n=1. This should have been caught before. please report to data.table issue tracker.")); // # nocov - return gfirstlast(x, true, 1); +SEXP ghead(SEXP x, SEXP nArg) { + if (!isInteger(nArg) || LENGTH(nArg)!=1 || INTEGER(nArg)[0]<1) error(_("Internal error, gtail is only implemented for n>0. This should have been caught before. please report to data.table issue tracker.")); // # nocov + const int n=INTEGER(nArg)[0]; + return n==1 ? gfirst(x) : gfirstlast(x, true, n, true); } -SEXP gnthvalue(SEXP x, SEXP valArg) { - if (!isInteger(valArg) || LENGTH(valArg)!=1 || INTEGER(valArg)[0]<=0) error(_("Internal error, `g[` (gnthvalue) is only implemented single value subsets with positive index, e.g., .SD[2]. This should have been caught before. please report to data.table issue tracker.")); // # nocov - return gfirstlast(x, true, INTEGER(valArg)[0]); +SEXP gnthvalue(SEXP x, SEXP nArg) { + if (!isInteger(nArg) || LENGTH(nArg)!=1 || INTEGER(nArg)[0]<1) error(_("Internal error, `g[` (gnthvalue) is only implemented single value subsets with positive index, e.g., .SD[2]. This should have been caught before. please report to data.table issue tracker.")); // # nocov + return gfirstlast(x, true, INTEGER(nArg)[0], false); } // TODO: gwhich.min, gwhich.max