Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@

#### BUG FIXES

1. `first`, `last`, `head` and `tail` by group no longer error in some cases, [#2030](https://github.com/Rdatatable/data.table/issues/2030) [#3462](https://github.com/Rdatatable/data.table/issues/3462). Thanks to @franknarf1 for reporting.

#### NOTES

1. `rbindlist`'s `use.names="check"` now emits its message for automatic column names (`"V[0-9]+"`) too, [#3484](https://github.com/Rdatatable/data.table/pull/3484). See news item 5 of v1.12.2 below.
Expand Down
11 changes: 8 additions & 3 deletions R/data.table.R
Original file line number Diff line number Diff line change
Expand Up @@ -1595,7 +1595,12 @@ replace_dot_alias <- function(e) {
jvnames = ansvarsnew
}
} else if (length(as.character(jsub[[1L]])) == 1L) { # Else expect problems with <jsub[[1L]] == >
if (length(jsub) == 3L && (jsub[[1L]] == "[" || jsub[[1L]] == "head" || jsub[[1L]] == "tail") && jsub[[2L]] == ".SD" && (is.numeric(jsub[[3L]]) || jsub[[3L]] == ".N") ) {
subopt = length(jsub) == 3L && jsub[[1L]] == "[" && (is.numeric(jsub[[3L]]) || jsub[[3L]] == ".N")
headopt = jsub[[1L]] == "head" || jsub[[1L]] == "tail"
firstopt = jsub[[1L]] == "first" || jsub[[1L]] == "last" # fix for #2030
if ((length(jsub) >= 2L && jsub[[2L]] == ".SD") &&
(subopt || headopt || firstopt)) {
if (headopt && length(jsub)==2L) jsub[["n"]] = 6L # head-tail n=6 when missing #3462
# optimise .SD[1] or .SD[2L]. Not sure how to test .SD[a] as to whether a is numeric/integer or a data.table, yet.
jsub = as.call(c(quote(list), lapply(ansvarsnew, function(x) { jsub[[2L]] = as.name(x); jsub })))
jvnames = ansvarsnew
Expand Down Expand Up @@ -1694,7 +1699,7 @@ replace_dot_alias <- function(e) {
else
cat("lapply optimization is on, j unchanged as '",deparse(jsub,width.cutoff=200L),"'\n",sep="")
}
dotN <- function(x) if (is.name(x) && x == ".N") TRUE else FALSE # For #5760
dotN <- function(x) is.name(x) && x == ".N" # For #5760
# FR #971, GForce kicks in on all subsets, no joins yet. Although joins could work with
# nomatch=0L even now.. but not switching it on yet, will deal it separately.
if (getOption("datatable.optimize")>=2 && !is.data.table(i) && !byjoin && length(f__) && !length(lhs)) {
Expand All @@ -1712,7 +1717,7 @@ replace_dot_alias <- function(e) {
# Need is.symbol() check. See #1369, #1974 or #2949 issues and explanation below by searching for one of these issues.
cond = is.call(q) && is.symbol(q[[1]]) && (q1c <- as.character(q[[1]])) %chin% gfuns && !is.call(q[[2L]])
# run GForce for simple f(x) calls and f(x, na.rm = TRUE)-like calls
ans = cond && (length(q)==2L || identical("na",substring(names(q)[3L], 1L, 2L)))
ans = cond && (length(q)==2L || identical("na",substring(names(q)[3L], 1L, 2L))) && (!q1c %chin% c("head","tail")) # head-tail uses default value n=6 which as of now should not go gforce
if (identical(ans, TRUE)) return(ans)
# otherwise there must be three arguments, and only in two cases --
# 1) head/tail(x, 1) or 2) x[n], n>0
Expand Down
70 changes: 58 additions & 12 deletions inst/tests/tests.Rraw
Original file line number Diff line number Diff line change
Expand Up @@ -13901,30 +13901,76 @@ test(2013.3, DT[2], error="Column 2 ['b'] is length 4 but column 1 is length 3;
## new fread keepLeadingZeros parameter in v1.12.2
# leading zeros in both integer and float numbers are converted to character when keepLeadingZeros=TRUE
test_data_single <- "0, 00, 01, 00010, 002.01\n"
test(1978.1, fread(test_data_single), data.table(0L, 0L, 1L, 10L, 2.01))
test(1978.2, fread(test_data_single, keepLeadingZeros = FALSE), data.table(0L, 0L, 1L, 10L, 2.01))
test(1978.3, fread(test_data_single, keepLeadingZeros = TRUE), data.table(0L, "00","01","00010","002.01"))
test(2014.1, fread(test_data_single), data.table(0L, 0L, 1L, 10L, 2.01))
test(2014.2, fread(test_data_single, keepLeadingZeros = FALSE), data.table(0L, 0L, 1L, 10L, 2.01))
test(2014.3, fread(test_data_single, keepLeadingZeros = TRUE), data.table(0L, "00","01","00010","002.01"))
# converts whole column to character when keepLeadingZeros = TRUE and at least 1 value contains a leading zero
test_data_mult <- paste0(c(sample(1:100),"0010",sample(1:100)), collapse="\n")
test(1978.4, class(fread(test_data_mult, keepLeadingZeros = TRUE)[[1]]), "character")
test(1978.5, class(fread(test_data_mult, keepLeadingZeros = FALSE)[[1]]), "integer")
test(2014.4, class(fread(test_data_mult, keepLeadingZeros = TRUE)[[1]]), "character")
test(2014.5, class(fread(test_data_mult, keepLeadingZeros = FALSE)[[1]]), "integer")

# rbindlist should drop NA from levels of source factors, relied on by package emil
test(1979, levels(rbindlist( list( data.frame(a=factor("a",levels=c("a",NA),exclude=NULL)) ))$a), "a") # the NA level should not be retained
test(2015, levels(rbindlist( list( data.frame(a=factor("a",levels=c("a",NA),exclude=NULL)) ))$a), "a") # the NA level should not be retained

# better save->load->set(<new column>) message, #2996
DT = data.table(a=1:3)
save(list="DT", file=tt<-tempfile())
rm(DT)
name = load(tt)
test(1980.1, name, "DT")
test(1980.2, DT, data.table(a=1:3))
test(1980.3, DT[2,a:=4L], data.table(a=INT(1,4,3))) # no error for := when existing column
test(1980.4, set(DT,3L,1L,5L), data.table(a=INT(1,4,5))) # no error for set() when existing column
test(1980.5, set(DT,2L,"newCol",5L), error="either been loaded from disk.*or constructed manually.*Please run setDT.*alloc.col.*on it first") # just set()
test(1980.6, DT[2,newCol:=6L], data.table(a=INT(1,4,5), newCol=INT(NA,6L,NA))) # := ok (it changes DT in caller)
test(2016.1, name, "DT")
test(2016.2, DT, data.table(a=1:3))
test(2016.3, DT[2,a:=4L], data.table(a=INT(1,4,3))) # no error for := when existing column
test(2016.4, set(DT,3L,1L,5L), data.table(a=INT(1,4,5))) # no error for set() when existing column
test(2016.5, set(DT,2L,"newCol",5L), error="either been loaded from disk.*or constructed manually.*Please run setDT.*alloc.col.*on it first") # just set()
test(2016.6, DT[2,newCol:=6L], data.table(a=INT(1,4,5), newCol=INT(NA,6L,NA))) # := ok (it changes DT in caller)
unlink(tt)

# gfirst(.SD) throws an error about not using head(.SD, n), but the latter works #2030
DT = data.table(id = c(1L,1L,2L), v = 1:3)
test(2017.1, DT[, first(.SD), by=id, .SDcols="v", verbose=TRUE], data.table(id=1:2, v=c(1L,3L)), output="optimized j to 'list(gfirst(v))'")
test(2017.2, DT[, first(v), by=id, verbose=TRUE], data.table(id=1:2, V1=c(1L,3L)), output="optimized j to 'gfirst(v)'")
test(2017.3, DT[, last(v), by=id, verbose=TRUE], data.table(id=1:2, V1=c(2L,3L)), output="optimized j to 'glast(v)'")
test(2017.4, DT[, v[1L], by=id, verbose=TRUE], data.table(id=1:2, V1=c(1L,3L)), output="optimized j to '`g[`(v, 1L)'")
DT = data.table(id = c(1L,1L,2L), v = 1:3, y = 3:1, z = c(TRUE, TRUE, FALSE), u = c("a","b","c"))
test(2017.5, DT[, first(.SD), by=id, .SDcols=c("v","y","z","u"), verbose=TRUE],
data.table(id=1:2, v=c(1L,3L), y=c(3L,1L), z=c(TRUE,FALSE), u=c("a","c")),
output="optimized j to 'list(gfirst(v), gfirst(y), gfirst(z), gfirst(u))'")
test(2017.6, DT[, last(.SD), by=id, .SDcols=c("v","y","z","u"), verbose=TRUE],
data.table(id=1:2, v=c(2L,3L), y=c(2L,1L), z=c(TRUE,FALSE), u=c("b","c")),
output="optimized j to 'list(glast(v), glast(y), glast(z), glast(u))'")
test(2017.7, DT[, .SD[1L], by=id, .SDcols=c("v","y","z","u"), verbose=TRUE],
data.table(id=1:2, v=c(1L,3L), y=c(3L,1L), z=c(TRUE,FALSE), u=c("a","c")),
output="optimized j to 'list(`g[`(v, 1L), `g[`(y, 1L), `g[`(z, 1L), `g[`(u, 1L))'")
# ghead argument "n" is missing, with no default #3462
DT = data.table(a=c(rep(1L, 7L), rep(2L, 5L)), b=1:12, d=12:1)
test(2018.1, DT[, head(.SD), a, verbose=TRUE],
data.table(a=c(rep(1L, 6L), rep(2L, 5L)), b=c(1:6, 8:12), d=c(12:7, 5:1)),
output=c("lapply optimization changed j from 'head(.SD)' to 'list(head(b, n = 6L), head(d, n = 6L))'",
"GForce is on, left j unchanged"))
test(2018.2, DT[, head(b), a, verbose=TRUE],
data.table(a=c(rep(1L, 6L), rep(2L, 5L)), V1=c(1:6, 8:12)),
output=c("lapply optimization is on, j unchanged as 'head(b)'",
"GForce is on, left j unchanged"))
test(2018.3, DT[, tail(.SD), a], data.table(a=c(rep(1L, 6L), rep(2L, 5L)), b=c(2:7, 8:12), d=c(11:6, 5:1)))
test(2018.4, DT[, tail(b), a], data.table(a=c(rep(1L, 6L), rep(2L, 5L)), V1=c(2:7, 8:12)))
# gforce tests coverage
if (test_bit64) {
DT = data.table(id=c(rep(1L,3), rep(2L, 3)), v=bit64::as.integer64(c(1:3, 4L, 5:6)))
test(2019, DT[2:6, sum(v), id], data.table(id=1:2, V1=bit64::as.integer64(c(5L,15L)))) # gather, case of int64 and irows
}
DT = data.table(id = c(1L,1L,2L), v = c(1i, 2i, 3i))
test(2020.01, DT[, min(v), by=id], error="'complex' not supported by GForce min")
test(2020.02, DT[, max(v), by=id], error="'complex' not supported by GForce max")
test(2020.03, DT[, median(v), by=id], error="'complex' not supported by GForce median")
test(2020.04, DT[, head(v, 1), by=id], error="'complex' not supported by GForce head")
test(2020.05, DT[, tail(v, 1), by=id], error="'complex' not supported by GForce tail")
test(2020.06, DT[, v[1], by=id], error="'complex' not supported by GForce subset")
test(2020.07, DT[, sd(v), by=id], error="'complex' not supported by GForce sd")
test(2020.08, DT[, var(v), by=id], error="'complex' not supported by GForce var")
test(2020.09, DT[, prod(v), by=id], error="'complex' not supported by GForce prod")
DT = data.table(id = c(1L,1L,2L,2L), v = c(1L, 2L, NA, NA))
test(2020.10, DT[, median(v), id], data.table(id=1:2, V1=c(1.5, NA))) # median whole group has NAs


###################################
# Add new tests above this line #
Expand Down
69 changes: 48 additions & 21 deletions src/gsumm.c
Original file line number Diff line number Diff line change
Expand Up @@ -495,8 +495,8 @@ SEXP gmean(SEXP x, SEXP narm)
}
break;
default:
free(s); free(c);
error("Type '%s' not supported by GForce mean (gmean) na.rm=TRUE. Either add the prefix base::mean(.) or turn off GForce optimization using options(datatable.optimize=1)", type2char(TYPEOF(x)));
free(s); free(c); // # nocov because it already stops at gsum, remove nocov if gmean will support a type that gsum wont
error("Type '%s' not supported by GForce mean (gmean) na.rm=TRUE. Either add the prefix base::mean(.) or turn off GForce optimization using options(datatable.optimize=1)", type2char(TYPEOF(x))); // # nocov
}
ans = PROTECT(allocVector(REALSXP, ngrp));
for (int i=0; i<ngrp; i++) {
Expand Down Expand Up @@ -955,32 +955,41 @@ SEXP glast(SEXP x) {
SEXP ans;
if (nrow != n) error("nrow [%d] != length(x) [%d] in gtail", nrow, n);
switch(TYPEOF(x)) {
case LGLSXP:
case LGLSXP: {
const int *ix = LOGICAL(x);
ans = PROTECT(allocVector(LGLSXP, ngrp));
int *ians = LOGICAL(ans);
for (i=0; i<ngrp; i++) {
k = ff[i]+grpsize[i]-2;
if (isunsorted) k = oo[k]-1;
k = (irowslen == -1) ? k : irows[k]-1;
LOGICAL(ans)[i] = LOGICAL(x)[k];
ians[i] = ix[k];
}
}
break;
case INTSXP:
case INTSXP: {
const int *ix = INTEGER(x);
ans = PROTECT(allocVector(INTSXP, ngrp));
int *ians = INTEGER(ans);
for (i=0; i<ngrp; i++) {
k = ff[i]+grpsize[i]-2;
if (isunsorted) k = oo[k]-1;
k = (irowslen == -1) ? k : irows[k]-1;
INTEGER(ans)[i] = INTEGER(x)[k];
ians[i] = ix[k];
}
}
break;
case REALSXP:
case REALSXP: {
const double *dx = REAL(x);
ans = PROTECT(allocVector(REALSXP, ngrp));
double *dans = REAL(ans);
for (i=0; i<ngrp; i++) {
k = ff[i]+grpsize[i]-2;
if (isunsorted) k = oo[k]-1;
k = (irowslen == -1) ? k : irows[k]-1;
REAL(ans)[i] = REAL(x)[k];
dans[i] = dx[k];
}
}
break;
case STRSXP:
ans = PROTECT(allocVector(STRSXP, ngrp));
Expand Down Expand Up @@ -1017,32 +1026,41 @@ SEXP gfirst(SEXP x) {
SEXP ans;
if (nrow != n) error("nrow [%d] != length(x) [%d] in ghead", nrow, n);
switch(TYPEOF(x)) {
case LGLSXP:
case LGLSXP: {
int const *ix = LOGICAL(x);
ans = PROTECT(allocVector(LGLSXP, ngrp));
int *ians = LOGICAL(ans);
for (i=0; i<ngrp; i++) {
k = ff[i]-1;
if (isunsorted) k = oo[k]-1;
k = (irowslen == -1) ? k : irows[k]-1;
LOGICAL(ans)[i] = LOGICAL(x)[k];
ians[i] = ix[k];
}
}
break;
case INTSXP:
case INTSXP: {
const int *ix = INTEGER(x);
ans = PROTECT(allocVector(INTSXP, ngrp));
int *ians = INTEGER(ans);
for (i=0; i<ngrp; i++) {
k = ff[i]-1;
if (isunsorted) k = oo[k]-1;
k = (irowslen == -1) ? k : irows[k]-1;
INTEGER(ans)[i] = INTEGER(x)[k];
ians[i] = ix[k];
}
}
break;
case REALSXP:
case REALSXP: {
const double *dx = REAL(x);
ans = PROTECT(allocVector(REALSXP, ngrp));
double *dans = REAL(ans);
for (i=0; i<ngrp; i++) {
k = ff[i]-1;
if (isunsorted) k = oo[k]-1;
k = (irowslen == -1) ? k : irows[k]-1;
REAL(ans)[i] = REAL(x)[k];
dans[i] = dx[k];
}
}
break;
case STRSXP:
ans = PROTECT(allocVector(STRSXP, ngrp));
Expand Down Expand Up @@ -1088,35 +1106,44 @@ SEXP gnthvalue(SEXP x, SEXP valArg) {
SEXP ans;
if (nrow != n) error("nrow [%d] != length(x) [%d] in ghead", nrow, n);
switch(TYPEOF(x)) {
case LGLSXP:
case LGLSXP: {
const int *ix = LOGICAL(x);
ans = PROTECT(allocVector(LGLSXP, ngrp));
int *ians = LOGICAL(ans);
for (i=0; i<ngrp; i++) {
if (val > grpsize[i]) { LOGICAL(ans)[i] = NA_LOGICAL; continue; }
k = ff[i]+val-2;
if (isunsorted) k = oo[k]-1;
k = (irowslen == -1) ? k : irows[k]-1;
LOGICAL(ans)[i] = LOGICAL(x)[k];
ians[i] = ix[k];
}
}
break;
case INTSXP:
case INTSXP: {
const int *ix = LOGICAL(x);
ans = PROTECT(allocVector(INTSXP, ngrp));
int *ians = LOGICAL(ans);
for (i=0; i<ngrp; i++) {
if (val > grpsize[i]) { INTEGER(ans)[i] = NA_INTEGER; continue; }
k = ff[i]+val-2;
if (isunsorted) k = oo[k]-1;
k = (irowslen == -1) ? k : irows[k]-1;
INTEGER(ans)[i] = INTEGER(x)[k];
ians[i] = ix[k];
}
}
break;
case REALSXP:
case REALSXP: {
const double *dx = REAL(x);
ans = PROTECT(allocVector(REALSXP, ngrp));
double *dans = REAL(ans);
for (i=0; i<ngrp; i++) {
if (val > grpsize[i]) { REAL(ans)[i] = NA_REAL; continue; }
k = ff[i]+val-2;
if (isunsorted) k = oo[k]-1;
k = (irowslen == -1) ? k : irows[k]-1;
REAL(ans)[i] = REAL(x)[k];
dans[i] = dx[k];
}
}
break;
case STRSXP:
ans = PROTECT(allocVector(STRSXP, ngrp));
Expand Down Expand Up @@ -1270,7 +1297,7 @@ SEXP gvarsd1(SEXP x, SEXP narm, Rboolean isSD)
SETLENGTH(sub, maxgrpn);
break;
default:
if (isSD) {
if (!isSD) {
error("Type '%s' not supported by GForce var (gvar). Either add the prefix stats::var(.) or turn off GForce optimization using options(datatable.optimize=1)", type2char(TYPEOF(x)));
} else {
error("Type '%s' not supported by GForce sd (gsd). Either add the prefix stats::sd(.) or turn off GForce optimization using options(datatable.optimize=1)", type2char(TYPEOF(x)));
Expand Down