From 274eea604a3966961da675dc7429e05f1cecc0f8 Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Thu, 19 Aug 2021 21:03:17 -0600 Subject: [PATCH 1/2] rework gminmax --- inst/tests/tests.Rraw | 49 ++++---- src/gsumm.c | 275 +++++++++--------------------------------- 2 files changed, 87 insertions(+), 237 deletions(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 02b9362822..4f73a19085 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -5178,8 +5178,8 @@ test(1313.04, DT[, max(y, na.rm=TRUE), by=x], DT[, base::max(y, na.rm=TRUE), by= DT[x==6, y := INT(NA)] test(1313.05, DT[, min(y), by=x], DT[, base::min(y), by=x]) test(1313.06, DT[, max(y), by=x], DT[, base::max(y), by=x]) -test(1313.07, DT[, min(y, na.rm=TRUE), by=x], data.table(x=1:6, V1=c(-1,4,4,4,-2147483647,Inf)), warning="No non-missing") -test(1313.08, DT[, max(y, na.rm=TRUE), by=x], data.table(x=1:6, V1=c(4,10,10,10,-2147483647,-Inf)), warning="No non-missing") +test(1313.07, DT[, min(y, na.rm=TRUE), by=x], data.table(x=1:6, V1=INT(-1,4,4,4,-2147483647,NA))) +test(1313.08, DT[, max(y, na.rm=TRUE), by=x], data.table(x=1:6, V1=INT(4,10,10,10,-2147483647,NA))) # for numeric DT <- data.table(x=rep(1:6, each=3), y=c(4,-1,0, NA,4,10, 4,NA,10, 4,10,NA, -Inf, NA, NA, Inf, NA, NA)) @@ -5191,8 +5191,8 @@ test(1313.12, DT[, max(y, na.rm=TRUE), by=x], DT[, base::max(y, na.rm=TRUE), by= DT[x==6, y := NA_real_] test(1313.13, DT[, min(y), by=x], DT[, base::min(y), by=x]) test(1313.14, DT[, max(y), by=x], DT[, base::max(y), by=x]) -test(1313.15, DT[, min(y, na.rm=TRUE), by=x], data.table(x=1:6, V1=c(-1,4,4,4,-Inf,Inf)), warning="No non-missing") -test(1313.16, DT[, max(y, na.rm=TRUE), by=x], data.table(x=1:6, V1=c(4,10,10,10,-Inf,-Inf)), warning="No non-missing") +test(1313.15, DT[, min(y, na.rm=TRUE), by=x], data.table(x=1:6, V1=c(-1,4,4,4,-Inf,NA))) +test(1313.16, DT[, max(y, na.rm=TRUE), by=x], data.table(x=1:6, V1=c(4,10,10,10,-Inf,NA))) # for date (attribute check.. especially after issues/689 !!!) DT <- data.table(x = rep(letters[1:2], each=5), y = as.POSIXct('2010-01-01', tz="UTC") + seq(0, 86400*9, 86400)) @@ -5215,8 +5215,8 @@ test(1313.26, DT[, max(y, na.rm=TRUE), by=x], DT[, base::max(y, na.rm=TRUE), by= DT[x==6, y := NA_character_] test(1313.27, DT[, min(y), by=x], DT[, base::min(y), by=x]) test(1313.28, DT[, max(y), by=x], DT[, base::max(y), by=x]) -test(1313.29, DT[, min(y, na.rm=TRUE), by=x], data.table(x=1:7, V1=c("a","a","c","","a",NA,"")), warning="No non-missing") -test(1313.30, DT[, max(y, na.rm=TRUE), by=x], data.table(x=1:7, V1=c("b","a","c","a","c",NA,"c")), warning="No non-missing") +test(1313.29, DT[, min(y, na.rm=TRUE), by=x], data.table(x=1:7, V1=c("a","a","c","","a",NA,""))) +test(1313.30, DT[, max(y, na.rm=TRUE), by=x], data.table(x=1:7, V1=c("b","a","c","a","c",NA,"c"))) # bug 700 - bmerge, roll=TRUE and nomatch=0L when i's key group occurs more than once dt1 <- data.table(structure(list(x = c(7L, 33L), y = structure(c(15912, 15912), class = "Date"), z = c(626550.35284, 7766.385)), .Names = @@ -8220,18 +8220,23 @@ test(1581.19, DT, DT0[ , var := c('A', 'A', 'B')]) # handle NULL value correctly #1429 test(1582, uniqueN(NULL), 0L) -# bug fix #1461 -dt = data.table(x=c(1,1,1,2,2,2,3,3,3,4,4,4,5), y=c(NaN,1,2, 2,NaN,1, NA,NaN,2, NaN,NA,NaN, NaN)) -# make sure gforce is on -options(datatable.optimize=Inf) -ans1 = suppressWarnings(dt[, base::min(y, na.rm=TRUE), by=x]) -ans2 = suppressWarnings(dt[, base::max(y, na.rm=TRUE), by=x]) -test(1583.1, dt[, min(y, na.rm=TRUE), by=x], ans1, warning="No non-missing values found") -test(1583.2, dt[, max(y, na.rm=TRUE), by=x], ans2, warning="No non-missing values found") -ans3 = suppressWarnings(dt[, base::min(y), by=x]) -ans4 = suppressWarnings(dt[, base::max(y), by=x]) -test(1583.3, dt[, min(y), by=x], ans3) -test(1583.4, dt[, max(y), by=x], ans4) +# bug fix #1461 related to NaN not being recognized due to ISNA vs ISNAN at C level +# verbatim test from the original report: +options(datatable.optimize=Inf) # ensure gforce is on +DT = data.table( + C1 = c(rep("A", 4), rep("B",4), rep("C", 4)), + C2 = c(rep("a", 3), rep("b",3), rep("c",3), rep("d",3)), + Val = c(1:5, NaN, NaN, 8,9,10,NaN,12)) +test(1583.1, DT[, .(agg = min(Val, na.rm=TRUE)), by=c('C1', 'C2')], + data.table(C1=c("A","A","B","B","C","C"), + C2=c("a","b","b","c","c","d"), + agg=c(1,4,5,8,9,10))) +# extra test with a size-1 group containing one NaN too +DT = data.table(x=INT(1,1,1,2,2,2,3,3,3,4,4,4,5), y=c(NaN,1,2, 2,NaN,1, NA,NaN,2, NaN,NA,NaN, NaN)) +test(1583.2, DT[, min(y, na.rm=TRUE), by=x], data.table(x=1:5, V1=c(1,1,2,NA,NA))) +test(1583.3, DT[, max(y, na.rm=TRUE), by=x], data.table(x=1:5, V1=c(2,2,2,NA,NA))) +test(1583.4, DT[, min(y), by=x], data.table(x=1:5, V1=c(NaN,NaN,NA,NaN,NaN))) +test(1583.5, DT[, max(y), by=x], data.table(x=1:5, V1=c(NaN,NaN,NA,NaN,NaN))) # Fixed a minor bug in fread when blank.lines.skip=TRUE f1 <- function(x, f=TRUE, b=FALSE) fread(x, fill=f, blank.lines.skip=b, data.table=FALSE, logical01=FALSE) @@ -14700,8 +14705,8 @@ if (test_bit64) { test(2019, DT[2:6, sum(v), id], data.table(id=1:2, V1=bit64::as.integer64(c(5L,15L)))) # gather, case of int64 and irows } DT = data.table(id = c(1L,1L,2L), v = as.raw(0:2)) -test(2020.01, DT[, min(v), by=id], error="'raw' not supported by GForce min") -test(2020.02, DT[, max(v), by=id], error="'raw' not supported by GForce max") +test(2020.01, DT[, min(v), by=id], error="'raw' not supported by GForce min/max") +test(2020.02, DT[, max(v), by=id], error="'raw' not supported by GForce min/max") test(2020.03, DT[, median(v), by=id], error="'raw' not supported by GForce median") test(2020.04, DT[, head(v, 1), by=id], error="'raw' not supported by GForce head") test(2020.05, DT[, tail(v, 1), by=id], error="'raw' not supported by GForce tail") @@ -15820,8 +15825,8 @@ test(2065.7, DT[1L, z_sum := 1i][1L, z_sum], 1i) # GForce for complex columns, part of #3690 DT = data.table(id=c(1L,1L,2L), v=c(1i, 2i, 3i)) -test(2066.01, DT[, min(v), by=id], error="'complex' has no well-defined min") -test(2066.02, DT[, max(v), by=id], error="'complex' has no well-defined max") +test(2066.01, DT[, min(v), by=id], error="'complex' has no well-defined min/max") +test(2066.02, DT[, max(v), by=id], error="'complex' has no well-defined min/max") test(2066.03, DT[, head(v, 1), by=id], data.table(id=1:2, V1=c(1, 3)*1i)) test(2066.04, DT[, tail(v, 1), by=id], data.table(id=1:2, V1=(2:3)*1i)) test(2066.05, DT[, v[2], by=id], data.table(id = 1:2, V1=c(2i, NA))) diff --git a/src/gsumm.c b/src/gsumm.c index 651f1c3385..b925b35fa6 100644 --- a/src/gsumm.c +++ b/src/gsumm.c @@ -713,267 +713,112 @@ SEXP gmean(SEXP x, SEXP narmArg) return(ans); } -// gmin -SEXP gmin(SEXP x, SEXP narm) +static SEXP gminmax(SEXP x, SEXP narm, const bool min) { if (!isLogical(narm) || LENGTH(narm)!=1 || LOGICAL(narm)[0]==NA_LOGICAL) error(_("na.rm must be TRUE or FALSE")); - if (!isVectorAtomic(x)) error(_("GForce min can only be applied to columns, not .SD or similar. To find min of all items in a list such as .SD, either add the prefix base::min(.SD) or turn off GForce optimization using options(datatable.optimize=1). More likely, you may be looking for 'DT[,lapply(.SD,min),by=,.SDcols=]'")); - if (inherits(x, "factor") && !inherits(x, "ordered")) error(_("min is not meaningful for factors.")); + if (!isVectorAtomic(x)) error(_("GForce min/max can only be applied to columns, not .SD or similar. To find min/max of all items in a list such as .SD, either add the prefix base::min(.SD) or turn off GForce optimization using options(datatable.optimize=1). More likely, you may be looking for 'DT[,lapply(.SD,min),by=,.SDcols=]'")); + if (inherits(x, "factor") && !inherits(x, "ordered")) error(_("min/max is not meaningful for factors.")); const int n = (irowslen == -1) ? length(x) : irowslen; //clock_t start = clock(); SEXP ans; - if (nrow != n) error(_("nrow [%d] != length(x) [%d] in %s"), nrow, n, "gmin"); - int protecti=0; + if (nrow != n) error(_("nrow [%d] != length(x) [%d] in %s"), nrow, n, "gmin"); + // GForce guarantees each group has at least one value; i.e. we don't need to consider length-0 per group here switch(TYPEOF(x)) { - case LGLSXP: case INTSXP: - ans = PROTECT(allocVector(INTSXP, ngrp)); protecti++; + case LGLSXP: case INTSXP: { + ans = PROTECT(allocVector(INTSXP, ngrp)); + int *ansd = INTEGER(ans); + const int *xd = INTEGER(x); if (!LOGICAL(narm)[0]) { - for (int i=0; i Date: Thu, 19 Aug 2021 23:06:27 -0600 Subject: [PATCH 2/2] add news item --- NEWS.md | 44 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) diff --git a/NEWS.md b/NEWS.md index 769b98dafa..5e666241af 100644 --- a/NEWS.md +++ b/NEWS.md @@ -185,6 +185,50 @@ 34. `na.omit(DT)` did not remove `NA` in `nanotime` columns, [#4744](https://github.com/Rdatatable/data.table/issues/4744). Thanks Jean-Mathieu Vermosen for reporting, and Michael Chirico for the PR. +35. `DT[, min(intCol, na.rm=TRUE), by=grp]` would return `Inf` for any groups containing all NAs, with a type change from `integer` to `numeric` to hold the `Inf`, and with warning. Now `NA` is returned for such all-NA groups, without warning or type change. This is almost-surely less surprising, more convenient, consistent, and efficient. There was no user request for this, likely because our desire to be consistent with base R in this regard was known (base R's `min(x, na.rm=TRUE)` returns `Inf` with warning for all-NA input). Matt Dowle made this change when reworking internals, [#5105](https://github.com/Rdatatable/data.table/pull/5105). The old behavior seemed so bad, and since there was a warning too, it seemed more appropriate to treat it as a bug. + + ```R + DT + # A B + # + # 1: a 1 + # 2: a NA + # 3: b 2 + # 4: b NA + + DT[, min(B,na.rm=TRUE), by=A] # no change in behavior (no all-NA groups yet) + # A V1 + # + # 1: a 1 + # 2: b 2 + + DT[3, B:=NA] # make an all-NA group + DT + # A B + # + # 1: a 1 + # 2: a NA + # 3: b NA + # 4: b NA + + DT[, min(B,na.rm=TRUE), by=A] # old result + # A V1 + # # V1's type changed to numeric (inconsistent) + # 1: a 1 + # 2: b Inf # Inf surprising + # Warning message: # warning inconvenient + # In gmin(B, na.rm = TRUE) : + # No non-missing values found in at least one group. Coercing to numeric + # type and returning 'Inf' for such groups to be consistent with base + + DT[, min(B,na.rm=TRUE), by=A] # new result + # A V1 + # # V1's type remains integer (consistent) + # 1: a 1 + # 2: b NA # NA because there are no non-NA, naturally + # no inconvenient warning + ``` + ## NOTES 1. New feature 29 in v1.12.4 (Oct 2019) introduced zero-copy coercion. Our thinking is that requiring you to get the type right in the case of `0` (type double) vs `0L` (type integer) is too inconvenient for you the user. So such coercions happen in `data.table` automatically without warning. Thanks to zero-copy coercion there is no speed penalty, even when calling `set()` many times in a loop, so there's no speed penalty to warn you about either. However, we believe that assigning a character value such as `"2"` into an integer column is more likely to be a user mistake that you would like to be warned about. The type difference (character vs integer) may be the only clue that you have selected the wrong column, or typed the wrong variable to be assigned to that column. For this reason we view character to numeric-like coercion differently and will warn about it. If it is correct, then the warning is intended to nudge you to wrap the RHS with `as.()` so that it is clear to readers of your code that a coercion from character to that type is intended. For example :