Merged
44 changes: 44 additions & 0 deletions NEWS.md
@@ -185,6 +185,50 @@

34. `na.omit(DT)` did not remove `NA` in `nanotime` columns, [#4744](https://github.com/Rdatatable/data.table/issues/4744). Thanks Jean-Mathieu Vermosen for reporting, and Michael Chirico for the PR.

35. `DT[, min(intCol, na.rm=TRUE), by=grp]` returned `Inf` for any all-`NA` group, changing the result type from `integer` to `numeric` to hold the `Inf`, and issuing a warning. Now `NA` is returned for such all-`NA` groups, with no warning and no type change. This is almost surely less surprising, more convenient, consistent, and efficient. There was no user request for this change, likely because our desire to be consistent with base R in this regard was known (base R's `min(x, na.rm=TRUE)` returns `Inf` with a warning for all-`NA` input). Matt Dowle made this change when reworking internals, [#5105](https://github.com/Rdatatable/data.table/pull/5105). Since the old behavior was so surprising, and already produced a warning, it seemed more appropriate to treat it as a bug than to preserve it.

```R
DT
# A B
# <char> <int>
# 1: a 1
# 2: a NA
# 3: b 2
# 4: b NA

DT[, min(B,na.rm=TRUE), by=A] # no change in behavior (no all-NA groups yet)
# A V1
# <char> <int>
# 1: a 1
# 2: b 2

DT[3, B:=NA] # make an all-NA group
DT
# A B
# <char> <int>
# 1: a 1
# 2: a NA
# 3: b NA
# 4: b NA

DT[, min(B,na.rm=TRUE), by=A] # old result
# A V1
# <char> <num> # V1's type changed to numeric (inconsistent)
# 1: a 1
# 2: b Inf # Inf surprising
# Warning message: # warning inconvenient
# In gmin(B, na.rm = TRUE) :
# No non-missing values found in at least one group. Coercing to numeric
# type and returning 'Inf' for such groups to be consistent with base

DT[, min(B,na.rm=TRUE), by=A] # new result
# A V1
# <char> <int> # V1's type remains integer (consistent)
# 1: a 1
# 2: b NA # NA because there are no non-NA, naturally
# no inconvenient warning
```

## NOTES

1. New feature 29 in v1.12.4 (Oct 2019) introduced zero-copy coercion. Our thinking is that requiring you to get the type right in the case of `0` (type double) vs `0L` (type integer) is too inconvenient for you, the user, so such coercions happen automatically in `data.table` without warning. Thanks to zero-copy coercion there is no speed penalty, even when calling `set()` many times in a loop, so there is no speed penalty to warn you about either. However, we believe that assigning a character value such as `"2"` to an integer column is more likely to be a user mistake that you would like to be warned about. The type difference (character vs integer) may be the only clue that you have selected the wrong column, or typed the wrong variable to assign to that column. For this reason we view character to numeric-like coercion differently and will warn about it. If the coercion is intended, the warning nudges you to wrap the RHS in `as.<type>()` so that it is clear to readers of your code that a coercion from character to that type is intended. For example:
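The example that follows this paragraph in NEWS.md is truncated by the diff view. A minimal illustrative sketch of the behavior described (a hypothetical one-column table, not the verbatim NEWS example):

```R
library(data.table)
DT = data.table(A = 1:3)        # column A is type integer
set(DT, i=1L, j="A", value=0)   # double RHS 0 is silently coerced to 0L (zero-copy, no warning)
DT[2L, A := "2"]                # character RHS: coerced, but with a warning in case it's a mistake
DT[3L, A := as.integer("3")]    # wrapping in as.integer() makes the intent explicit
DT$A                            # c(0L, 2L, 3L); still type integer throughout
```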
49 changes: 27 additions & 22 deletions inst/tests/tests.Rraw
@@ -5178,8 +5178,8 @@ test(1313.04, DT[, max(y, na.rm=TRUE), by=x], DT[, base::max(y, na.rm=TRUE), by=
DT[x==6, y := INT(NA)]
test(1313.05, DT[, min(y), by=x], DT[, base::min(y), by=x])
test(1313.06, DT[, max(y), by=x], DT[, base::max(y), by=x])
test(1313.07, DT[, min(y, na.rm=TRUE), by=x], data.table(x=1:6, V1=c(-1,4,4,4,-2147483647,Inf)), warning="No non-missing")
test(1313.08, DT[, max(y, na.rm=TRUE), by=x], data.table(x=1:6, V1=c(4,10,10,10,-2147483647,-Inf)), warning="No non-missing")
test(1313.07, DT[, min(y, na.rm=TRUE), by=x], data.table(x=1:6, V1=INT(-1,4,4,4,-2147483647,NA)))
test(1313.08, DT[, max(y, na.rm=TRUE), by=x], data.table(x=1:6, V1=INT(4,10,10,10,-2147483647,NA)))

# for numeric
DT <- data.table(x=rep(1:6, each=3), y=c(4,-1,0, NA,4,10, 4,NA,10, 4,10,NA, -Inf, NA, NA, Inf, NA, NA))
@@ -5191,8 +5191,8 @@ test(1313.12, DT[, max(y, na.rm=TRUE), by=x], DT[, base::max(y, na.rm=TRUE), by=
DT[x==6, y := NA_real_]
test(1313.13, DT[, min(y), by=x], DT[, base::min(y), by=x])
test(1313.14, DT[, max(y), by=x], DT[, base::max(y), by=x])
test(1313.15, DT[, min(y, na.rm=TRUE), by=x], data.table(x=1:6, V1=c(-1,4,4,4,-Inf,Inf)), warning="No non-missing")
test(1313.16, DT[, max(y, na.rm=TRUE), by=x], data.table(x=1:6, V1=c(4,10,10,10,-Inf,-Inf)), warning="No non-missing")
test(1313.15, DT[, min(y, na.rm=TRUE), by=x], data.table(x=1:6, V1=c(-1,4,4,4,-Inf,NA)))
test(1313.16, DT[, max(y, na.rm=TRUE), by=x], data.table(x=1:6, V1=c(4,10,10,10,-Inf,NA)))

# for date (attribute check.. especially after issues/689 !!!)
DT <- data.table(x = rep(letters[1:2], each=5), y = as.POSIXct('2010-01-01', tz="UTC") + seq(0, 86400*9, 86400))
@@ -5215,8 +5215,8 @@ test(1313.26, DT[, max(y, na.rm=TRUE), by=x], DT[, base::max(y, na.rm=TRUE), by=
DT[x==6, y := NA_character_]
test(1313.27, DT[, min(y), by=x], DT[, base::min(y), by=x])
test(1313.28, DT[, max(y), by=x], DT[, base::max(y), by=x])
test(1313.29, DT[, min(y, na.rm=TRUE), by=x], data.table(x=1:7, V1=c("a","a","c","","a",NA,"")), warning="No non-missing")
test(1313.30, DT[, max(y, na.rm=TRUE), by=x], data.table(x=1:7, V1=c("b","a","c","a","c",NA,"c")), warning="No non-missing")
test(1313.29, DT[, min(y, na.rm=TRUE), by=x], data.table(x=1:7, V1=c("a","a","c","","a",NA,"")))
test(1313.30, DT[, max(y, na.rm=TRUE), by=x], data.table(x=1:7, V1=c("b","a","c","a","c",NA,"c")))

# bug 700 - bmerge, roll=TRUE and nomatch=0L when i's key group occurs more than once
dt1 <- data.table(structure(list(x = c(7L, 33L), y = structure(c(15912, 15912), class = "Date"), z = c(626550.35284, 7766.385)), .Names =
@@ -8220,18 +8220,23 @@ test(1581.19, DT, DT0[ , var := c('A', 'A', 'B')])
# handle NULL value correctly #1429
test(1582, uniqueN(NULL), 0L)

# bug fix #1461
dt = data.table(x=c(1,1,1,2,2,2,3,3,3,4,4,4,5), y=c(NaN,1,2, 2,NaN,1, NA,NaN,2, NaN,NA,NaN, NaN))
# make sure gforce is on
options(datatable.optimize=Inf)
ans1 = suppressWarnings(dt[, base::min(y, na.rm=TRUE), by=x])
ans2 = suppressWarnings(dt[, base::max(y, na.rm=TRUE), by=x])
test(1583.1, dt[, min(y, na.rm=TRUE), by=x], ans1, warning="No non-missing values found")
test(1583.2, dt[, max(y, na.rm=TRUE), by=x], ans2, warning="No non-missing values found")
ans3 = suppressWarnings(dt[, base::min(y), by=x])
ans4 = suppressWarnings(dt[, base::max(y), by=x])
test(1583.3, dt[, min(y), by=x], ans3)
test(1583.4, dt[, max(y), by=x], ans4)
# bug fix #1461 related to NaN not being recognized due to ISNA vs ISNAN at C level
# verbatim test from the original report:
options(datatable.optimize=Inf) # ensure gforce is on
DT = data.table(
C1 = c(rep("A", 4), rep("B",4), rep("C", 4)),
C2 = c(rep("a", 3), rep("b",3), rep("c",3), rep("d",3)),
Val = c(1:5, NaN, NaN, 8,9,10,NaN,12))
test(1583.1, DT[, .(agg = min(Val, na.rm=TRUE)), by=c('C1', 'C2')],
data.table(C1=c("A","A","B","B","C","C"),
C2=c("a","b","b","c","c","d"),
agg=c(1,4,5,8,9,10)))
# extra test with a size-1 group containing one NaN too
DT = data.table(x=INT(1,1,1,2,2,2,3,3,3,4,4,4,5), y=c(NaN,1,2, 2,NaN,1, NA,NaN,2, NaN,NA,NaN, NaN))
test(1583.2, DT[, min(y, na.rm=TRUE), by=x], data.table(x=1:5, V1=c(1,1,2,NA,NA)))
test(1583.3, DT[, max(y, na.rm=TRUE), by=x], data.table(x=1:5, V1=c(2,2,2,NA,NA)))
test(1583.4, DT[, min(y), by=x], data.table(x=1:5, V1=c(NaN,NaN,NA,NaN,NaN)))
test(1583.5, DT[, max(y), by=x], data.table(x=1:5, V1=c(NaN,NaN,NA,NaN,NaN)))

# Fixed a minor bug in fread when blank.lines.skip=TRUE
f1 <- function(x, f=TRUE, b=FALSE) fread(x, fill=f, blank.lines.skip=b, data.table=FALSE, logical01=FALSE)
@@ -14700,8 +14705,8 @@ if (test_bit64) {
test(2019, DT[2:6, sum(v), id], data.table(id=1:2, V1=bit64::as.integer64(c(5L,15L)))) # gather, case of int64 and irows
}
DT = data.table(id = c(1L,1L,2L), v = as.raw(0:2))
test(2020.01, DT[, min(v), by=id], error="'raw' not supported by GForce min")
test(2020.02, DT[, max(v), by=id], error="'raw' not supported by GForce max")
test(2020.01, DT[, min(v), by=id], error="'raw' not supported by GForce min/max")
test(2020.02, DT[, max(v), by=id], error="'raw' not supported by GForce min/max")
test(2020.03, DT[, median(v), by=id], error="'raw' not supported by GForce median")
test(2020.04, DT[, head(v, 1), by=id], error="'raw' not supported by GForce head")
test(2020.05, DT[, tail(v, 1), by=id], error="'raw' not supported by GForce tail")
@@ -15820,8 +15825,8 @@ test(2065.7, DT[1L, z_sum := 1i][1L, z_sum], 1i)

# GForce for complex columns, part of #3690
DT = data.table(id=c(1L,1L,2L), v=c(1i, 2i, 3i))
test(2066.01, DT[, min(v), by=id], error="'complex' has no well-defined min")
test(2066.02, DT[, max(v), by=id], error="'complex' has no well-defined max")
test(2066.01, DT[, min(v), by=id], error="'complex' has no well-defined min/max")
test(2066.02, DT[, max(v), by=id], error="'complex' has no well-defined min/max")
test(2066.03, DT[, head(v, 1), by=id], data.table(id=1:2, V1=c(1, 3)*1i))
test(2066.04, DT[, tail(v, 1), by=id], data.table(id=1:2, V1=(2:3)*1i))
test(2066.05, DT[, v[2], by=id], data.table(id = 1:2, V1=c(2i, NA)))