From 5cdee25c1e9cccdbbff82b2804823ba4d6a5ebd4 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Thu, 20 Feb 2020 21:04:47 +0800 Subject: [PATCH 1/3] add a regression test for fixed dcast fun.aggregate issue --- NEWS.md | 2 ++ inst/tests/tests.Rraw | 7 +++++++ 2 files changed, 9 insertions(+) diff --git a/NEWS.md b/NEWS.md index 71fd76aa65..aeb57245ea 100644 --- a/NEWS.md +++ b/NEWS.md @@ -107,6 +107,8 @@ unit = "s") 12. `rbindlist` no longer errors when coercing complex vectors to character vectors, [#4202](https://github.com/Rdatatable/data.table/issues/4202). Thanks to @sritchie73 for reporting and the PR. +13. A type mismatch among levels of the LHS in `dcast` could produce garbage values (some older R versions segfaulted), [#2394](https://github.com/Rdatatable/data.table/issues/2394). Thanks @khotilov for the bug report. + ## NOTES 0. Retrospective license change permission was sought from and granted by 4 contributors who were missed in [PR#2456](https://github.com/Rdatatable/data.table/pull/2456), [#4140](https://github.com/Rdatatable/data.table/pull/4140). We had used [GitHub's contributor page](https://github.com/Rdatatable/data.table/graphs/contributors) which omits 3 of these due to invalid email addresses, unlike GitLab's contributor page which includes the ids. The 4th omission was a PR to a script which should not have been excluded; a script is code too. We are sorry these contributors were not properly credited before. They have now been added to the contributors list as displayed on CRAN. All the contributors of code to data.table hold its copyright jointly; your contributions belong to you. You contributed to data.table when it had a particular license at that time, and you contributed on that basis. This is why in the last license change, all contributors of code were consulted and each had a veto. diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 7cc6819e8f..8d35b9b7e9 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -16846,3 +16846,10 @@ A = data.table(A=c(complex(real = 1:3, imaginary=c(0, -1, 1)), NaN)) test(2138.3, rbind(A,B), data.table(A=c(as.character(A$A), B$A))) A = data.table(A=as.complex(rep(NA, 5))) test(2138.4, rbind(A,B), data.table(A=c(as.character(A$A), B$A))) + +# type mismatch in fun.aggregate used to segfault (#2394), now appears fixed; test against regression +agg <- function(x) if(length(x) > 0) min(x) else NA +d <- data.table(id = c(1,1,2,2), x = c('y','y','y','z'), v = c('a','b','c','d')) +dcast(d, formula = id ~ x, fun.aggregate = agg, value.var = 'v') +test(2139, dcast(d, formula = id ~ x, fun.aggregate = agg, value.var = 'v'), + data.table(num = c(1, 2), y = c('a', 'c'), z = c(NA, 'd'))) From 62d17e87ac76094cee966fc71e43fa37e11ba19c Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Thu, 13 May 2021 15:08:02 -0600 Subject: [PATCH 2/3] moved news item to note and added detail, fixed the test --- NEWS.md | 4 ++-- inst/tests/tests.Rraw | 10 +++++----- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/NEWS.md b/NEWS.md index 5a9c97b296..e26cf65c1e 100644 --- a/NEWS.md +++ b/NEWS.md @@ -117,8 +117,6 @@ 14. `by=...get()...` could fail with `object not found`, [#4873](https://github.com/Rdatatable/data.table/issues/4873) [#4981](https://github.com/Rdatatable/data.table/issues/4981). Thanks to @sindribaldur for reporting, and @OfekShilon for fixing. -15. A type mismatch among levels of the LHS in `dcast` could produce garbage values (some older R versions segfaulted), [#2394](https://github.com/Rdatatable/data.table/issues/2394). Thanks @khotilov for the bug report. - ## NOTES 1. New feature 29 in v1.12.4 (Oct 2019) introduced zero-copy coercion. Our thinking is that requiring you to get the type right in the case of `0` (type double) vs `0L` (type integer) is too inconvenient for you the user. So such coercions happen in `data.table` automatically without warning. Thanks to zero-copy coercion there is no speed penalty, even when calling `set()` many times in a loop, so there's no speed penalty to warn you about either. However, we believe that assigning a character value such as `"2"` into an integer column is more likely to be a user mistake that you would like to be warned about. The type difference (character vs integer) may be the only clue that you have selected the wrong column, or typed the wrong variable to be assigned to that column. For this reason we view character to numeric-like coercion differently and will warn about it. If it is correct, then the warning is intended to nudge you to wrap the RHS with `as.()` so that it is clear to readers of your code that a coercion from character to that type is intended. For example : @@ -137,6 +135,8 @@ 4. `cube(DT, by="a")` now gives a more helpful error that `j` is missing, [#4282](https://github.com/Rdatatable/data.table/pull/4282). +5. v1.13.0 (July 2020) fixed a segfault/corruption/error (depending on version of R and circumstances) in `dcast()` when `fun.aggregate` returned `NA` (type `logical`) in an otherwise `character` result, [#2394](https://github.com/Rdatatable/data.table/issues/2394). This fix was the result of other internal rework and there was no news item at the time. A new test to cover this case has now been added. Thanks Vadim Khotilovich for reporting, and Michael Chirico for investigating, pinpointing when the fix occurred and adding the test. + # data.table [v1.14.0](https://github.com/Rdatatable/data.table/milestone/23?closed=1) (21 Feb 2021) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 4c660d77f1..d1a1f1b28b 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -17579,8 +17579,8 @@ ans = data.table(round = c(3, 3, 3, 2, 2, 4, 2, 4), k = c(6, 7, 8, 5, 7, 7, 6, 8 test(2184.5, IDT[(virginca), .N, by = .(round(Sepal.Width), k = round(Sepal.Length), kar = get(vr))] , ans) # type mismatch in fun.aggregate used to segfault (#2394), now appears fixed; test against regression -agg <- function(x) if(length(x) > 0) min(x) else NA -d <- data.table(id = c(1,1,2,2), x = c('y','y','y','z'), v = c('a','b','c','d')) -dcast(d, formula = id ~ x, fun.aggregate = agg, value.var = 'v') -test(2185, dcast(d, formula = id ~ x, fun.aggregate = agg, value.var = 'v'), - data.table(num = c(1, 2), y = c('a', 'c'), z = c(NA, 'd'))) +agg = function(x) if(length(x) > 0) min(x) else NA +DT = data.table(id=c(1,1,2,2), x=c('y','y','y','z'), v=c('a','b','c','d')) +test(2185, dcast(DT, formula=id~x, fun.aggregate=agg, value.var='v'), + data.table(id=c(1,2), y=c('a','c'), z=c(NA,'d'), key="id")) + From e8f66ec7c6ab03a589f137791bfd1985b2354bb3 Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Thu, 13 May 2021 15:12:09 -0600 Subject: [PATCH 3/3] tweak comment on test --- inst/tests/tests.Rraw | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index d1a1f1b28b..a14dbe2928 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -17578,7 +17578,7 @@ IDT[, virginca := get(vr) == "virginica"] ans = data.table(round = c(3, 3, 3, 2, 2, 4, 2, 4), k = c(6, 7, 8, 5, 7, 7, 6, 8), kar = structure(c(3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L), .Label = c("setosa", "versicolor", "virginica"), class = "factor"), N = c(24L, 14L, 4L, 1L, 1L, 1L, 3L, 2L)) test(2184.5, IDT[(virginca), .N, by = .(round(Sepal.Width), k = round(Sepal.Length), kar = get(vr))] , ans) -# type mismatch in fun.aggregate used to segfault (#2394), now appears fixed; test against regression +# dcast() segfault or 'STRING_ELT() can only be applied to character not logical' fixed in v1.13.0, #2394 agg = function(x) if(length(x) > 0) min(x) else NA DT = data.table(id=c(1,1,2,2), x=c('y','y','y','z'), v=c('a','b','c','d')) test(2185, dcast(DT, formula=id~x, fun.aggregate=agg, value.var='v'),