From 711029fc794adb3c40fdd669c9b315d4f1c72c4b Mon Sep 17 00:00:00 2001 From: Vadim Khotilovich Date: Tue, 15 Dec 2020 03:25:21 -0600 Subject: [PATCH 1/9] Fix gmin bug: result depended on blank strings position --- inst/tests/tests.Rraw | 7 ++++--- src/gsumm.c | 10 +++++++--- 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index d26f7ff509..785b4a9390 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -5121,7 +5121,8 @@ test(1313.22, DT[, list(y=max(y, na.rm=TRUE)), by=x], DT[c(5,10)]) # for character set.seed(1L) -DT <- data.table(x=rep(1:6, each=3), y=sample(c("", letters[1:3], NA), 18, TRUE)) +DT <- data.table(x=rep(1:7, each=3), y=sample(c("", letters[1:3], NA), 21, TRUE)) +DT[x==7, y := c("","b","c")] test(1313.23, DT[, min(y), by=x], DT[, base::min(y), by=x]) test(1313.24, DT[, max(y), by=x], DT[, base::max(y), by=x]) test(1313.25, DT[, min(y, na.rm=TRUE), by=x], DT[, base::min(y, na.rm=TRUE), by=x]) @@ -5129,8 +5130,8 @@ test(1313.26, DT[, max(y, na.rm=TRUE), by=x], DT[, base::max(y, na.rm=TRUE), by= DT[x==6, y := NA_character_] test(1313.27, DT[, min(y), by=x], DT[, base::min(y), by=x]) test(1313.28, DT[, max(y), by=x], DT[, base::max(y), by=x]) -test(1313.29, DT[, min(y, na.rm=TRUE), by=x], data.table(x=1:6, V1=c("a","a","c","","a",NA)), warning="No non-missing") -test(1313.30, DT[, max(y, na.rm=TRUE), by=x], data.table(x=1:6, V1=c("b","a","c","a","c",NA)), warning="No non-missing") +test(1313.29, DT[, min(y, na.rm=TRUE), by=x], data.table(x=1:7, V1=c("a","a","c","","a",NA,"")), warning="No non-missing") +test(1313.30, DT[, max(y, na.rm=TRUE), by=x], data.table(x=1:7, V1=c("b","a","c","a","c",NA,"c")), warning="No non-missing") # bug 700 - bmerge, roll=TRUE and nomatch=0L when i's key group occurs more than once dt1 <- data.table(structure(list(x = c(7L, 33L), y = structure(c(15912, 15912), class = "Date"), z = c(626550.35284, 7766.385)), .Names = diff --git a/src/gsumm.c b/src/gsumm.c index ed34e76207..f57312bfaa 100644 --- a/src/gsumm.c +++ b/src/gsumm.c @@ -726,22 +726,26 @@ SEXP gmin(SEXP x, SEXP narm) break; case STRSXP: ans = PROTECT(allocVector(STRSXP, ngrp)); protecti++; + for (i=0; i Date: Tue, 15 Dec 2020 11:41:23 -0600 Subject: [PATCH 2/9] address review comments --- src/gsumm.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/gsumm.c b/src/gsumm.c index f57312bfaa..0cf6f884af 100644 --- a/src/gsumm.c +++ b/src/gsumm.c @@ -728,23 +728,23 @@ SEXP gmin(SEXP x, SEXP narm) ans = PROTECT(allocVector(STRSXP, ngrp)); protecti++; for (i=0; i Date: Fri, 14 May 2021 17:34:59 -0600 Subject: [PATCH 3/9] add Vadim to DESCRIPTION --- DESCRIPTION | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index 9e5302f2bb..8ab2deaa0d 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -63,7 +63,8 @@ Authors@R: c( person("Dirk","Eddelbuettel", role="ctb"), person("Ben","Schwen", role="ctb"), person("Tony","Fischetti", role="ctb"), - person("Ofek","Shilon", role="ctb")) + person("Ofek","Shilon", role="ctb"), + person("Vadim","Khotilovich", role="ctb")) Depends: R (>= 3.1.0) Imports: methods Suggests: bit64 (>= 4.0.0), bit (>= 4.0.4), curl, R.utils, xts, nanotime, zoo (>= 1.8-1), yaml, knitr, rmarkdown, markdown From 187e1d030c86139c54eb482c3080c9432d3eab1a Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Fri, 14 May 2021 17:45:54 -0600 Subject: [PATCH 4/9] added news item --- NEWS.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/NEWS.md b/NEWS.md index b80eb35c63..ddfe4ce129 100644 --- a/NEWS.md +++ b/NEWS.md @@ -125,6 +125,8 @@ 15. `print(x, col.names='none')` now removes the column names as intended for wide `data.table`s whose column names don't fit on a single line, [#4270](https://github.com/Rdatatable/data.table/issues/4270). Thanks to @tdhock for the report, and Michael Chirico for fixing. +16. `DT[, min(colB), by=colA]` when `colB` is type `character` would miss blank strings (`""`) if they are only present in that column at the top of the table and return the smallest non-blank incorrectly (instead of blank), [#4848](https://github.com/Rdatatable/data.table/issues/4848). Thanks to Vadim Khotilovich for reporting and for the PR fixing it. + ## NOTES 1. New feature 29 in v1.12.4 (Oct 2019) introduced zero-copy coercion. Our thinking is that requiring you to get the type right in the case of `0` (type double) vs `0L` (type integer) is too inconvenient for you the user. So such coercions happen in `data.table` automatically without warning. Thanks to zero-copy coercion there is no speed penalty, even when calling `set()` many times in a loop, so there's no speed penalty to warn you about either. However, we believe that assigning a character value such as `"2"` into an integer column is more likely to be a user mistake that you would like to be warned about. The type difference (character vs integer) may be the only clue that you have selected the wrong column, or typed the wrong variable to be assigned to that column. For this reason we view character to numeric-like coercion differently and will warn about it. If it is correct, then the warning is intended to nudge you to wrap the RHS with `as.()` so that it is clear to readers of your code that a coercion from character to that type is intended. For example : From 572e40bf2d32600d92c1d72ea33ba0927c2a402f Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Fri, 14 May 2021 17:52:33 -0600 Subject: [PATCH 5/9] added max() to news item --- NEWS.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/NEWS.md b/NEWS.md index ddfe4ce129..c6969b4beb 100644 --- a/NEWS.md +++ b/NEWS.md @@ -125,7 +125,7 @@ 15. `print(x, col.names='none')` now removes the column names as intended for wide `data.table`s whose column names don't fit on a single line, [#4270](https://github.com/Rdatatable/data.table/issues/4270). Thanks to @tdhock for the report, and Michael Chirico for fixing. -16. `DT[, min(colB), by=colA]` when `colB` is type `character` would miss blank strings (`""`) if they are only present in that column at the top of the table and return the smallest non-blank incorrectly (instead of blank), [#4848](https://github.com/Rdatatable/data.table/issues/4848). Thanks to Vadim Khotilovich for reporting and for the PR fixing it. +16. `DT[, min(colB), by=colA]` when `colB` is type `character` would miss blank strings (`""`) if they are only present in that column at the top of the table and return the smallest non-blank incorrectly (instead of blank), [#4848](https://github.com/Rdatatable/data.table/issues/4848). Similarly for `max()` too. Thanks to Vadim Khotilovich for reporting and for the PR fixing it. ## NOTES From ed5385e982ca168790a945cacbb233f2ebfe8924 Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Fri, 14 May 2021 19:20:35 -0600 Subject: [PATCH 6/9] revert adding max() to news item; it's just min --- NEWS.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/NEWS.md b/NEWS.md index c6969b4beb..ddfe4ce129 100644 --- a/NEWS.md +++ b/NEWS.md @@ -125,7 +125,7 @@ 15. `print(x, col.names='none')` now removes the column names as intended for wide `data.table`s whose column names don't fit on a single line, [#4270](https://github.com/Rdatatable/data.table/issues/4270). Thanks to @tdhock for the report, and Michael Chirico for fixing. -16. `DT[, min(colB), by=colA]` when `colB` is type `character` would miss blank strings (`""`) if they are only present in that column at the top of the table and return the smallest non-blank incorrectly (instead of blank), [#4848](https://github.com/Rdatatable/data.table/issues/4848). Similarly for `max()` too. Thanks to Vadim Khotilovich for reporting and for the PR fixing it. +16. `DT[, min(colB), by=colA]` when `colB` is type `character` would miss blank strings (`""`) if they are only present in that column at the top of the table and return the smallest non-blank incorrectly (instead of blank), [#4848](https://github.com/Rdatatable/data.table/issues/4848). Thanks to Vadim Khotilovich for reporting and for the PR fixing it. ## NOTES From 1facefd76b96c588324b555970f9af7fa37046e4 Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Fri, 14 May 2021 20:55:06 -0600 Subject: [PATCH 7/9] simpler fix to avoid calloc --- src/data.table.h | 1 + src/gsumm.c | 12 ++++-------- src/init.c | 2 ++ 3 files changed, 7 insertions(+), 8 deletions(-) diff --git a/src/data.table.h b/src/data.table.h index 9fb386567d..50d43a34a5 100644 --- a/src/data.table.h +++ b/src/data.table.h @@ -89,6 +89,7 @@ extern SEXP char_ordered; extern SEXP char_datatable; extern SEXP char_dataframe; extern SEXP char_NULL; +extern SEXP char_maxString; extern SEXP sym_sorted; extern SEXP sym_index; extern SEXP sym_BY; diff --git a/src/gsumm.c b/src/gsumm.c index b5ac98afce..064131f3cd 100644 --- a/src/gsumm.c +++ b/src/gsumm.c @@ -759,26 +759,22 @@ SEXP gmin(SEXP x, SEXP narm) break; case STRSXP: ans = PROTECT(allocVector(STRSXP, ngrp)); protecti++; - for (i=0; i Date: Fri, 14 May 2021 20:58:56 -0600 Subject: [PATCH 8/9] news item tweak --- NEWS.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/NEWS.md b/NEWS.md index ddfe4ce129..a83e4ccaac 100644 --- a/NEWS.md +++ b/NEWS.md @@ -125,7 +125,7 @@ 15. `print(x, col.names='none')` now removes the column names as intended for wide `data.table`s whose column names don't fit on a single line, [#4270](https://github.com/Rdatatable/data.table/issues/4270). Thanks to @tdhock for the report, and Michael Chirico for fixing. -16. `DT[, min(colB), by=colA]` when `colB` is type `character` would miss blank strings (`""`) if they are only present in that column at the top of the table and return the smallest non-blank incorrectly (instead of blank), [#4848](https://github.com/Rdatatable/data.table/issues/4848). Thanks to Vadim Khotilovich for reporting and for the PR fixing it. +16. `DT[, min(colB), by=colA]` when `colB` is type `character` would miss blank strings (`""`) at the beginning of a group and return the smallest non-blank (instead of blank), [#4848](https://github.com/Rdatatable/data.table/issues/4848). Thanks to Vadim Khotilovich for reporting and for the PR fixing it. ## NOTES From ad08153774bc5fbbc75b599d2565b238a079ea1b Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Fri, 14 May 2021 21:03:05 -0600 Subject: [PATCH 9/9] news item tweak --- NEWS.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/NEWS.md b/NEWS.md index a83e4ccaac..fd48fdcade 100644 --- a/NEWS.md +++ b/NEWS.md @@ -125,7 +125,7 @@ 15. `print(x, col.names='none')` now removes the column names as intended for wide `data.table`s whose column names don't fit on a single line, [#4270](https://github.com/Rdatatable/data.table/issues/4270). Thanks to @tdhock for the report, and Michael Chirico for fixing. -16. `DT[, min(colB), by=colA]` when `colB` is type `character` would miss blank strings (`""`) at the beginning of a group and return the smallest non-blank (instead of blank), [#4848](https://github.com/Rdatatable/data.table/issues/4848). Thanks to Vadim Khotilovich for reporting and for the PR fixing it. +16. `DT[, min(colB), by=colA]` when `colB` is type `character` would miss blank strings (`""`) at the beginning of a group and return the smallest non-blank instead of blank, [#4848](https://github.com/Rdatatable/data.table/issues/4848). Thanks to Vadim Khotilovich for reporting and for the PR fixing it. ## NOTES