From 711029fc794adb3c40fdd669c9b315d4f1c72c4b Mon Sep 17 00:00:00 2001
From: Vadim Khotilovich <khotilovich@gmail.com>
Date: Tue, 15 Dec 2020 03:25:21 -0600
Subject: [PATCH 1/9] Fix gmin bug: result depended on blank strings position

---
 inst/tests/tests.Rraw |  7 ++++---
 src/gsumm.c           | 10 +++++++---
 2 files changed, 11 insertions(+), 6 deletions(-)

diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw
index d26f7ff509..785b4a9390 100644
--- a/inst/tests/tests.Rraw
+++ b/inst/tests/tests.Rraw
@@ -5121,7 +5121,8 @@ test(1313.22, DT[, list(y=max(y, na.rm=TRUE)), by=x], DT[c(5,10)])
 
 # for character
 set.seed(1L)
-DT <- data.table(x=rep(1:6, each=3), y=sample(c("", letters[1:3], NA), 18, TRUE))
+DT <- data.table(x=rep(1:7, each=3), y=sample(c("", letters[1:3], NA), 21, TRUE))
+DT[x==7, y := c("","b","c")]
 test(1313.23, DT[, min(y), by=x], DT[, base::min(y), by=x])
 test(1313.24, DT[, max(y), by=x], DT[, base::max(y), by=x])
 test(1313.25, DT[, min(y, na.rm=TRUE), by=x], DT[, base::min(y, na.rm=TRUE), by=x])
@@ -5129,8 +5130,8 @@ test(1313.26, DT[, max(y, na.rm=TRUE), by=x], DT[, base::max(y, na.rm=TRUE), by=
 DT[x==6, y := NA_character_]
 test(1313.27, DT[, min(y), by=x], DT[, base::min(y), by=x])
 test(1313.28, DT[, max(y), by=x], DT[, base::max(y), by=x])
-test(1313.29, DT[, min(y, na.rm=TRUE), by=x], data.table(x=1:6, V1=c("a","a","c","","a",NA)), warning="No non-missing")
-test(1313.30, DT[, max(y, na.rm=TRUE), by=x], data.table(x=1:6, V1=c("b","a","c","a","c",NA)), warning="No non-missing")
+test(1313.29, DT[, min(y, na.rm=TRUE), by=x], data.table(x=1:7, V1=c("a","a","c","","a",NA,"")), warning="No non-missing")
+test(1313.30, DT[, max(y, na.rm=TRUE), by=x], data.table(x=1:7, V1=c("b","a","c","a","c",NA,"c")), warning="No non-missing")
 
 # bug 700 - bmerge, roll=TRUE and nomatch=0L when i's key group occurs more than once
 dt1 <- data.table(structure(list(x = c(7L, 33L), y = structure(c(15912, 15912), class = "Date"), z = c(626550.35284, 7766.385)), .Names =
diff --git a/src/gsumm.c b/src/gsumm.c
index ed34e76207..f57312bfaa 100644
--- a/src/gsumm.c
+++ b/src/gsumm.c
@@ -726,22 +726,26 @@ SEXP gmin(SEXP x, SEXP narm)
     break;
   case STRSXP:
     ans = PROTECT(allocVector(STRSXP, ngrp)); protecti++;
+    for (i=0; i<ngrp; i++) SET_STRING_ELT(ans, i, NA_STRING);
     if (!LOGICAL(narm)[0]) {
-      for (i=0; i<ngrp; i++) SET_STRING_ELT(ans, i, R_BlankString);
+      Rboolean *upd = calloc(ngrp, sizeof(Rboolean));
+      if (!upd) error(_("Unable to allocate %d * %d bytes for the update mask in gmin na.rm=FALSE"), ngrp, sizeof(Rboolean));
       for (i=0; i<n; i++) {
         thisgrp = grp[i];
         ix = (irowslen == -1) ? i : irows[i]-1;
         if (STRING_ELT(x, ix) == NA_STRING) {
           SET_STRING_ELT(ans, thisgrp, NA_STRING);
+          upd[thisgrp] = TRUE;
         } else {
-          if (STRING_ELT(ans, thisgrp) == R_BlankString ||
+          if (!upd[thisgrp] ||
             (STRING_ELT(ans, thisgrp) != NA_STRING && strcmp(CHAR(STRING_ELT(x, ix)), CHAR(STRING_ELT(ans, thisgrp))) < 0 )) {
             SET_STRING_ELT(ans, thisgrp, STRING_ELT(x, ix));
+            upd[thisgrp] = TRUE;
           }
         }
       }
+      free(upd);
     } else {
-      for (i=0; i<ngrp; i++) SET_STRING_ELT(ans, i, NA_STRING);
       for (i=0; i<n; i++) {
         thisgrp = grp[i];
         ix = (irowslen == -1) ? i : irows[i]-1;

From 09720268347f6a4a36e43ee337943a64f561cd1f Mon Sep 17 00:00:00 2001
From: Vadim Khotilovich <khotilovich@gmail.com>
Date: Tue, 15 Dec 2020 11:41:23 -0600
Subject: [PATCH 2/9] address review comments

---
 src/gsumm.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/src/gsumm.c b/src/gsumm.c
index f57312bfaa..0cf6f884af 100644
--- a/src/gsumm.c
+++ b/src/gsumm.c
@@ -728,23 +728,23 @@ SEXP gmin(SEXP x, SEXP narm)
     ans = PROTECT(allocVector(STRSXP, ngrp)); protecti++;
     for (i=0; i<ngrp; i++) SET_STRING_ELT(ans, i, NA_STRING);
     if (!LOGICAL(narm)[0]) {
-      Rboolean *upd = calloc(ngrp, sizeof(Rboolean));
-      if (!upd) error(_("Unable to allocate %d * %d bytes for the update mask in gmin na.rm=FALSE"), ngrp, sizeof(Rboolean));
+      bool *updated = calloc(ngrp, sizeof(bool));
+      if (!updated) error(_("Unable to allocate %d * %d bytes for the update mask in gmin na.rm=FALSE"), ngrp, sizeof(bool));
       for (i=0; i<n; i++) {
         thisgrp = grp[i];
         ix = (irowslen == -1) ? i : irows[i]-1;
         if (STRING_ELT(x, ix) == NA_STRING) {
           SET_STRING_ELT(ans, thisgrp, NA_STRING);
-          upd[thisgrp] = TRUE;
+          updated[thisgrp] = true;
         } else {
-          if (!upd[thisgrp] ||
+          if (!updated[thisgrp] ||
             (STRING_ELT(ans, thisgrp) != NA_STRING && strcmp(CHAR(STRING_ELT(x, ix)), CHAR(STRING_ELT(ans, thisgrp))) < 0 )) {
             SET_STRING_ELT(ans, thisgrp, STRING_ELT(x, ix));
-            upd[thisgrp] = TRUE;
+            updated[thisgrp] = true;
           }
         }
       }
-      free(upd);
+      free(updated);
     } else {
       for (i=0; i<n; i++) {
         thisgrp = grp[i];

From e557bbb5959cc085260a3a67f81360ac9b84236d Mon Sep 17 00:00:00 2001
From: Matt Dowle <mattjdowle@gmail.com>
Date: Fri, 14 May 2021 17:34:59 -0600
Subject: [PATCH 3/9] add Vadim to DESCRIPTION

---
 DESCRIPTION | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index 9e5302f2bb..8ab2deaa0d 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -63,7 +63,8 @@ Authors@R: c(
   person("Dirk","Eddelbuettel",    role="ctb"),
   person("Ben","Schwen",           role="ctb"),
   person("Tony","Fischetti",       role="ctb"),
-  person("Ofek","Shilon",          role="ctb"))
+  person("Ofek","Shilon",          role="ctb"),
+  person("Vadim","Khotilovich",    role="ctb"))
 Depends: R (>= 3.1.0)
 Imports: methods
 Suggests: bit64 (>= 4.0.0), bit (>= 4.0.4), curl, R.utils, xts, nanotime, zoo (>= 1.8-1), yaml, knitr, rmarkdown, markdown

From 187e1d030c86139c54eb482c3080c9432d3eab1a Mon Sep 17 00:00:00 2001
From: Matt Dowle <mattjdowle@gmail.com>
Date: Fri, 14 May 2021 17:45:54 -0600
Subject: [PATCH 4/9] added news item

---
 NEWS.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/NEWS.md b/NEWS.md
index b80eb35c63..ddfe4ce129 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -125,6 +125,8 @@
 
 15. `print(x, col.names='none')` now removes the column names as intended for wide `data.table`s whose column names don't fit on a single line, [#4270](https://github.com/Rdatatable/data.table/issues/4270). Thanks to @tdhock for the report, and Michael Chirico for fixing.
 
+16. `DT[, min(colB), by=colA]` when `colB` is type `character` would miss blank strings (`""`) if they are only present in that column at the top of the table and return the smallest non-blank incorrectly (instead of blank), [#4848](https://github.com/Rdatatable/data.table/issues/4848). Thanks to Vadim Khotilovich for reporting and for the PR fixing it.
+
 ## NOTES
 
 1. New feature 29 in v1.12.4 (Oct 2019) introduced zero-copy coercion. Our thinking is that requiring you to get the type right in the case of `0` (type double) vs `0L` (type integer) is too inconvenient for you the user. So such coercions happen in `data.table` automatically without warning. Thanks to zero-copy coercion there is no speed penalty, even when calling `set()` many times in a loop, so there's no speed penalty to warn you about either. However, we believe that assigning a character value such as `"2"` into an integer column is more likely to be a user mistake that you would like to be warned about. The type difference (character vs integer) may be the only clue that you have selected the wrong column, or typed the wrong variable to be assigned to that column. For this reason we view character to numeric-like coercion differently and will warn about it. If it is correct, then the warning is intended to nudge you to wrap the RHS with `as.<type>()` so that it is clear to readers of your code that a coercion from character to that type is intended. For example :

From 572e40bf2d32600d92c1d72ea33ba0927c2a402f Mon Sep 17 00:00:00 2001
From: Matt Dowle <mattjdowle@gmail.com>
Date: Fri, 14 May 2021 17:52:33 -0600
Subject: [PATCH 5/9] added max() to news item

---
 NEWS.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/NEWS.md b/NEWS.md
index ddfe4ce129..c6969b4beb 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -125,7 +125,7 @@
 
 15. `print(x, col.names='none')` now removes the column names as intended for wide `data.table`s whose column names don't fit on a single line, [#4270](https://github.com/Rdatatable/data.table/issues/4270). Thanks to @tdhock for the report, and Michael Chirico for fixing.
 
-16. `DT[, min(colB), by=colA]` when `colB` is type `character` would miss blank strings (`""`) if they are only present in that column at the top of the table and return the smallest non-blank incorrectly (instead of blank), [#4848](https://github.com/Rdatatable/data.table/issues/4848). Thanks to Vadim Khotilovich for reporting and for the PR fixing it.
+16. `DT[, min(colB), by=colA]` when `colB` is type `character` would miss blank strings (`""`) if they are only present in that column at the top of the table and return the smallest non-blank incorrectly (instead of blank), [#4848](https://github.com/Rdatatable/data.table/issues/4848). Similarly for `max()` too. Thanks to Vadim Khotilovich for reporting and for the PR fixing it.
 
 ## NOTES
 

From ed5385e982ca168790a945cacbb233f2ebfe8924 Mon Sep 17 00:00:00 2001
From: Matt Dowle <mattjdowle@gmail.com>
Date: Fri, 14 May 2021 19:20:35 -0600
Subject: [PATCH 6/9] revert adding max() to news item; it's just min

---
 NEWS.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/NEWS.md b/NEWS.md
index c6969b4beb..ddfe4ce129 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -125,7 +125,7 @@
 
 15. `print(x, col.names='none')` now removes the column names as intended for wide `data.table`s whose column names don't fit on a single line, [#4270](https://github.com/Rdatatable/data.table/issues/4270). Thanks to @tdhock for the report, and Michael Chirico for fixing.
 
-16. `DT[, min(colB), by=colA]` when `colB` is type `character` would miss blank strings (`""`) if they are only present in that column at the top of the table and return the smallest non-blank incorrectly (instead of blank), [#4848](https://github.com/Rdatatable/data.table/issues/4848). Similarly for `max()` too. Thanks to Vadim Khotilovich for reporting and for the PR fixing it.
+16. `DT[, min(colB), by=colA]` when `colB` is type `character` would miss blank strings (`""`) if they are only present in that column at the top of the table and return the smallest non-blank incorrectly (instead of blank), [#4848](https://github.com/Rdatatable/data.table/issues/4848). Thanks to Vadim Khotilovich for reporting and for the PR fixing it.
 
 ## NOTES
 

From 1facefd76b96c588324b555970f9af7fa37046e4 Mon Sep 17 00:00:00 2001
From: Matt Dowle <mattjdowle@gmail.com>
Date: Fri, 14 May 2021 20:55:06 -0600
Subject: [PATCH 7/9] simpler fix to avoid calloc

---
 src/data.table.h |  1 +
 src/gsumm.c      | 12 ++++--------
 src/init.c       |  2 ++
 3 files changed, 7 insertions(+), 8 deletions(-)

diff --git a/src/data.table.h b/src/data.table.h
index 9fb386567d..50d43a34a5 100644
--- a/src/data.table.h
+++ b/src/data.table.h
@@ -89,6 +89,7 @@ extern SEXP char_ordered;
 extern SEXP char_datatable;
 extern SEXP char_dataframe;
 extern SEXP char_NULL;
+extern SEXP char_maxString;
 extern SEXP sym_sorted;
 extern SEXP sym_index;
 extern SEXP sym_BY;
diff --git a/src/gsumm.c b/src/gsumm.c
index b5ac98afce..064131f3cd 100644
--- a/src/gsumm.c
+++ b/src/gsumm.c
@@ -759,26 +759,22 @@ SEXP gmin(SEXP x, SEXP narm)
     break;
   case STRSXP:
     ans = PROTECT(allocVector(STRSXP, ngrp)); protecti++;
-    for (i=0; i<ngrp; i++) SET_STRING_ELT(ans, i, NA_STRING);
     if (!LOGICAL(narm)[0]) {
-      bool *updated = calloc(ngrp, sizeof(bool));
-      if (!updated) error(_("Unable to allocate %d * %d bytes for the update mask in gmin na.rm=FALSE"), ngrp, sizeof(bool));
+      for (i=0; i<ngrp; i++) SET_STRING_ELT(ans, i, char_maxString); // char_maxString == "\xFF\xFF..." in init.c
       for (i=0; i<n; i++) {
         thisgrp = grp[i];
         ix = (irowslen == -1) ? i : irows[i]-1;
         if (STRING_ELT(x, ix) == NA_STRING) {
           SET_STRING_ELT(ans, thisgrp, NA_STRING);
-          updated[thisgrp] = true;
         } else {
-          if (!updated[thisgrp] ||
-            (STRING_ELT(ans, thisgrp) != NA_STRING && strcmp(CHAR(STRING_ELT(x, ix)), CHAR(STRING_ELT(ans, thisgrp))) < 0 )) {
+          if (STRING_ELT(ans, thisgrp)==char_maxString || 
+             (STRING_ELT(ans, thisgrp)!=NA_STRING && strcmp(CHAR(STRING_ELT(x, ix)), CHAR(STRING_ELT(ans, thisgrp))) < 0)) {
             SET_STRING_ELT(ans, thisgrp, STRING_ELT(x, ix));
-            updated[thisgrp] = true;
           }
         }
       }
-      free(updated);
     } else {
+      for (i=0; i<ngrp; i++) SET_STRING_ELT(ans, i, NA_STRING);
       for (i=0; i<n; i++) {
         thisgrp = grp[i];
         ix = (irowslen == -1) ? i : irows[i]-1;
diff --git a/src/init.c b/src/init.c
index 5e4c854962..2c65f1d980 100644
--- a/src/init.c
+++ b/src/init.c
@@ -22,6 +22,7 @@ SEXP char_ordered;
 SEXP char_datatable;
 SEXP char_dataframe;
 SEXP char_NULL;
+SEXP char_maxString;
 SEXP sym_sorted;
 SEXP sym_index;
 SEXP sym_BY;
@@ -337,6 +338,7 @@ void attribute_visible R_init_datatable(DllInfo *info)
   char_datatable = PRINTNAME(install("data.table"));
   char_dataframe = PRINTNAME(install("data.frame"));
   char_NULL =      PRINTNAME(install("NULL"));
+  char_maxString = PRINTNAME(install("\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF"));
 
   if (TYPEOF(char_integer64) != CHARSXP) {
     // checking one is enough in case of any R-devel changes

From 7fdcbc965f1fc825a6e2f38861dcbc7db1cb1e32 Mon Sep 17 00:00:00 2001
From: Matt Dowle <mattjdowle@gmail.com>
Date: Fri, 14 May 2021 20:58:56 -0600
Subject: [PATCH 8/9] news item tweak

---
 NEWS.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/NEWS.md b/NEWS.md
index ddfe4ce129..a83e4ccaac 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -125,7 +125,7 @@
 
 15. `print(x, col.names='none')` now removes the column names as intended for wide `data.table`s whose column names don't fit on a single line, [#4270](https://github.com/Rdatatable/data.table/issues/4270). Thanks to @tdhock for the report, and Michael Chirico for fixing.
 
-16. `DT[, min(colB), by=colA]` when `colB` is type `character` would miss blank strings (`""`) if they are only present in that column at the top of the table and return the smallest non-blank incorrectly (instead of blank), [#4848](https://github.com/Rdatatable/data.table/issues/4848). Thanks to Vadim Khotilovich for reporting and for the PR fixing it.
+16. `DT[, min(colB), by=colA]` when `colB` is type `character` would miss blank strings (`""`) at the beginning of a group and return the smallest non-blank (instead of blank), [#4848](https://github.com/Rdatatable/data.table/issues/4848). Thanks to Vadim Khotilovich for reporting and for the PR fixing it.
 
 ## NOTES
 

From ad08153774bc5fbbc75b599d2565b238a079ea1b Mon Sep 17 00:00:00 2001
From: Matt Dowle <mattjdowle@gmail.com>
Date: Fri, 14 May 2021 21:03:05 -0600
Subject: [PATCH 9/9] news item tweak

---
 NEWS.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/NEWS.md b/NEWS.md
index a83e4ccaac..fd48fdcade 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -125,7 +125,7 @@
 
 15. `print(x, col.names='none')` now removes the column names as intended for wide `data.table`s whose column names don't fit on a single line, [#4270](https://github.com/Rdatatable/data.table/issues/4270). Thanks to @tdhock for the report, and Michael Chirico for fixing.
 
-16. `DT[, min(colB), by=colA]` when `colB` is type `character` would miss blank strings (`""`) at the beginning of a group and return the smallest non-blank (instead of blank), [#4848](https://github.com/Rdatatable/data.table/issues/4848). Thanks to Vadim Khotilovich for reporting and for the PR fixing it.
+16. `DT[, min(colB), by=colA]` when `colB` is type `character` would miss blank strings (`""`) at the beginning of a group and return the smallest non-blank instead of blank, [#4848](https://github.com/Rdatatable/data.table/issues/4848). Thanks to Vadim Khotilovich for reporting and for the PR fixing it.
 
 ## NOTES