From 6e855c70ada4c7fc86d90420915b99cb18b34351 Mon Sep 17 00:00:00 2001
From: shrektan <shrektan@126.com>
Date: Fri, 16 Mar 2018 02:01:37 +0800
Subject: [PATCH 01/11] prevent the utf8 string from being collected by the
 garbage collector in forder()

---
 src/forder.c | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/src/forder.c b/src/forder.c
index 470ac322bd..2659f2c8ed 100644
--- a/src/forder.c
+++ b/src/forder.c
@@ -866,7 +866,7 @@ static void csort(SEXP *x, int *o, int n)
   /* can't use otmp, since iradix might be called here and that uses otmp (and xtmp).
      alloc_csort_otmp(n) is called from forder for either n=nrow if 1st column,
      or n=maxgrpn if onwards columns */
-  for(i=0; i<n; i++) csort_otmp[i] = (x[i] == NA_STRING) ? NA_INTEGER : -TRUELENGTH(ENC2UTF8(x[i]));
+  for(i=0; i<n; i++) csort_otmp[i] = (x[i] == NA_STRING) ? NA_INTEGER : -TRUELENGTH(x[i]);
   if (nalast == 0 && n == 2) {                        // special case for nalast==0. n==1 is handled inside forder. at least 1 will be NA here
     if (o[0] == -1) for (i=0; i<n; i++) o[i] = i+1;    // else use o from caller directly (not 1st column)
     for (int i=0; i<n; i++) if (csort_otmp[i] == NA_INTEGER) o[i] = 0;
@@ -899,7 +899,7 @@ static void csort_pre(SEXP *x, int n)
   // savetl_init() is called once at the start of forder
   old_un = ustr_n;
   for(i=0; i<n; i++) {
-    s = ENC2UTF8(x[i]);
+    s = x[i];
     if (TRUELENGTH(s)<0) continue;   // this case first as it's the most frequent. Already in ustr, this negative is its ordering.
     if (TRUELENGTH(s)>0) {  // Save any of R's own usage of tl (assumed positive, so we can both count and save in one scan), to restore
       savetl(s);          // afterwards. From R 2.14.0, tl is initialized to 0, prior to that it was random so this step saved too much.
@@ -1087,8 +1087,8 @@ SEXP forder(SEXP DT, SEXP by, SEXP retGrp, SEXP sortStrArg, SEXP orderArg, SEXP
 {
   int i, j, k, grp, ngrp, tmp, *osub, thisgrpn, n, col;
   Rboolean isSorted = TRUE;
-  SEXP x, class;
-  void *xd;
+  SEXP x, ux, class;
+  void *xd, *uxd;
 #ifdef TIMING_ON
   memset(tblock, 0, NBLOCK*sizeof(clock_t));
   memset(nblock, 0, NBLOCK*sizeof(int));
@@ -1168,8 +1168,12 @@ SEXP forder(SEXP DT, SEXP by, SEXP retGrp, SEXP sortStrArg, SEXP orderArg, SEXP
     case REALSXP :
       dsort(xd, o, n); break;
     case STRSXP :
-      if (sortStr) { csort_pre(xd, n); alloc_csort_otmp(n); csort(xd, o, n); }
-      else cgroup(xd, o, n);
+      ux = PROTECT(allocVector(STRSXP, n));
+      for (int i=0; i<n; i++) SET_STRING_ELT(ux, i, ENC2UTF8(STRING_ELT(x, i)));
+      uxd = DATAPTR(ux);
+      if (sortStr) { csort_pre(uxd, n); alloc_csort_otmp(n); csort(uxd, o, n); }
+      else cgroup(uxd, o, n);
+      UNPROTECT(1);
       break;
     default :
       Error("Internal error: previous default should have caught unsupported type");

From c717a8b887a75c02a098654443731379fe0b9637 Mon Sep 17 00:00:00 2001
From: shrektan <shrektan@126.com>
Date: Fri, 16 Mar 2018 14:24:33 +0800
Subject: [PATCH 02/11] add tests and entries in NEWS.md

---
 NEWS.md               |  3 +++
 inst/tests/tests.Rraw | 25 +++++++++++++++++++++++++
 2 files changed, 28 insertions(+)

diff --git a/NEWS.md b/NEWS.md
index 47da1d2c8b..1f20b6f187 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -143,6 +143,9 @@ Thanks to @MichaelChirico for reporting and to @MarkusBonsch for the implementat
 Where there are duplicate column names (i.e. `suffixes = c("", "")`) `merge()` will throw a warning to match
 the behaviour of `base:::merge.data.frame()`. Thanks to @sritchie73 for reporting and fixing [PR#2631](https://github.com/Rdatatable/data.table/pull/2631). 
 
+35. Fixed a bug on Windows where `data.table` could break if garbage collection was triggered while sorting a large number of non-ASCII strings. Thanks to @shrektan for reporting and fixing [PR#2678](https://github.com/Rdatatable/data.table/pull/2678), [#2674](https://github.com/Rdatatable/data.table/issues/2674).
+
+
 #### NOTES
 
 0. The license has been changed from GPL to MPL (Mozilla Public License). All contributors were consulted and approved. [PR#2456](https://github.com/Rdatatable/data.table/pull/2456) details the reasons for the change.
diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw
index 93ea0f50cc..af3c0be267 100644
--- a/inst/tests/tests.Rraw
+++ b/inst/tests/tests.Rraw
@@ -11727,6 +11727,31 @@ test(1880.2, nrow(merge(parents, children, by.x="name", by.y="parent", suffixes=
              warning = "column names.*are duplicated in the result")
 
 
+# Ensure data.table won't break even if garbage collection gets triggered during sorting
+# a large numbers of non-ASCII characters.
+utf8_strings <- c(
+  '\u516c\u5141\u4ef7\u503c\u53d8\u52a8\u635f\u76ca',
+  '\u7ea2\u5229\u6536\u5165',
+  '\u4ef7\u5dee\u6536\u5165',
+  '\u5176\u4ed6\u4e1a\u52a1\u652f\u51fa',
+  '\u8d44\u4ea7\u51cf\u503c\u635f\u5931')
+if (identical(enc2utf8(enc2native(utf8_strings)), utf8_strings)) {
+  # The only meaningful environment for this test case is on a Simplified Chinese Language Windows Machine.
+  # Technically speaking, the native encoding should be an encoding that supports
+  # Simplified Chinese other that UTF-8.
+  native_strings <- enc2native(utf8_strings)
+  # 1e7 length character should be large enough to trigger the garbage collecting
+  DT <- data.table(x = rep(native_strings, 0.2e7), key = "x")
+  test(1881.1, unique(DT$x), sort(utf8_strings, method = "radix"))
+
+  # by, keyby should treat the string with different encoding as the same
+  mixed_strings <- c(utf8_strings, native_strings)
+  DT <- data.table(x = mixed_strings)
+  test(1881.2, DT[, .(CT = .N), keyby = x]$CT, rep(2L, 5))
+  test(1881.3, DT[, uniqueN(x)], 5L)
+}
+
+
 ###################################
 #  Add new tests above this line  #
 ###################################

From 803b5f2c9c4b2a11e804eff05cff6a8d8a970091 Mon Sep 17 00:00:00 2001
From: shrektan <shrektan@126.com>
Date: Fri, 16 Mar 2018 14:30:47 +0800
Subject: [PATCH 03/11] bump the test number

---
 inst/tests/tests.Rraw | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw
index 50fbbef78d..13a5d93900 100644
--- a/inst/tests/tests.Rraw
+++ b/inst/tests/tests.Rraw
@@ -11510,13 +11510,13 @@ if (identical(enc2utf8(enc2native(utf8_strings)), utf8_strings)) {
   native_strings <- enc2native(utf8_strings)
   # 1e7 length character should be large enough to trigger the garbage collecting
   DT <- data.table(x = rep(native_strings, 0.2e7), key = "x")
-  test(1881.1, unique(DT$x), sort(utf8_strings, method = "radix"))
+  test(1891.1, unique(DT$x), sort(utf8_strings, method = "radix"))
 
   # by, keyby should treat the string with different encoding as the same
   mixed_strings <- c(utf8_strings, native_strings)
   DT <- data.table(x = mixed_strings)
-  test(1881.2, DT[, .(CT = .N), keyby = x]$CT, rep(2L, 5))
-  test(1881.3, DT[, uniqueN(x)], 5L)
+  test(1891.2, DT[, .(CT = .N), keyby = x]$CT, rep(2L, 5))
+  test(1891.3, DT[, uniqueN(x)], 5L)
 }
 
 

From be9a6dec14dc29adacb106f5101afc81497f5420 Mon Sep 17 00:00:00 2001
From: shrektan <shrektan@126.com>
Date: Fri, 16 Mar 2018 16:00:37 +0800
Subject: [PATCH 04/11] mixed encoded chars should be compared under UTF8

---
 inst/tests/tests.Rraw |  5 +++++
 src/forder.c          | 23 ++++++++++++++++++-----
 2 files changed, 23 insertions(+), 5 deletions(-)

diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw
index 13a5d93900..b5c50139fc 100644
--- a/inst/tests/tests.Rraw
+++ b/inst/tests/tests.Rraw
@@ -11517,6 +11517,11 @@ if (identical(enc2utf8(enc2native(utf8_strings)), utf8_strings)) {
   DT <- data.table(x = mixed_strings)
   test(1891.2, DT[, .(CT = .N), keyby = x]$CT, rep(2L, 5))
   test(1891.3, DT[, uniqueN(x)], 5L)
+
+  DT <- data.table(x = mixed_strings, y = c(native_strings, utf8_strings), z = 1)
+  test(1881.4, nrow(DT[, .N, by = .(z, x, y)]), 5L)
+  test(1881.5, nrow(DT[, .N, by = .(y, x, z)]), 5L)
+  test(1881.6, nrow(DT[, .N, by = .(y, z, x)]), 5L)
 }
 
 
diff --git a/src/forder.c b/src/forder.c
index 7030b6ef6f..062736018b 100644
--- a/src/forder.c
+++ b/src/forder.c
@@ -1161,6 +1161,8 @@ SEXP forder(SEXP DT, SEXP by, SEXP retGrp, SEXP sortStrArg, SEXP orderArg, SEXP
   default :
     Error("First column being ordered is type '%s', not yet supported", type2char(TYPEOF(x)));
   }
+
+  int n_protect = 0;
   if (tmp) {                                  // -1 or 1. NEW: or -2 in case of nalast == 0 and all NAs
     if (tmp == 1) {                         // same as expected in 'order' (1 = increasing, -1 = decreasing)
       isSorted = TRUE;
@@ -1181,11 +1183,11 @@ SEXP forder(SEXP DT, SEXP by, SEXP retGrp, SEXP sortStrArg, SEXP orderArg, SEXP
       dsort(xd, o, n); break;
     case STRSXP :
       ux = PROTECT(allocVector(STRSXP, n));
+      n_protect++;
       for (int i=0; i<n; i++) SET_STRING_ELT(ux, i, ENC2UTF8(STRING_ELT(x, i)));
       uxd = DATAPTR(ux);
       if (sortStr) { csort_pre(uxd, n); alloc_csort_otmp(n); csort(uxd, o, n); }
       else cgroup(uxd, o, n);
-      UNPROTECT(1);
       break;
     default :
       Error("Internal error: previous default should have caught unsupported type");
@@ -1228,7 +1230,11 @@ SEXP forder(SEXP DT, SEXP by, SEXP retGrp, SEXP sortStrArg, SEXP orderArg, SEXP
       f = &dsorted; g = &dsort; break;
     case STRSXP :
       f = &csorted;
-      if (sortStr) { csort_pre(xd, n); alloc_csort_otmp(gsmax[1-flip]); g = &csort; }
+      ux = PROTECT(allocVector(STRSXP, n));
+      n_protect++;
+      for (int i=0; i<n; i++) SET_STRING_ELT(ux, i, ENC2UTF8(STRING_ELT(x, i)));
+      uxd = DATAPTR(ux);
+      if (sortStr) { csort_pre(uxd, n); alloc_csort_otmp(gsmax[1-flip]); g = &csort; }
       else g = &cgroup; // no increasing/decreasing order required if sortStr = FALSE, just a dummy argument
       break;
     default:
@@ -1261,10 +1267,16 @@ SEXP forder(SEXP DT, SEXP by, SEXP retGrp, SEXP sortStrArg, SEXP orderArg, SEXP
       osub = o+i;
       // ** TO DO **: if isSorted,  we can just point xsub into x directly. If (*f)() returns 0, though, will have to copy x at that point
       //        When doing this,  xsub could be allocated at that point for the first time.
-      if (size==4)
+      if (size==4) {
         for (j=0; j<thisgrpn; j++) ((int *)xsub)[j] = ((int *)xd)[o[i++]-1];
-      else
-        for (j=0; j<thisgrpn; j++) ((double *)xsub)[j] = ((double *)xd)[o[i++]-1];
+      } else {
+        if (TYPEOF(x) == STRSXP) {
+          for (j=0; j<thisgrpn; j++) ((double *)xsub)[j] = ((double *)uxd)[o[i++]-1];
+        } else {
+          for (j=0; j<thisgrpn; j++) ((double *)xsub)[j] = ((double *)xd)[o[i++]-1];
+        }
+      }
+
       TEND(2)
 
       // continue;  // BASELINE short circuit timing point. Up to here is the cost of creating xsub.
@@ -1315,6 +1327,7 @@ SEXP forder(SEXP DT, SEXP by, SEXP retGrp, SEXP sortStrArg, SEXP orderArg, SEXP
   maxlen = 1;  // reset global. Minimum needed to count "" and NA
   ustr_n = 0;
   savetl_end();
+  if (n_protect) UNPROTECT(n_protect); // Should be safe to remove after truelength being set back
   free(ustr);                ustr=NULL;          ustr_alloc=0;
 
   if (isSorted) {

From de3cc86a2c920a15b385dc26e6b467451e187fe8 Mon Sep 17 00:00:00 2001
From: Xianying Tan <shrektan@126.com>
Date: Sat, 17 Mar 2018 19:01:21 +0800
Subject: [PATCH 05/11] fix a typo on test number

---
 inst/tests/tests.Rraw | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw
index b5c50139fc..f144c1c924 100644
--- a/inst/tests/tests.Rraw
+++ b/inst/tests/tests.Rraw
@@ -11519,9 +11519,9 @@ if (identical(enc2utf8(enc2native(utf8_strings)), utf8_strings)) {
   test(1891.3, DT[, uniqueN(x)], 5L)
 
   DT <- data.table(x = mixed_strings, y = c(native_strings, utf8_strings), z = 1)
-  test(1881.4, nrow(DT[, .N, by = .(z, x, y)]), 5L)
-  test(1881.5, nrow(DT[, .N, by = .(y, x, z)]), 5L)
-  test(1881.6, nrow(DT[, .N, by = .(y, z, x)]), 5L)
+  test(1891.4, nrow(DT[, .N, by = .(z, x, y)]), 5L)
+  test(1891.5, nrow(DT[, .N, by = .(y, x, z)]), 5L)
+  test(1891.6, nrow(DT[, .N, by = .(y, z, x)]), 5L)
 }
 
 

From 84ca0b06d1414cfea00ad88ecb2ad162082f7ad7 Mon Sep 17 00:00:00 2001
From: shrektan <shrektan@126.com>
Date: Sat, 17 Mar 2018 22:25:57 +0800
Subject: [PATCH 06/11] reduce overheads for all ascii/utf8 columns

---
 src/data.table.h |  4 +++-
 src/forder.c     | 24 ++++++++++++++++++------
 2 files changed, 21 insertions(+), 7 deletions(-)

diff --git a/src/data.table.h b/src/data.table.h
index 3bdcc38fa5..0330b292a2 100644
--- a/src/data.table.h
+++ b/src/data.table.h
@@ -47,7 +47,8 @@ typedef R_xlen_t RLEN;
 // This IS_ASCII will dereference s and that cache fetch is the part that may bite more than the branch, though. Without a call to
 // to ENC2UTF as all, the pointer value can just be compared by the calling code without deferencing it. It may still be worth
 // timing the impact and manually avoiding (is there an IS_ASCII on the character vector rather than testing each item every time?)
-#define ENC2UTF8(s) ((IS_ASCII(s) || (s)==NA_STRING || IS_UTF8(s)) ? (s) : mkCharCE(translateCharUTF8(s), CE_UTF8))
+#define NEED2UTF8(s) !(IS_ASCII(s) || (s)==NA_STRING || IS_UTF8(s))
+#define ENC2UTF8(s) (!NEED2UTF8(s) ? (s) : mkCharCE(translateCharUTF8(s), CE_UTF8))
 
 // init.c
 void setSizes();
@@ -89,6 +90,7 @@ unsigned long long dtwiddle(void *p, int i, int order);
 unsigned long long i64twiddle(void *p, int i, int order);
 unsigned long long (*twiddle)(void *, int, int);
 SEXP forder(SEXP DT, SEXP by, SEXP retGrp, SEXP sortStrArg, SEXP orderArg, SEXP naArg);
+bool need2utf8(SEXP x, int n);
 
 // reorder.c
 SEXP reorder(SEXP x, SEXP order);
diff --git a/src/forder.c b/src/forder.c
index 062736018b..d768a0842d 100644
--- a/src/forder.c
+++ b/src/forder.c
@@ -1082,6 +1082,12 @@ static void dsort(double *x, int *o, int n)
   }
 }
 
+bool need2utf8(SEXP x, int n)
+{
+  for (int i=0; i<n; i++) if (NEED2UTF8(STRING_ELT(x, i))) return(true);
+  return(false);
+}
+
 SEXP forder(SEXP DT, SEXP by, SEXP retGrp, SEXP sortStrArg, SEXP orderArg, SEXP naArg)
 // sortStr TRUE from setkey, FALSE from by=
 {
@@ -1182,9 +1188,12 @@ SEXP forder(SEXP DT, SEXP by, SEXP retGrp, SEXP sortStrArg, SEXP orderArg, SEXP
     case REALSXP :
       dsort(xd, o, n); break;
     case STRSXP :
-      ux = PROTECT(allocVector(STRSXP, n));
-      n_protect++;
-      for (int i=0; i<n; i++) SET_STRING_ELT(ux, i, ENC2UTF8(STRING_ELT(x, i)));
+      if (need2utf8(x, n)) {
+        ux = PROTECT(allocVector(STRSXP, n)); n_protect++; 
+        for (int i=0; i<n; i++) SET_STRING_ELT(ux, i, ENC2UTF8(STRING_ELT(x, i))); 
+      } else {
+        ux = x;
+      }
       uxd = DATAPTR(ux);
       if (sortStr) { csort_pre(uxd, n); alloc_csort_otmp(n); csort(uxd, o, n); }
       else cgroup(uxd, o, n);
@@ -1230,9 +1239,12 @@ SEXP forder(SEXP DT, SEXP by, SEXP retGrp, SEXP sortStrArg, SEXP orderArg, SEXP
       f = &dsorted; g = &dsort; break;
     case STRSXP :
       f = &csorted;
-      ux = PROTECT(allocVector(STRSXP, n));
-      n_protect++;
-      for (int i=0; i<n; i++) SET_STRING_ELT(ux, i, ENC2UTF8(STRING_ELT(x, i)));
+      if (need2utf8(x, n)) {
+        ux = PROTECT(allocVector(STRSXP, n)); n_protect++; 
+        for (int i=0; i<n; i++) SET_STRING_ELT(ux, i, ENC2UTF8(STRING_ELT(x, i))); 
+      } else {
+        ux = x;
+      }
       uxd = DATAPTR(ux);
       if (sortStr) { csort_pre(uxd, n); alloc_csort_otmp(gsmax[1-flip]); g = &csort; }
       else g = &cgroup; // no increasing/decreasing order required if sortStr = FALSE, just a dummy argument

From 3f36a4b3d1c9dfc23c4b8912b6cb571f61a010fa Mon Sep 17 00:00:00 2001
From: shrektan <shrektan@126.com>
Date: Tue, 20 Mar 2018 09:13:11 +0800
Subject: [PATCH 07/11] silence the gcc warning that uxd may be used before
 initialization

---
 src/forder.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/forder.c b/src/forder.c
index d768a0842d..90ff613811 100644
--- a/src/forder.c
+++ b/src/forder.c
@@ -1144,7 +1144,7 @@ SEXP forder(SEXP DT, SEXP by, SEXP retGrp, SEXP sortStrArg, SEXP orderArg, SEXP
   int *o = INTEGER(ans);                      // TO DO: save allocation if NULL is returned (isSorted==TRUE)
   o[0] = -1;                                  // so [i|c|d]sort know they can populate o directly with no working memory needed to reorder existing order
                                               // using -1 rather than 0 because 'nalast = 0' replaces 'o[.]' with 0 values.
-  xd = DATAPTR(x);
+  xd = DATAPTR(x); uxd = xd;
   stackgrps = length(by)>1 || LOGICAL(retGrp)[0];
   savetl_init();   // from now on use Error not error.
 

From 8e04d53496432f66c1f1655e1aa0ab1d8f01c70a Mon Sep 17 00:00:00 2001
From: shrektan <shrektan@126.com>
Date: Fri, 30 Mar 2018 21:33:24 +0800
Subject: [PATCH 08/11] use latin1 encoding example so that it can be tested on
 a linux machine

---
 inst/tests/tests.Rraw | 46 +++++++++++++++++--------------------------
 1 file changed, 18 insertions(+), 28 deletions(-)

diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw
index abf4e0b014..fcd7c28cb2 100644
--- a/inst/tests/tests.Rraw
+++ b/inst/tests/tests.Rraw
@@ -11546,34 +11546,24 @@ test(1894.12, DT[, sum(y)*..z], error="..z in j is looking for z in calling scop
 
 test(1895, getDTthreads(verbose=TRUE), output="omp_get_max_threads.*omp_get_thread_limit.*DTthreads")
 
-# Ensure data.table won't break even if garbage collection gets triggered during sorting
-# a large numbers of non-ASCII characters.
-utf8_strings <- c(
-  '\u516c\u5141\u4ef7\u503c\u53d8\u52a8\u635f\u76ca',
-  '\u7ea2\u5229\u6536\u5165',
-  '\u4ef7\u5dee\u6536\u5165',
-  '\u5176\u4ed6\u4e1a\u52a1\u652f\u51fa',
-  '\u8d44\u4ea7\u51cf\u503c\u635f\u5931')
-if (identical(enc2utf8(enc2native(utf8_strings)), utf8_strings)) {
-  # The only meaningful environment for this test case is on a Simplified Chinese Language Windows Machine.
-  # Technically speaking, the native encoding should be an encoding that supports
-  # Simplified Chinese other that UTF-8.
-  native_strings <- enc2native(utf8_strings)
-  # 1e7 length character should be large enough to trigger the garbage collecting
-  DT <- data.table(x = rep(native_strings, 0.2e7), key = "x")
-  test(1896.1, unique(DT$x), sort(utf8_strings, method = "radix"))
-
-  # by, keyby should treat the string with different encoding as the same
-  mixed_strings <- c(utf8_strings, native_strings)
-  DT <- data.table(x = mixed_strings)
-  test(1896.2, DT[, .(CT = .N), keyby = x]$CT, rep(2L, 5))
-  test(1896.3, DT[, uniqueN(x)], 5L)
-
-  DT <- data.table(x = mixed_strings, y = c(native_strings, utf8_strings), z = 1)
-  test(1896.4, nrow(DT[, .N, by = .(z, x, y)]), 5L)
-  test(1896.5, nrow(DT[, .N, by = .(y, x, z)]), 5L)
-  test(1896.6, nrow(DT[, .N, by = .(y, z, x)]), 5L)
-}
+utf8_strings <- c("\u00e7ile", "fa\u00e7ile", "El. pa\u00c5\u00a1tas", "\u00a1tas", "\u00de")
+latin1_strings <- iconv(utf8_strings, from = "UTF-8", to = "latin1")
+# 1e7 length character should be large enough to trigger the garbage collecting
+DT <- data.table(x = rep(latin1_strings, 0.2e7), key = "x")
+test(1896.1, enc2utf8(unique(DT$x)), sort(utf8_strings, method = "radix"))
+
+# by, keyby should treat the string with different encoding as the same
+mixed_strings <- c(utf8_strings, latin1_strings)
+DT <- data.table(x = mixed_strings)
+test(1896.2, DT[, .(CT = .N), keyby = x]$CT, rep(2L, 5))
+test(1896.3, DT[, uniqueN(x)], 5L)
+
+DT <- data.table(x = mixed_strings, y = c(latin1_strings, utf8_strings), z = 1)
+test(1896.4, nrow(DT[, .N, by = .(z, x, y)]), 5L)
+test(1896.5, nrow(DT[, .N, by = .(y, x, z)]), 5L)
+test(1896.6, nrow(DT[, .N, by = .(y, z, x)]), 5L)
+
+
 
 ###################################
 #  Add new tests above this line  #

From 886fa68cf3c29c05dcf06679f128d9b1b1306714 Mon Sep 17 00:00:00 2001
From: shrektan <shrektan@126.com>
Date: Fri, 30 Mar 2018 23:39:10 +0800
Subject: [PATCH 09/11] fix the encoding related error on the i386 version of R

---
 src/forder.c | 12 ++++--------
 1 file changed, 4 insertions(+), 8 deletions(-)

diff --git a/src/forder.c b/src/forder.c
index 90ff613811..1867db00dc 100644
--- a/src/forder.c
+++ b/src/forder.c
@@ -1144,7 +1144,7 @@ SEXP forder(SEXP DT, SEXP by, SEXP retGrp, SEXP sortStrArg, SEXP orderArg, SEXP
   int *o = INTEGER(ans);                      // TO DO: save allocation if NULL is returned (isSorted==TRUE)
   o[0] = -1;                                  // so [i|c|d]sort know they can populate o directly with no working memory needed to reorder existing order
                                               // using -1 rather than 0 because 'nalast = 0' replaces 'o[.]' with 0 values.
-  xd = DATAPTR(x); uxd = xd;
+  xd = DATAPTR(x); uxd = xd; // init uxd with the same value as xd. the only case that they will differ is the string needs to be re-encoded as UTF8
   stackgrps = length(by)>1 || LOGICAL(retGrp)[0];
   savetl_init();   // from now on use Error not error.
 
@@ -1218,7 +1218,7 @@ SEXP forder(SEXP DT, SEXP by, SEXP retGrp, SEXP sortStrArg, SEXP orderArg, SEXP
 
   for (col=2; col<=length(by); col++) {
     x = VECTOR_ELT(DT,INTEGER(by)[col-1]-1);
-    xd = DATAPTR(x);
+    xd = DATAPTR(x); uxd = xd; // init uxd with the same value as xd. the only case that they will differ is the string needs to be re-encoded as UTF8
     ngrp = gsngrp[flip];
     if (ngrp == n && nalast != 0) break;
     flipflop();
@@ -1280,13 +1280,9 @@ SEXP forder(SEXP DT, SEXP by, SEXP retGrp, SEXP sortStrArg, SEXP orderArg, SEXP
       // ** TO DO **: if isSorted,  we can just point xsub into x directly. If (*f)() returns 0, though, will have to copy x at that point
       //        When doing this,  xsub could be allocated at that point for the first time.
       if (size==4) {
-        for (j=0; j<thisgrpn; j++) ((int *)xsub)[j] = ((int *)xd)[o[i++]-1];
+        for (j=0; j<thisgrpn; j++) ((int *)xsub)[j] = ((int *)uxd)[o[i++]-1];
       } else {
-        if (TYPEOF(x) == STRSXP) {
-          for (j=0; j<thisgrpn; j++) ((double *)xsub)[j] = ((double *)uxd)[o[i++]-1];
-        } else {
-          for (j=0; j<thisgrpn; j++) ((double *)xsub)[j] = ((double *)xd)[o[i++]-1];
-        }
+        for (j=0; j<thisgrpn; j++) ((double *)xsub)[j] = ((double *)uxd)[o[i++]-1];
       }
 
       TEND(2)

From ecf92e9ac3f34fa3e884c6e347a8624ac2e3b314 Mon Sep 17 00:00:00 2001
From: Matt Dowle <mattjdowle@gmail.com>
Date: Fri, 30 Mar 2018 13:46:21 -0700
Subject: [PATCH 10/11] Minor simplification.

---
 src/forder.c | 44 ++++++++++++++++++++------------------------
 1 file changed, 20 insertions(+), 24 deletions(-)

diff --git a/src/forder.c b/src/forder.c
index 1867db00dc..8ad06bbaa6 100644
--- a/src/forder.c
+++ b/src/forder.c
@@ -1093,8 +1093,8 @@ SEXP forder(SEXP DT, SEXP by, SEXP retGrp, SEXP sortStrArg, SEXP orderArg, SEXP
 {
   int i, j, k, grp, ngrp, tmp, *osub, thisgrpn, n, col;
   Rboolean isSorted = TRUE;
-  SEXP x, ux, class;
-  void *xd, *uxd;
+  SEXP x, class;
+  void *xd;
 #ifdef TIMING_ON
   memset(tblock, 0, NBLOCK*sizeof(clock_t));
   memset(nblock, 0, NBLOCK*sizeof(int));
@@ -1141,10 +1141,11 @@ SEXP forder(SEXP DT, SEXP by, SEXP retGrp, SEXP sortStrArg, SEXP orderArg, SEXP
   // if n==1, the code is left to proceed below in case one or more of the 1-row by= columns are NA and na.last=NA. Otherwise it would be easy to return now.
 
   SEXP ans = PROTECT(allocVector(INTSXP, n)); // once for the result, needs to be length n.
+  int n_protect = 1;
   int *o = INTEGER(ans);                      // TO DO: save allocation if NULL is returned (isSorted==TRUE)
   o[0] = -1;                                  // so [i|c|d]sort know they can populate o directly with no working memory needed to reorder existing order
                                               // using -1 rather than 0 because 'nalast = 0' replaces 'o[.]' with 0 values.
-  xd = DATAPTR(x); uxd = xd; // init uxd with the same value as xd. the only case that they will differ is the string needs to be re-encoded as UTF8
+  xd = DATAPTR(x);
   stackgrps = length(by)>1 || LOGICAL(retGrp)[0];
   savetl_init();   // from now on use Error not error.
 
@@ -1168,7 +1169,6 @@ SEXP forder(SEXP DT, SEXP by, SEXP retGrp, SEXP sortStrArg, SEXP orderArg, SEXP
     Error("First column being ordered is type '%s', not yet supported", type2char(TYPEOF(x)));
   }
 
-  int n_protect = 0;
   if (tmp) {                                  // -1 or 1. NEW: or -2 in case of nalast == 0 and all NAs
     if (tmp == 1) {                         // same as expected in 'order' (1 = increasing, -1 = decreasing)
       isSorted = TRUE;
@@ -1189,14 +1189,12 @@ SEXP forder(SEXP DT, SEXP by, SEXP retGrp, SEXP sortStrArg, SEXP orderArg, SEXP
       dsort(xd, o, n); break;
     case STRSXP :
       if (need2utf8(x, n)) {
-        ux = PROTECT(allocVector(STRSXP, n)); n_protect++; 
-        for (int i=0; i<n; i++) SET_STRING_ELT(ux, i, ENC2UTF8(STRING_ELT(x, i))); 
-      } else {
-        ux = x;
+        SEXP tt = PROTECT(allocVector(STRSXP, n)); n_protect++;
+        for (int i=0; i<n; i++) SET_STRING_ELT(tt, i, ENC2UTF8(STRING_ELT(x, i)));
+        xd = DATAPTR(tt);
       }
-      uxd = DATAPTR(ux);
-      if (sortStr) { csort_pre(uxd, n); alloc_csort_otmp(n); csort(uxd, o, n); }
-      else cgroup(uxd, o, n);
+      if (sortStr) { csort_pre(xd, n); alloc_csort_otmp(n); csort(xd, o, n); }
+      else cgroup(xd, o, n);
       break;
     default :
       Error("Internal error: previous default should have caught unsupported type");
@@ -1218,7 +1216,7 @@ SEXP forder(SEXP DT, SEXP by, SEXP retGrp, SEXP sortStrArg, SEXP orderArg, SEXP
 
   for (col=2; col<=length(by); col++) {
     x = VECTOR_ELT(DT,INTEGER(by)[col-1]-1);
-    xd = DATAPTR(x); uxd = xd; // init uxd with the same value as xd. the only case that they will differ is the string needs to be re-encoded as UTF8
+    xd = DATAPTR(x);
     ngrp = gsngrp[flip];
     if (ngrp == n && nalast != 0) break;
     flipflop();
@@ -1240,13 +1238,11 @@ SEXP forder(SEXP DT, SEXP by, SEXP retGrp, SEXP sortStrArg, SEXP orderArg, SEXP
     case STRSXP :
       f = &csorted;
       if (need2utf8(x, n)) {
-        ux = PROTECT(allocVector(STRSXP, n)); n_protect++; 
-        for (int i=0; i<n; i++) SET_STRING_ELT(ux, i, ENC2UTF8(STRING_ELT(x, i))); 
-      } else {
-        ux = x;
+        SEXP tt = PROTECT(allocVector(STRSXP, n)); n_protect++;
+        for (int i=0; i<n; i++) SET_STRING_ELT(tt, i, ENC2UTF8(STRING_ELT(x, i)));
+        xd = DATAPTR(tt);
       }
-      uxd = DATAPTR(ux);
-      if (sortStr) { csort_pre(uxd, n); alloc_csort_otmp(gsmax[1-flip]); g = &csort; }
+      if (sortStr) { csort_pre(xd, n); alloc_csort_otmp(gsmax[1-flip]); g = &csort; }
       else g = &cgroup; // no increasing/decreasing order required if sortStr = FALSE, just a dummy argument
       break;
     default:
@@ -1280,9 +1276,9 @@ SEXP forder(SEXP DT, SEXP by, SEXP retGrp, SEXP sortStrArg, SEXP orderArg, SEXP
       // ** TO DO **: if isSorted,  we can just point xsub into x directly. If (*f)() returns 0, though, will have to copy x at that point
       //        When doing this,  xsub could be allocated at that point for the first time.
       if (size==4) {
-        for (j=0; j<thisgrpn; j++) ((int *)xsub)[j] = ((int *)uxd)[o[i++]-1];
+        for (j=0; j<thisgrpn; j++) ((int *)xsub)[j] = ((int *)xd)[o[i++]-1];
       } else {
-        for (j=0; j<thisgrpn; j++) ((double *)xsub)[j] = ((double *)uxd)[o[i++]-1];
+        for (j=0; j<thisgrpn; j++) ((double *)xsub)[j] = ((double *)xd)[o[i++]-1];
       }
 
       TEND(2)
@@ -1335,12 +1331,12 @@ SEXP forder(SEXP DT, SEXP by, SEXP retGrp, SEXP sortStrArg, SEXP orderArg, SEXP
   maxlen = 1;  // reset global. Minimum needed to count "" and NA
   ustr_n = 0;
   savetl_end();
-  if (n_protect) UNPROTECT(n_protect); // Should be safe to remove after truelength being set back
   free(ustr);                ustr=NULL;          ustr_alloc=0;
 
   if (isSorted) {
-    UNPROTECT(1);  // The existing o vector, which we may save in future, if in future we only create when isSorted becomes FALSE
+    // the o vector created earlier could be avoided in this case if we only create it when isSorted becomes FALSE
     ans = PROTECT(allocVector(INTSXP, 0));  // Can't attach attributes to NULL
+    n_protect++;
   }
   if (LOGICAL(retGrp)[0]) {
     ngrp = gsngrp[flip];
@@ -1366,8 +1362,8 @@ SEXP forder(SEXP DT, SEXP by, SEXP retGrp, SEXP sortStrArg, SEXP orderArg, SEXP
   free(cradix_counts);       cradix_counts=NULL; cradix_counts_alloc=0;
   free(cradix_xtmp);         cradix_xtmp=NULL;   cradix_xtmp_alloc=0;   // TO DO: use xtmp already got
 
-  UNPROTECT(1);
-  return( ans );
+  UNPROTECT(n_protect);
+  return ans;
 }
 
 // TODO: implement 'order' argument to 'fsorted'

From 1ca89ba788cc99d15cd06ebd861ddeb3a9d138bc Mon Sep 17 00:00:00 2001
From: Matt Dowle <mattjdowle@gmail.com>
Date: Fri, 30 Mar 2018 14:12:33 -0700
Subject: [PATCH 11/11] Reduced size of new test 1896 from 60s down to under 1s
 for CRAN. Added reminder to benchmark.Rraw.

---
 inst/tests/benchmark.Rraw |  2 ++
 inst/tests/tests.Rraw     | 14 +++++++-------
 2 files changed, 9 insertions(+), 7 deletions(-)

diff --git a/inst/tests/benchmark.Rraw b/inst/tests/benchmark.Rraw
index 16b7c1efa2..d37dd24252 100644
--- a/inst/tests/benchmark.Rraw
+++ b/inst/tests/benchmark.Rraw
@@ -166,3 +166,5 @@ test(1742.3, L[[1L]], c(27L,38L))
 test(1742.4, L[[1000000L]], c(76L, 40L))
 test(1742.5, substring(x,nchar(x)-10,nchar(x)), c("50,28,95,76","62,87,23,40"))
 
+# TODO: add a scaled-up version of the non-ASCII forder test 1896 (see tests.Rraw) here
+
diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw
index fcd7c28cb2..4a256381b2 100644
--- a/inst/tests/tests.Rraw
+++ b/inst/tests/tests.Rraw
@@ -11546,19 +11546,19 @@ test(1894.12, DT[, sum(y)*..z], error="..z in j is looking for z in calling scop
 
 test(1895, getDTthreads(verbose=TRUE), output="omp_get_max_threads.*omp_get_thread_limit.*DTthreads")
 
-utf8_strings <- c("\u00e7ile", "fa\u00e7ile", "El. pa\u00c5\u00a1tas", "\u00a1tas", "\u00de")
-latin1_strings <- iconv(utf8_strings, from = "UTF-8", to = "latin1")
-# 1e7 length character should be large enough to trigger the garbage collecting
-DT <- data.table(x = rep(latin1_strings, 0.2e7), key = "x")
+# Non-ASCII strings: ENC2UTF8 results were not PROTECTed in forder(); issue #2674
+utf8_strings = c("\u00e7ile", "fa\u00e7ile", "El. pa\u00c5\u00a1tas", "\u00a1tas", "\u00de")
+latin1_strings = iconv(utf8_strings, from = "UTF-8", to = "latin1")
+DT = data.table(x = sample(latin1_strings, 1000, replace=TRUE), key = "x")
 test(1896.1, enc2utf8(unique(DT$x)), sort(utf8_strings, method = "radix"))
 
 # by, keyby should treat the string with different encoding as the same
-mixed_strings <- c(utf8_strings, latin1_strings)
-DT <- data.table(x = mixed_strings)
+mixed_strings = c(utf8_strings, latin1_strings)
+DT = data.table(x = mixed_strings)
 test(1896.2, DT[, .(CT = .N), keyby = x]$CT, rep(2L, 5))
 test(1896.3, DT[, uniqueN(x)], 5L)
 
-DT <- data.table(x = mixed_strings, y = c(latin1_strings, utf8_strings), z = 1)
+DT = data.table(x = mixed_strings, y = c(latin1_strings, utf8_strings), z = 1)
 test(1896.4, nrow(DT[, .N, by = .(z, x, y)]), 5L)
 test(1896.5, nrow(DT[, .N, by = .(y, x, z)]), 5L)
 test(1896.6, nrow(DT[, .N, by = .(y, z, x)]), 5L)