Rdatatable · mattdowle · Mar 30, 2018 · Mar 15, 2018 · Mar 16, 2018 · Mar 16, 2018
@@ -195,6 +195,8 @@ Thanks to @sritchie73 for reporting and fixing [PR#2631](https://github.com/Rdat
 
 37. `getDTthreads()` could return INT_MAX (2 billion) after an explicit call to `setDTthreads(0)`, [PR#2708](https://github.com/Rdatatable/data.table/pull/2708).
 
+38. Fixed a bug on Windows that `data.table` may break if the garbage collecting was triggered when sorting a large number of non-ASCII characters. Thanks to @shrektan for reporting and fixing [PR#2678](https://github.com/Rdatatable/data.table/pull/2678),  [#2674](https://github.com/Rdatatable/data.table/issues/2674).
+
 #### NOTES
 
 0. The license has been changed from GPL to MPL (Mozilla Public License). All contributors were consulted and approved. [PR#2456](https://github.com/Rdatatable/data.table/pull/2456) details the reasons for the change.

@@ -166,3 +166,5 @@ test(1742.3, L[[1L]], c(27L,38L))
 test(1742.4, L[[1000000L]], c(76L, 40L))
 test(1742.5, substring(x,nchar(x)-10,nchar(x)), c("50,28,95,76","62,87,23,40"))
 
+# Add scaled-up non-ASCII forder test 1896
+
@@ -11546,6 +11546,25 @@ test(1894.12, DT[, sum(y)*..z], error="..z in j is looking for z in calling scop
 
 test(1895, getDTthreads(verbose=TRUE), output="omp_get_max_threads.*omp_get_thread_limit.*DTthreads")
 
+# Non ascii missing protects on ENC2UTF8; issue #2674
+utf8_strings = c("\u00e7ile", "fa\u00e7ile", "El. pa\u00c5\u00a1tas", "\u00a1tas", "\u00de")
+latin1_strings = iconv(utf8_strings, from = "UTF-8", to = "latin1")
+DT = data.table(x = sample(latin1_strings, 1000, replace=TRUE), key = "x")
+test(1896.1, enc2utf8(unique(DT$x)), sort(utf8_strings, method = "radix"))
+
+# by, keyby should treat the string with different encoding as the same
+mixed_strings = c(utf8_strings, latin1_strings)
+DT = data.table(x = mixed_strings)
+test(1896.2, DT[, .(CT = .N), keyby = x]$CT, rep(2L, 5))
+test(1896.3, DT[, uniqueN(x)], 5L)
+
+DT = data.table(x = mixed_strings, y = c(latin1_strings, utf8_strings), z = 1)
+test(1896.4, nrow(DT[, .N, by = .(z, x, y)]), 5L)
+test(1896.5, nrow(DT[, .N, by = .(y, x, z)]), 5L)
+test(1896.6, nrow(DT[, .N, by = .(y, z, x)]), 5L)
+
+
+
 ###################################
 #  Add new tests above this line  #
 ###################################

@@ -47,7 +47,8 @@ typedef R_xlen_t RLEN;
 // This IS_ASCII will dereference s and that cache fetch is the part that may bite more than the branch, though. Without a call to
 // to ENC2UTF as all, the pointer value can just be compared by the calling code without deferencing it. It may still be worth
 // timing the impact and manually avoiding (is there an IS_ASCII on the character vector rather than testing each item every time?)
-#define ENC2UTF8(s) ((IS_ASCII(s) || (s)==NA_STRING || IS_UTF8(s)) ? (s) : mkCharCE(translateCharUTF8(s), CE_UTF8))
+#define NEED2UTF8(s) !(IS_ASCII(s) || (s)==NA_STRING || IS_UTF8(s))
+#define ENC2UTF8(s) (!NEED2UTF8(s) ? (s) : mkCharCE(translateCharUTF8(s), CE_UTF8))
 
 // init.c
 void setSizes();
@@ -89,6 +90,7 @@ unsigned long long dtwiddle(void *p, int i, int order);
 unsigned long long i64twiddle(void *p, int i, int order);
 unsigned long long (*twiddle)(void *, int, int);
 SEXP forder(SEXP DT, SEXP by, SEXP retGrp, SEXP sortStrArg, SEXP orderArg, SEXP naArg);
+bool need2utf8(SEXP x, int n);
 
 // reorder.c
 SEXP reorder(SEXP x, SEXP order);

@@ -866,7 +866,7 @@ static void csort(SEXP *x, int *o, int n)
   /* can't use otmp, since iradix might be called here and that uses otmp (and xtmp).
      alloc_csort_otmp(n) is called from forder for either n=nrow if 1st column,
      or n=maxgrpn if onwards columns */
-  for(i=0; i<n; i++) csort_otmp[i] = (x[i] == NA_STRING) ? NA_INTEGER : -TRUELENGTH(ENC2UTF8(x[i]));
+  for(i=0; i<n; i++) csort_otmp[i] = (x[i] == NA_STRING) ? NA_INTEGER : -TRUELENGTH(x[i]);
   if (nalast == 0 && n == 2) {                        // special case for nalast==0. n==1 is handled inside forder. at least 1 will be NA here
     if (o[0] == -1) for (i=0; i<n; i++) o[i] = i+1;    // else use o from caller directly (not 1st column)
     for (int i=0; i<n; i++) if (csort_otmp[i] == NA_INTEGER) o[i] = 0;
@@ -899,7 +899,7 @@ static void csort_pre(SEXP *x, int n)
   // savetl_init() is called once at the start of forder
   old_un = ustr_n;
   for(i=0; i<n; i++) {
-    s = ENC2UTF8(x[i]);
+    s = x[i];
     if (TRUELENGTH(s)<0) continue;   // this case first as it's the most frequent. Already in ustr, this negative is its ordering.
     if (TRUELENGTH(s)>0) {  // Save any of R's own usage of tl (assumed positive, so we can both count and save in one scan), to restore
       savetl(s);          // afterwards. From R 2.14.0, tl is initialized to 0, prior to that it was random so this step saved too much.
@@ -1082,6 +1082,12 @@ static void dsort(double *x, int *o, int n)
   }
 }
 
+bool need2utf8(SEXP x, int n)
+{
+  for (int i=0; i<n; i++) if (NEED2UTF8(STRING_ELT(x, i))) return(true);
+  return(false);
+}
+
 SEXP forder(SEXP DT, SEXP by, SEXP retGrp, SEXP sortStrArg, SEXP orderArg, SEXP naArg)
 // sortStr TRUE from setkey, FALSE from by=
 {
@@ -1135,6 +1141,7 @@ SEXP forder(SEXP DT, SEXP by, SEXP retGrp, SEXP sortStrArg, SEXP orderArg, SEXP
   // if n==1, the code is left to proceed below in case one or more of the 1-row by= columns are NA and na.last=NA. Otherwise it would be easy to return now.
 
   SEXP ans = PROTECT(allocVector(INTSXP, n)); // once for the result, needs to be length n.
+  int n_protect = 1;
   int *o = INTEGER(ans);                      // TO DO: save allocation if NULL is returned (isSorted==TRUE)
   o[0] = -1;                                  // so [i|c|d]sort know they can populate o directly with no working memory needed to reorder existing order
                                               // using -1 rather than 0 because 'nalast = 0' replaces 'o[.]' with 0 values.
@@ -1161,6 +1168,7 @@ SEXP forder(SEXP DT, SEXP by, SEXP retGrp, SEXP sortStrArg, SEXP orderArg, SEXP
   default :
     Error("First column being ordered is type '%s', not yet supported", type2char(TYPEOF(x)));
   }
+
   if (tmp) {                                  // -1 or 1. NEW: or -2 in case of nalast == 0 and all NAs
     if (tmp == 1) {                         // same as expected in 'order' (1 = increasing, -1 = decreasing)
       isSorted = TRUE;
@@ -1180,6 +1188,11 @@ SEXP forder(SEXP DT, SEXP by, SEXP retGrp, SEXP sortStrArg, SEXP orderArg, SEXP
     case REALSXP :
       dsort(xd, o, n); break;
     case STRSXP :
+      if (need2utf8(x, n)) {
+        SEXP tt = PROTECT(allocVector(STRSXP, n)); n_protect++;
+        for (int i=0; i<n; i++) SET_STRING_ELT(tt, i, ENC2UTF8(STRING_ELT(x, i)));
+        xd = DATAPTR(tt);
+      }
       if (sortStr) { csort_pre(xd, n); alloc_csort_otmp(n); csort(xd, o, n); }
       else cgroup(xd, o, n);
       break;
@@ -1224,6 +1237,11 @@ SEXP forder(SEXP DT, SEXP by, SEXP retGrp, SEXP sortStrArg, SEXP orderArg, SEXP
       f = &dsorted; g = &dsort; break;
     case STRSXP :
       f = &csorted;
+      if (need2utf8(x, n)) {
+        SEXP tt = PROTECT(allocVector(STRSXP, n)); n_protect++;
+        for (int i=0; i<n; i++) SET_STRING_ELT(tt, i, ENC2UTF8(STRING_ELT(x, i)));
+        xd = DATAPTR(tt);
+      }
       if (sortStr) { csort_pre(xd, n); alloc_csort_otmp(gsmax[1-flip]); g = &csort; }
       else g = &cgroup; // no increasing/decreasing order required if sortStr = FALSE, just a dummy argument
       break;
@@ -1257,10 +1275,12 @@ SEXP forder(SEXP DT, SEXP by, SEXP retGrp, SEXP sortStrArg, SEXP orderArg, SEXP
       osub = o+i;
       // ** TO DO **: if isSorted,  we can just point xsub into x directly. If (*f)() returns 0, though, will have to copy x at that point
       //        When doing this,  xsub could be allocated at that point for the first time.
-      if (size==4)
+      if (size==4) {
         for (j=0; j<thisgrpn; j++) ((int *)xsub)[j] = ((int *)xd)[o[i++]-1];
-      else
+      } else {
         for (j=0; j<thisgrpn; j++) ((double *)xsub)[j] = ((double *)xd)[o[i++]-1];
+      }
+
       TEND(2)
 
       // continue;  // BASELINE short circuit timing point. Up to here is the cost of creating xsub.
@@ -1314,8 +1334,9 @@ SEXP forder(SEXP DT, SEXP by, SEXP retGrp, SEXP sortStrArg, SEXP orderArg, SEXP
   free(ustr);                ustr=NULL;          ustr_alloc=0;
 
   if (isSorted) {
-    UNPROTECT(1);  // The existing o vector, which we may save in future, if in future we only create when isSorted becomes FALSE
+    // the o vector created earlier could be avoided in this case if we only create it when isSorted becomes FALSE
     ans = PROTECT(allocVector(INTSXP, 0));  // Can't attach attributes to NULL
+    n_protect++;
   }
   if (LOGICAL(retGrp)[0]) {
     ngrp = gsngrp[flip];
@@ -1341,8 +1362,8 @@ SEXP forder(SEXP DT, SEXP by, SEXP retGrp, SEXP sortStrArg, SEXP orderArg, SEXP
   free(cradix_counts);       cradix_counts=NULL; cradix_counts_alloc=0;
   free(cradix_xtmp);         cradix_xtmp=NULL;   cradix_xtmp_alloc=0;   // TO DO: use xtmp already got
 
-  UNPROTECT(1);
-  return( ans );
+  UNPROTECT(n_protect);
+  return ans;
 }
 
 // TODO: implement 'order' argument to 'fsorted'
Original file line number	Diff line number	Diff line change
Expand Up		@@ -166,3 +166,5 @@ test(1742.3, L[[1L]], c(27L,38L))
		test(1742.4, L[[1000000L]], c(76L, 40L))
		test(1742.5, substring(x,nchar(x)-10,nchar(x)), c("50,28,95,76","62,87,23,40"))

		# Add scaled-up non-ASCII forder test 1896