diff --git a/NEWS.md b/NEWS.md index 3b25ec1369..cd0a372f6d 100644 --- a/NEWS.md +++ b/NEWS.md @@ -195,6 +195,8 @@ Thanks to @sritchie73 for reporting and fixing [PR#2631](https://github.com/Rdat 37. `getDTthreads()` could return INT_MAX (2 billion) after an explicit call to `setDTthreads(0)`, [PR#2708](https://github.com/Rdatatable/data.table/pull/2708). +38. Fixed a bug on Windows that `data.table` may break if the garbage collecting was triggered when sorting a large number of non-ASCII characters. Thanks to @shrektan for reporting and fixing [PR#2678](https://github.com/Rdatatable/data.table/pull/2678), [#2674](https://github.com/Rdatatable/data.table/issues/2674). + #### NOTES 0. The license has been changed from GPL to MPL (Mozilla Public License). All contributors were consulted and approved. [PR#2456](https://github.com/Rdatatable/data.table/pull/2456) details the reasons for the change. diff --git a/inst/tests/benchmark.Rraw b/inst/tests/benchmark.Rraw index 16b7c1efa2..d37dd24252 100644 --- a/inst/tests/benchmark.Rraw +++ b/inst/tests/benchmark.Rraw @@ -166,3 +166,5 @@ test(1742.3, L[[1L]], c(27L,38L)) test(1742.4, L[[1000000L]], c(76L, 40L)) test(1742.5, substring(x,nchar(x)-10,nchar(x)), c("50,28,95,76","62,87,23,40")) +# Add scaled-up non-ASCII forder test 1896 + diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index f7f431e1a1..4a256381b2 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -11546,6 +11546,25 @@ test(1894.12, DT[, sum(y)*..z], error="..z in j is looking for z in calling scop test(1895, getDTthreads(verbose=TRUE), output="omp_get_max_threads.*omp_get_thread_limit.*DTthreads") +# Non ascii missing protects on ENC2UTF8; issue #2674 +utf8_strings = c("\u00e7ile", "fa\u00e7ile", "El. pa\u00c5\u00a1tas", "\u00a1tas", "\u00de") +latin1_strings = iconv(utf8_strings, from = "UTF-8", to = "latin1") +DT = data.table(x = sample(latin1_strings, 1000, replace=TRUE), key = "x") +test(1896.1, enc2utf8(unique(DT$x)), sort(utf8_strings, method = "radix")) + +# by, keyby should treat the string with different encoding as the same +mixed_strings = c(utf8_strings, latin1_strings) +DT = data.table(x = mixed_strings) +test(1896.2, DT[, .(CT = .N), keyby = x]$CT, rep(2L, 5)) +test(1896.3, DT[, uniqueN(x)], 5L) + +DT = data.table(x = mixed_strings, y = c(latin1_strings, utf8_strings), z = 1) +test(1896.4, nrow(DT[, .N, by = .(z, x, y)]), 5L) +test(1896.5, nrow(DT[, .N, by = .(y, x, z)]), 5L) +test(1896.6, nrow(DT[, .N, by = .(y, z, x)]), 5L) + + + ################################### # Add new tests above this line # ################################### diff --git a/src/data.table.h b/src/data.table.h index 3bdcc38fa5..0330b292a2 100644 --- a/src/data.table.h +++ b/src/data.table.h @@ -47,7 +47,8 @@ typedef R_xlen_t RLEN; // This IS_ASCII will dereference s and that cache fetch is the part that may bite more than the branch, though. Without a call to // to ENC2UTF as all, the pointer value can just be compared by the calling code without deferencing it. It may still be worth // timing the impact and manually avoiding (is there an IS_ASCII on the character vector rather than testing each item every time?) -#define ENC2UTF8(s) ((IS_ASCII(s) || (s)==NA_STRING || IS_UTF8(s)) ? (s) : mkCharCE(translateCharUTF8(s), CE_UTF8)) +#define NEED2UTF8(s) !(IS_ASCII(s) || (s)==NA_STRING || IS_UTF8(s)) +#define ENC2UTF8(s) (!NEED2UTF8(s) ? (s) : mkCharCE(translateCharUTF8(s), CE_UTF8)) // init.c void setSizes(); @@ -89,6 +90,7 @@ unsigned long long dtwiddle(void *p, int i, int order); unsigned long long i64twiddle(void *p, int i, int order); unsigned long long (*twiddle)(void *, int, int); SEXP forder(SEXP DT, SEXP by, SEXP retGrp, SEXP sortStrArg, SEXP orderArg, SEXP naArg); +bool need2utf8(SEXP x, int n); // reorder.c SEXP reorder(SEXP x, SEXP order); diff --git a/src/forder.c b/src/forder.c index c11611ed59..8ad06bbaa6 100644 --- a/src/forder.c +++ b/src/forder.c @@ -866,7 +866,7 @@ static void csort(SEXP *x, int *o, int n) /* can't use otmp, since iradix might be called here and that uses otmp (and xtmp). alloc_csort_otmp(n) is called from forder for either n=nrow if 1st column, or n=maxgrpn if onwards columns */ - for(i=0; i0) { // Save any of R's own usage of tl (assumed positive, so we can both count and save in one scan), to restore savetl(s); // afterwards. From R 2.14.0, tl is initialized to 0, prior to that it was random so this step saved too much. @@ -1082,6 +1082,12 @@ static void dsort(double *x, int *o, int n) } } +bool need2utf8(SEXP x, int n) +{ + for (int i=0; i