From 6e855c70ada4c7fc86d90420915b99cb18b34351 Mon Sep 17 00:00:00 2001 From: shrektan Date: Fri, 16 Mar 2018 02:01:37 +0800 Subject: [PATCH 01/11] prevent the utf8 string from being collected by the garbage collector in forder() --- src/forder.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/src/forder.c b/src/forder.c index 470ac322bd..2659f2c8ed 100644 --- a/src/forder.c +++ b/src/forder.c @@ -866,7 +866,7 @@ static void csort(SEXP *x, int *o, int n) /* can't use otmp, since iradix might be called here and that uses otmp (and xtmp). alloc_csort_otmp(n) is called from forder for either n=nrow if 1st column, or n=maxgrpn if onwards columns */ - for(i=0; i0) { // Save any of R's own usage of tl (assumed positive, so we can both count and save in one scan), to restore savetl(s); // afterwards. From R 2.14.0, tl is initialized to 0, prior to that it was random so this step saved too much. @@ -1087,8 +1087,8 @@ SEXP forder(SEXP DT, SEXP by, SEXP retGrp, SEXP sortStrArg, SEXP orderArg, SEXP { int i, j, k, grp, ngrp, tmp, *osub, thisgrpn, n, col; Rboolean isSorted = TRUE; - SEXP x, class; - void *xd; + SEXP x, ux, class; + void *xd, *uxd; #ifdef TIMING_ON memset(tblock, 0, NBLOCK*sizeof(clock_t)); memset(nblock, 0, NBLOCK*sizeof(int)); @@ -1168,8 +1168,12 @@ SEXP forder(SEXP DT, SEXP by, SEXP retGrp, SEXP sortStrArg, SEXP orderArg, SEXP case REALSXP : dsort(xd, o, n); break; case STRSXP : - if (sortStr) { csort_pre(xd, n); alloc_csort_otmp(n); csort(xd, o, n); } - else cgroup(xd, o, n); + ux = PROTECT(allocVector(STRSXP, n)); + for (int i=0; i Date: Fri, 16 Mar 2018 14:24:33 +0800 Subject: [PATCH 02/11] add tests and entries in NEWS.md --- NEWS.md | 3 +++ inst/tests/tests.Rraw | 25 +++++++++++++++++++++++++ 2 files changed, 28 insertions(+) diff --git a/NEWS.md b/NEWS.md index 47da1d2c8b..1f20b6f187 100644 --- a/NEWS.md +++ b/NEWS.md @@ -143,6 +143,9 @@ Thanks to @MichaelChirico for reporting and to @MarkusBonsch for the implementat 
Where there are duplicate column names (i.e. `suffixes = c("", "")`) `merge()` will throw a warning to match the behaviour of `base:::merge.data.frame()`. Thanks to @sritchie73 for reporting and fixing [PR#2631](https://github.com/Rdatatable/data.table/pull/2631). +35. Fixed a bug on Windows where `data.table` could break if garbage collection was triggered while sorting a large number of non-ASCII characters. Thanks to @shrektan for reporting and fixing [PR#2678](https://github.com/Rdatatable/data.table/pull/2678), [#2674](https://github.com/Rdatatable/data.table/issues/2674). + + #### NOTES 0. The license has been changed from GPL to MPL (Mozilla Public License). All contributors were consulted and approved. [PR#2456](https://github.com/Rdatatable/data.table/pull/2456) details the reasons for the change. diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 93ea0f50cc..af3c0be267 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -11727,6 +11727,31 @@ test(1880.2, nrow(merge(parents, children, by.x="name", by.y="parent", suffixes= warning = "column names.*are duplicated in the result") +# Ensure data.table won't break even if garbage collection gets triggered during sorting +# a large numbers of non-ASCII characters. +utf8_strings <- c( + '\u516c\u5141\u4ef7\u503c\u53d8\u52a8\u635f\u76ca', + '\u7ea2\u5229\u6536\u5165', + '\u4ef7\u5dee\u6536\u5165', + '\u5176\u4ed6\u4e1a\u52a1\u652f\u51fa', + '\u8d44\u4ea7\u51cf\u503c\u635f\u5931') +if (identical(enc2utf8(enc2native(utf8_strings)), utf8_strings)) { + # The only meaningful environment for this test case is on a Simplified Chinese Language Windows Machine. + # Technically speaking, the native encoding should be an encoding that supports + # Simplified Chinese other that UTF-8. 
+ native_strings <- enc2native(utf8_strings) + # 1e7 length character should be large enough to trigger the garbage collecting + DT <- data.table(x = rep(native_strings, 0.2e7), key = "x") + test(1881.1, unique(DT$x), sort(utf8_strings, method = "radix")) + + # by, keyby should treat the string with different encoding as the same + mixed_strings <- c(utf8_strings, native_strings) + DT <- data.table(x = mixed_strings) + test(1881.2, DT[, .(CT = .N), keyby = x]$CT, rep(2L, 5)) + test(1881.3, DT[, uniqueN(x)], 5L) +} + + ################################### # Add new tests above this line # ################################### From 803b5f2c9c4b2a11e804eff05cff6a8d8a970091 Mon Sep 17 00:00:00 2001 From: shrektan Date: Fri, 16 Mar 2018 14:30:47 +0800 Subject: [PATCH 03/11] bump the test number --- inst/tests/tests.Rraw | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 50fbbef78d..13a5d93900 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -11510,13 +11510,13 @@ if (identical(enc2utf8(enc2native(utf8_strings)), utf8_strings)) { native_strings <- enc2native(utf8_strings) # 1e7 length character should be large enough to trigger the garbage collecting DT <- data.table(x = rep(native_strings, 0.2e7), key = "x") - test(1881.1, unique(DT$x), sort(utf8_strings, method = "radix")) + test(1891.1, unique(DT$x), sort(utf8_strings, method = "radix")) # by, keyby should treat the string with different encoding as the same mixed_strings <- c(utf8_strings, native_strings) DT <- data.table(x = mixed_strings) - test(1881.2, DT[, .(CT = .N), keyby = x]$CT, rep(2L, 5)) - test(1881.3, DT[, uniqueN(x)], 5L) + test(1891.2, DT[, .(CT = .N), keyby = x]$CT, rep(2L, 5)) + test(1891.3, DT[, uniqueN(x)], 5L) } From be9a6dec14dc29adacb106f5101afc81497f5420 Mon Sep 17 00:00:00 2001 From: shrektan Date: Fri, 16 Mar 2018 16:00:37 +0800 Subject: [PATCH 04/11] mixed encoded chars should be compared under UTF8 --- 
inst/tests/tests.Rraw | 5 +++++ src/forder.c | 23 ++++++++++++++++++----- 2 files changed, 23 insertions(+), 5 deletions(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 13a5d93900..b5c50139fc 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -11517,6 +11517,11 @@ if (identical(enc2utf8(enc2native(utf8_strings)), utf8_strings)) { DT <- data.table(x = mixed_strings) test(1891.2, DT[, .(CT = .N), keyby = x]$CT, rep(2L, 5)) test(1891.3, DT[, uniqueN(x)], 5L) + + DT <- data.table(x = mixed_strings, y = c(native_strings, utf8_strings), z = 1) + test(1881.4, nrow(DT[, .N, by = .(z, x, y)]), 5L) + test(1881.5, nrow(DT[, .N, by = .(y, x, z)]), 5L) + test(1881.6, nrow(DT[, .N, by = .(y, z, x)]), 5L) } diff --git a/src/forder.c b/src/forder.c index 7030b6ef6f..062736018b 100644 --- a/src/forder.c +++ b/src/forder.c @@ -1161,6 +1161,8 @@ SEXP forder(SEXP DT, SEXP by, SEXP retGrp, SEXP sortStrArg, SEXP orderArg, SEXP default : Error("First column being ordered is type '%s', not yet supported", type2char(TYPEOF(x))); } + + int n_protect = 0; if (tmp) { // -1 or 1. 
NEW: or -2 in case of nalast == 0 and all NAs if (tmp == 1) { // same as expected in 'order' (1 = increasing, -1 = decreasing) isSorted = TRUE; @@ -1181,11 +1183,11 @@ SEXP forder(SEXP DT, SEXP by, SEXP retGrp, SEXP sortStrArg, SEXP orderArg, SEXP dsort(xd, o, n); break; case STRSXP : ux = PROTECT(allocVector(STRSXP, n)); + n_protect++; for (int i=0; i Date: Sat, 17 Mar 2018 19:01:21 +0800 Subject: [PATCH 05/11] fix a typo on test number --- inst/tests/tests.Rraw | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index b5c50139fc..f144c1c924 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -11519,9 +11519,9 @@ if (identical(enc2utf8(enc2native(utf8_strings)), utf8_strings)) { test(1891.3, DT[, uniqueN(x)], 5L) DT <- data.table(x = mixed_strings, y = c(native_strings, utf8_strings), z = 1) - test(1881.4, nrow(DT[, .N, by = .(z, x, y)]), 5L) - test(1881.5, nrow(DT[, .N, by = .(y, x, z)]), 5L) - test(1881.6, nrow(DT[, .N, by = .(y, z, x)]), 5L) + test(1891.4, nrow(DT[, .N, by = .(z, x, y)]), 5L) + test(1891.5, nrow(DT[, .N, by = .(y, x, z)]), 5L) + test(1891.6, nrow(DT[, .N, by = .(y, z, x)]), 5L) } From 84ca0b06d1414cfea00ad88ecb2ad162082f7ad7 Mon Sep 17 00:00:00 2001 From: shrektan Date: Sat, 17 Mar 2018 22:25:57 +0800 Subject: [PATCH 06/11] reduce overheads for all ascii/utf8 columns --- src/data.table.h | 4 +++- src/forder.c | 24 ++++++++++++++++++------ 2 files changed, 21 insertions(+), 7 deletions(-) diff --git a/src/data.table.h b/src/data.table.h index 3bdcc38fa5..0330b292a2 100644 --- a/src/data.table.h +++ b/src/data.table.h @@ -47,7 +47,8 @@ typedef R_xlen_t RLEN; // This IS_ASCII will dereference s and that cache fetch is the part that may bite more than the branch, though. Without a call to // to ENC2UTF as all, the pointer value can just be compared by the calling code without deferencing it. 
It may still be worth // timing the impact and manually avoiding (is there an IS_ASCII on the character vector rather than testing each item every time?) -#define ENC2UTF8(s) ((IS_ASCII(s) || (s)==NA_STRING || IS_UTF8(s)) ? (s) : mkCharCE(translateCharUTF8(s), CE_UTF8)) +#define NEED2UTF8(s) !(IS_ASCII(s) || (s)==NA_STRING || IS_UTF8(s)) +#define ENC2UTF8(s) (!NEED2UTF8(s) ? (s) : mkCharCE(translateCharUTF8(s), CE_UTF8)) // init.c void setSizes(); @@ -89,6 +90,7 @@ unsigned long long dtwiddle(void *p, int i, int order); unsigned long long i64twiddle(void *p, int i, int order); unsigned long long (*twiddle)(void *, int, int); SEXP forder(SEXP DT, SEXP by, SEXP retGrp, SEXP sortStrArg, SEXP orderArg, SEXP naArg); +bool need2utf8(SEXP x, int n); // reorder.c SEXP reorder(SEXP x, SEXP order); diff --git a/src/forder.c b/src/forder.c index 062736018b..d768a0842d 100644 --- a/src/forder.c +++ b/src/forder.c @@ -1082,6 +1082,12 @@ static void dsort(double *x, int *o, int n) } } +bool need2utf8(SEXP x, int n) +{ + for (int i=0; i Date: Tue, 20 Mar 2018 09:13:11 +0800 Subject: [PATCH 07/11] silence the gcc warning that uxd may be used before being initialized --- src/forder.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/forder.c b/src/forder.c index d768a0842d..90ff613811 100644 --- a/src/forder.c +++ b/src/forder.c @@ -1144,7 +1144,7 @@ SEXP forder(SEXP DT, SEXP by, SEXP retGrp, SEXP sortStrArg, SEXP orderArg, SEXP int *o = INTEGER(ans); // TO DO: save allocation if NULL is returned (isSorted==TRUE) o[0] = -1; // so [i|c|d]sort know they can populate o directly with no working memory needed to reorder existing order // using -1 rather than 0 because 'nalast = 0' replaces 'o[.]' with 0 values. - xd = DATAPTR(x); + xd = DATAPTR(x); uxd = xd; stackgrps = length(by)>1 || LOGICAL(retGrp)[0]; savetl_init(); // from now on use Error not error. 
From 8e04d53496432f66c1f1655e1aa0ab1d8f01c70a Mon Sep 17 00:00:00 2001 From: shrektan Date: Fri, 30 Mar 2018 21:33:24 +0800 Subject: [PATCH 08/11] use latin1 encoding example so that it can be tested on a linux machine --- inst/tests/tests.Rraw | 46 +++++++++++++++++-------------------------- 1 file changed, 18 insertions(+), 28 deletions(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index abf4e0b014..fcd7c28cb2 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -11546,34 +11546,24 @@ test(1894.12, DT[, sum(y)*..z], error="..z in j is looking for z in calling scop test(1895, getDTthreads(verbose=TRUE), output="omp_get_max_threads.*omp_get_thread_limit.*DTthreads") -# Ensure data.table won't break even if garbage collection gets triggered during sorting -# a large numbers of non-ASCII characters. -utf8_strings <- c( - '\u516c\u5141\u4ef7\u503c\u53d8\u52a8\u635f\u76ca', - '\u7ea2\u5229\u6536\u5165', - '\u4ef7\u5dee\u6536\u5165', - '\u5176\u4ed6\u4e1a\u52a1\u652f\u51fa', - '\u8d44\u4ea7\u51cf\u503c\u635f\u5931') -if (identical(enc2utf8(enc2native(utf8_strings)), utf8_strings)) { - # The only meaningful environment for this test case is on a Simplified Chinese Language Windows Machine. - # Technically speaking, the native encoding should be an encoding that supports - # Simplified Chinese other that UTF-8. 
- native_strings <- enc2native(utf8_strings) - # 1e7 length character should be large enough to trigger the garbage collecting - DT <- data.table(x = rep(native_strings, 0.2e7), key = "x") - test(1896.1, unique(DT$x), sort(utf8_strings, method = "radix")) - - # by, keyby should treat the string with different encoding as the same - mixed_strings <- c(utf8_strings, native_strings) - DT <- data.table(x = mixed_strings) - test(1896.2, DT[, .(CT = .N), keyby = x]$CT, rep(2L, 5)) - test(1896.3, DT[, uniqueN(x)], 5L) - - DT <- data.table(x = mixed_strings, y = c(native_strings, utf8_strings), z = 1) - test(1896.4, nrow(DT[, .N, by = .(z, x, y)]), 5L) - test(1896.5, nrow(DT[, .N, by = .(y, x, z)]), 5L) - test(1896.6, nrow(DT[, .N, by = .(y, z, x)]), 5L) -} +utf8_strings <- c("\u00e7ile", "fa\u00e7ile", "El. pa\u00c5\u00a1tas", "\u00a1tas", "\u00de") +latin1_strings <- iconv(utf8_strings, from = "UTF-8", to = "latin1") +# 1e7 length character should be large enough to trigger the garbage collecting +DT <- data.table(x = rep(latin1_strings, 0.2e7), key = "x") +test(1896.1, enc2utf8(unique(DT$x)), sort(utf8_strings, method = "radix")) + +# by, keyby should treat the string with different encoding as the same +mixed_strings <- c(utf8_strings, latin1_strings) +DT <- data.table(x = mixed_strings) +test(1896.2, DT[, .(CT = .N), keyby = x]$CT, rep(2L, 5)) +test(1896.3, DT[, uniqueN(x)], 5L) + +DT <- data.table(x = mixed_strings, y = c(latin1_strings, utf8_strings), z = 1) +test(1896.4, nrow(DT[, .N, by = .(z, x, y)]), 5L) +test(1896.5, nrow(DT[, .N, by = .(y, x, z)]), 5L) +test(1896.6, nrow(DT[, .N, by = .(y, z, x)]), 5L) + + ################################### # Add new tests above this line # From 886fa68cf3c29c05dcf06679f128d9b1b1306714 Mon Sep 17 00:00:00 2001 From: shrektan Date: Fri, 30 Mar 2018 23:39:10 +0800 Subject: [PATCH 09/11] fix the encoding related error on the i386 version of R --- src/forder.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff 
--git a/src/forder.c b/src/forder.c index 90ff613811..1867db00dc 100644 --- a/src/forder.c +++ b/src/forder.c @@ -1144,7 +1144,7 @@ SEXP forder(SEXP DT, SEXP by, SEXP retGrp, SEXP sortStrArg, SEXP orderArg, SEXP int *o = INTEGER(ans); // TO DO: save allocation if NULL is returned (isSorted==TRUE) o[0] = -1; // so [i|c|d]sort know they can populate o directly with no working memory needed to reorder existing order // using -1 rather than 0 because 'nalast = 0' replaces 'o[.]' with 0 values. - xd = DATAPTR(x); uxd = xd; + xd = DATAPTR(x); uxd = xd; // init uxd with the same value as xd. the only case that they will differ is the string needs to be re-encoded as UTF8 stackgrps = length(by)>1 || LOGICAL(retGrp)[0]; savetl_init(); // from now on use Error not error. @@ -1218,7 +1218,7 @@ SEXP forder(SEXP DT, SEXP by, SEXP retGrp, SEXP sortStrArg, SEXP orderArg, SEXP for (col=2; col<=length(by); col++) { x = VECTOR_ELT(DT,INTEGER(by)[col-1]-1); - xd = DATAPTR(x); + xd = DATAPTR(x); uxd = xd; // init uxd with the same value as xd. the only case that they will differ is the string needs to be re-encoded as UTF8 ngrp = gsngrp[flip]; if (ngrp == n && nalast != 0) break; flipflop(); @@ -1280,13 +1280,9 @@ SEXP forder(SEXP DT, SEXP by, SEXP retGrp, SEXP sortStrArg, SEXP orderArg, SEXP // ** TO DO **: if isSorted, we can just point xsub into x directly. If (*f)() returns 0, though, will have to copy x at that point // When doing this, xsub could be allocated at that point for the first time. if (size==4) { - for (j=0; j Date: Fri, 30 Mar 2018 13:46:21 -0700 Subject: [PATCH 10/11] Minor simplification. 
--- src/forder.c | 44 ++++++++++++++++++++------------------------ 1 file changed, 20 insertions(+), 24 deletions(-) diff --git a/src/forder.c b/src/forder.c index 1867db00dc..8ad06bbaa6 100644 --- a/src/forder.c +++ b/src/forder.c @@ -1093,8 +1093,8 @@ SEXP forder(SEXP DT, SEXP by, SEXP retGrp, SEXP sortStrArg, SEXP orderArg, SEXP { int i, j, k, grp, ngrp, tmp, *osub, thisgrpn, n, col; Rboolean isSorted = TRUE; - SEXP x, ux, class; - void *xd, *uxd; + SEXP x, class; + void *xd; #ifdef TIMING_ON memset(tblock, 0, NBLOCK*sizeof(clock_t)); memset(nblock, 0, NBLOCK*sizeof(int)); @@ -1141,10 +1141,11 @@ SEXP forder(SEXP DT, SEXP by, SEXP retGrp, SEXP sortStrArg, SEXP orderArg, SEXP // if n==1, the code is left to proceed below in case one or more of the 1-row by= columns are NA and na.last=NA. Otherwise it would be easy to return now. SEXP ans = PROTECT(allocVector(INTSXP, n)); // once for the result, needs to be length n. + int n_protect = 1; int *o = INTEGER(ans); // TO DO: save allocation if NULL is returned (isSorted==TRUE) o[0] = -1; // so [i|c|d]sort know they can populate o directly with no working memory needed to reorder existing order // using -1 rather than 0 because 'nalast = 0' replaces 'o[.]' with 0 values. - xd = DATAPTR(x); uxd = xd; // init uxd with the same value as xd. the only case that they will differ is the string needs to be re-encoded as UTF8 + xd = DATAPTR(x); stackgrps = length(by)>1 || LOGICAL(retGrp)[0]; savetl_init(); // from now on use Error not error. @@ -1168,7 +1169,6 @@ SEXP forder(SEXP DT, SEXP by, SEXP retGrp, SEXP sortStrArg, SEXP orderArg, SEXP Error("First column being ordered is type '%s', not yet supported", type2char(TYPEOF(x))); } - int n_protect = 0; if (tmp) { // -1 or 1. 
NEW: or -2 in case of nalast == 0 and all NAs if (tmp == 1) { // same as expected in 'order' (1 = increasing, -1 = decreasing) isSorted = TRUE; @@ -1189,14 +1189,12 @@ SEXP forder(SEXP DT, SEXP by, SEXP retGrp, SEXP sortStrArg, SEXP orderArg, SEXP dsort(xd, o, n); break; case STRSXP : if (need2utf8(x, n)) { - ux = PROTECT(allocVector(STRSXP, n)); n_protect++; - for (int i=0; i Date: Fri, 30 Mar 2018 14:12:33 -0700 Subject: [PATCH 11/11] Reduced size of new test 1896 from 60s down to under 1s for CRAN. Added reminder to benchmark.Rraw. --- inst/tests/benchmark.Rraw | 2 ++ inst/tests/tests.Rraw | 14 +++++++------- 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/inst/tests/benchmark.Rraw b/inst/tests/benchmark.Rraw index 16b7c1efa2..d37dd24252 100644 --- a/inst/tests/benchmark.Rraw +++ b/inst/tests/benchmark.Rraw @@ -166,3 +166,5 @@ test(1742.3, L[[1L]], c(27L,38L)) test(1742.4, L[[1000000L]], c(76L, 40L)) test(1742.5, substring(x,nchar(x)-10,nchar(x)), c("50,28,95,76","62,87,23,40")) +# Add scaled-up non-ASCII forder test 1896 + diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index fcd7c28cb2..4a256381b2 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -11546,19 +11546,19 @@ test(1894.12, DT[, sum(y)*..z], error="..z in j is looking for z in calling scop test(1895, getDTthreads(verbose=TRUE), output="omp_get_max_threads.*omp_get_thread_limit.*DTthreads") -utf8_strings <- c("\u00e7ile", "fa\u00e7ile", "El. pa\u00c5\u00a1tas", "\u00a1tas", "\u00de") -latin1_strings <- iconv(utf8_strings, from = "UTF-8", to = "latin1") -# 1e7 length character should be large enough to trigger the garbage collecting -DT <- data.table(x = rep(latin1_strings, 0.2e7), key = "x") +# Non ascii missing protects on ENC2UTF8; issue #2674 +utf8_strings = c("\u00e7ile", "fa\u00e7ile", "El. 
pa\u00c5\u00a1tas", "\u00a1tas", "\u00de") +latin1_strings = iconv(utf8_strings, from = "UTF-8", to = "latin1") +DT = data.table(x = sample(latin1_strings, 1000, replace=TRUE), key = "x") test(1896.1, enc2utf8(unique(DT$x)), sort(utf8_strings, method = "radix")) # by, keyby should treat the string with different encoding as the same -mixed_strings <- c(utf8_strings, latin1_strings) -DT <- data.table(x = mixed_strings) +mixed_strings = c(utf8_strings, latin1_strings) +DT = data.table(x = mixed_strings) test(1896.2, DT[, .(CT = .N), keyby = x]$CT, rep(2L, 5)) test(1896.3, DT[, uniqueN(x)], 5L) -DT <- data.table(x = mixed_strings, y = c(latin1_strings, utf8_strings), z = 1) +DT = data.table(x = mixed_strings, y = c(latin1_strings, utf8_strings), z = 1) test(1896.4, nrow(DT[, .N, by = .(z, x, y)]), 5L) test(1896.5, nrow(DT[, .N, by = .(y, x, z)]), 5L) test(1896.6, nrow(DT[, .N, by = .(y, z, x)]), 5L)