Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -195,6 +195,8 @@ Thanks to @sritchie73 for reporting and fixing [PR#2631](https://github.com/Rdat

37. `getDTthreads()` could return INT_MAX (2 billion) after an explicit call to `setDTthreads(0)`, [PR#2708](https://github.com/Rdatatable/data.table/pull/2708).

38. Fixed a bug on Windows that `data.table` may break if the garbage collecting was triggered when sorting a large number of non-ASCII characters. Thanks to @shrektan for reporting and fixing [PR#2678](https://github.com/Rdatatable/data.table/pull/2678), [#2674](https://github.com/Rdatatable/data.table/issues/2674).

#### NOTES

0. The license has been changed from GPL to MPL (Mozilla Public License). All contributors were consulted and approved. [PR#2456](https://github.com/Rdatatable/data.table/pull/2456) details the reasons for the change.
Expand Down
2 changes: 2 additions & 0 deletions inst/tests/benchmark.Rraw
Original file line number Diff line number Diff line change
Expand Up @@ -166,3 +166,5 @@ test(1742.3, L[[1L]], c(27L,38L))
test(1742.4, L[[1000000L]], c(76L, 40L))
test(1742.5, substring(x,nchar(x)-10,nchar(x)), c("50,28,95,76","62,87,23,40"))

# Add scaled-up non-ASCII forder test 1896

19 changes: 19 additions & 0 deletions inst/tests/tests.Rraw
Original file line number Diff line number Diff line change
Expand Up @@ -11546,6 +11546,25 @@ test(1894.12, DT[, sum(y)*..z], error="..z in j is looking for z in calling scop

test(1895, getDTthreads(verbose=TRUE), output="omp_get_max_threads.*omp_get_thread_limit.*DTthreads")

# Non ascii missing protects on ENC2UTF8; issue #2674
utf8_strings = c("\u00e7ile", "fa\u00e7ile", "El. pa\u00c5\u00a1tas", "\u00a1tas", "\u00de")
latin1_strings = iconv(utf8_strings, from = "UTF-8", to = "latin1")
DT = data.table(x = sample(latin1_strings, 1000, replace=TRUE), key = "x")
test(1896.1, enc2utf8(unique(DT$x)), sort(utf8_strings, method = "radix"))

# by, keyby should treat the string with different encoding as the same
mixed_strings = c(utf8_strings, latin1_strings)
DT = data.table(x = mixed_strings)
test(1896.2, DT[, .(CT = .N), keyby = x]$CT, rep(2L, 5))
test(1896.3, DT[, uniqueN(x)], 5L)

DT = data.table(x = mixed_strings, y = c(latin1_strings, utf8_strings), z = 1)
test(1896.4, nrow(DT[, .N, by = .(z, x, y)]), 5L)
test(1896.5, nrow(DT[, .N, by = .(y, x, z)]), 5L)
test(1896.6, nrow(DT[, .N, by = .(y, z, x)]), 5L)



###################################
# Add new tests above this line #
###################################
Expand Down
4 changes: 3 additions & 1 deletion src/data.table.h
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,8 @@ typedef R_xlen_t RLEN;
// This IS_ASCII will dereference s and that cache fetch is the part that may bite more than the branch, though. Without a call to
// to ENC2UTF as all, the pointer value can just be compared by the calling code without deferencing it. It may still be worth
// timing the impact and manually avoiding (is there an IS_ASCII on the character vector rather than testing each item every time?)
#define ENC2UTF8(s) ((IS_ASCII(s) || (s)==NA_STRING || IS_UTF8(s)) ? (s) : mkCharCE(translateCharUTF8(s), CE_UTF8))
#define NEED2UTF8(s) !(IS_ASCII(s) || (s)==NA_STRING || IS_UTF8(s))
#define ENC2UTF8(s) (!NEED2UTF8(s) ? (s) : mkCharCE(translateCharUTF8(s), CE_UTF8))

// init.c
void setSizes();
Expand Down Expand Up @@ -89,6 +90,7 @@ unsigned long long dtwiddle(void *p, int i, int order);
unsigned long long i64twiddle(void *p, int i, int order);
unsigned long long (*twiddle)(void *, int, int);
SEXP forder(SEXP DT, SEXP by, SEXP retGrp, SEXP sortStrArg, SEXP orderArg, SEXP naArg);
bool need2utf8(SEXP x, int n);

// reorder.c
SEXP reorder(SEXP x, SEXP order);
Expand Down
35 changes: 28 additions & 7 deletions src/forder.c
Original file line number Diff line number Diff line change
Expand Up @@ -866,7 +866,7 @@ static void csort(SEXP *x, int *o, int n)
/* can't use otmp, since iradix might be called here and that uses otmp (and xtmp).
alloc_csort_otmp(n) is called from forder for either n=nrow if 1st column,
or n=maxgrpn if onwards columns */
for(i=0; i<n; i++) csort_otmp[i] = (x[i] == NA_STRING) ? NA_INTEGER : -TRUELENGTH(ENC2UTF8(x[i]));
for(i=0; i<n; i++) csort_otmp[i] = (x[i] == NA_STRING) ? NA_INTEGER : -TRUELENGTH(x[i]);
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

great catch 👏

if (nalast == 0 && n == 2) { // special case for nalast==0. n==1 is handled inside forder. at least 1 will be NA here
if (o[0] == -1) for (i=0; i<n; i++) o[i] = i+1; // else use o from caller directly (not 1st column)
for (int i=0; i<n; i++) if (csort_otmp[i] == NA_INTEGER) o[i] = 0;
Expand Down Expand Up @@ -899,7 +899,7 @@ static void csort_pre(SEXP *x, int n)
// savetl_init() is called once at the start of forder
old_un = ustr_n;
for(i=0; i<n; i++) {
s = ENC2UTF8(x[i]);
s = x[i];
if (TRUELENGTH(s)<0) continue; // this case first as it's the most frequent. Already in ustr, this negative is its ordering.
if (TRUELENGTH(s)>0) { // Save any of R's own usage of tl (assumed positive, so we can both count and save in one scan), to restore
savetl(s); // afterwards. From R 2.14.0, tl is initialized to 0, prior to that it was random so this step saved too much.
Expand Down Expand Up @@ -1082,6 +1082,12 @@ static void dsort(double *x, int *o, int n)
}
}

bool need2utf8(SEXP x, int n)
{
for (int i=0; i<n; i++) if (NEED2UTF8(STRING_ELT(x, i))) return(true);
return(false);
}

SEXP forder(SEXP DT, SEXP by, SEXP retGrp, SEXP sortStrArg, SEXP orderArg, SEXP naArg)
// sortStr TRUE from setkey, FALSE from by=
{
Expand Down Expand Up @@ -1135,6 +1141,7 @@ SEXP forder(SEXP DT, SEXP by, SEXP retGrp, SEXP sortStrArg, SEXP orderArg, SEXP
// if n==1, the code is left to proceed below in case one or more of the 1-row by= columns are NA and na.last=NA. Otherwise it would be easy to return now.

SEXP ans = PROTECT(allocVector(INTSXP, n)); // once for the result, needs to be length n.
int n_protect = 1;
int *o = INTEGER(ans); // TO DO: save allocation if NULL is returned (isSorted==TRUE)
o[0] = -1; // so [i|c|d]sort know they can populate o directly with no working memory needed to reorder existing order
// using -1 rather than 0 because 'nalast = 0' replaces 'o[.]' with 0 values.
Expand All @@ -1161,6 +1168,7 @@ SEXP forder(SEXP DT, SEXP by, SEXP retGrp, SEXP sortStrArg, SEXP orderArg, SEXP
default :
Error("First column being ordered is type '%s', not yet supported", type2char(TYPEOF(x)));
}

if (tmp) { // -1 or 1. NEW: or -2 in case of nalast == 0 and all NAs
if (tmp == 1) { // same as expected in 'order' (1 = increasing, -1 = decreasing)
isSorted = TRUE;
Expand All @@ -1180,6 +1188,11 @@ SEXP forder(SEXP DT, SEXP by, SEXP retGrp, SEXP sortStrArg, SEXP orderArg, SEXP
case REALSXP :
dsort(xd, o, n); break;
case STRSXP :
if (need2utf8(x, n)) {
SEXP tt = PROTECT(allocVector(STRSXP, n)); n_protect++;
for (int i=0; i<n; i++) SET_STRING_ELT(tt, i, ENC2UTF8(STRING_ELT(x, i)));
xd = DATAPTR(tt);
}
if (sortStr) { csort_pre(xd, n); alloc_csort_otmp(n); csort(xd, o, n); }
else cgroup(xd, o, n);
break;
Expand Down Expand Up @@ -1224,6 +1237,11 @@ SEXP forder(SEXP DT, SEXP by, SEXP retGrp, SEXP sortStrArg, SEXP orderArg, SEXP
f = &dsorted; g = &dsort; break;
case STRSXP :
f = &csorted;
if (need2utf8(x, n)) {
SEXP tt = PROTECT(allocVector(STRSXP, n)); n_protect++;
for (int i=0; i<n; i++) SET_STRING_ELT(tt, i, ENC2UTF8(STRING_ELT(x, i)));
xd = DATAPTR(tt);
}
if (sortStr) { csort_pre(xd, n); alloc_csort_otmp(gsmax[1-flip]); g = &csort; }
else g = &cgroup; // no increasing/decreasing order required if sortStr = FALSE, just a dummy argument
break;
Expand Down Expand Up @@ -1257,10 +1275,12 @@ SEXP forder(SEXP DT, SEXP by, SEXP retGrp, SEXP sortStrArg, SEXP orderArg, SEXP
osub = o+i;
// ** TO DO **: if isSorted, we can just point xsub into x directly. If (*f)() returns 0, though, will have to copy x at that point
// When doing this, xsub could be allocated at that point for the first time.
if (size==4)
if (size==4) {
for (j=0; j<thisgrpn; j++) ((int *)xsub)[j] = ((int *)xd)[o[i++]-1];
else
} else {
for (j=0; j<thisgrpn; j++) ((double *)xsub)[j] = ((double *)xd)[o[i++]-1];
}

TEND(2)

// continue; // BASELINE short circuit timing point. Up to here is the cost of creating xsub.
Expand Down Expand Up @@ -1314,8 +1334,9 @@ SEXP forder(SEXP DT, SEXP by, SEXP retGrp, SEXP sortStrArg, SEXP orderArg, SEXP
free(ustr); ustr=NULL; ustr_alloc=0;

if (isSorted) {
UNPROTECT(1); // The existing o vector, which we may save in future, if in future we only create when isSorted becomes FALSE
// the o vector created earlier could be avoided in this case if we only create it when isSorted becomes FALSE
ans = PROTECT(allocVector(INTSXP, 0)); // Can't attach attributes to NULL
n_protect++;
}
if (LOGICAL(retGrp)[0]) {
ngrp = gsngrp[flip];
Expand All @@ -1341,8 +1362,8 @@ SEXP forder(SEXP DT, SEXP by, SEXP retGrp, SEXP sortStrArg, SEXP orderArg, SEXP
free(cradix_counts); cradix_counts=NULL; cradix_counts_alloc=0;
free(cradix_xtmp); cradix_xtmp=NULL; cradix_xtmp_alloc=0; // TO DO: use xtmp already got

UNPROTECT(1);
return( ans );
UNPROTECT(n_protect);
return ans;
}

// TODO: implement 'order' argument to 'fsorted'
Expand Down