diff --git a/NEWS.md b/NEWS.md index cea0c7a35e..2fa99fb206 100644 --- a/NEWS.md +++ b/NEWS.md @@ -124,6 +124,9 @@ 30. "Negative length vectors not allowed" error when grouping `median` and `var` fixed, [#2046](https://github.com/Rdatatable/data.table/issues/2046) and [#2111](https://github.com/Rdatatable/data.table/issues/2111). Thanks to @caneff and @osofr for reporting and to @kmillar for debugging and explaining the cause. +31. Fixed a bug on Windows where `data.table`s containing non-UTF-8 strings in `key`s were not sorted properly, [#2462](https://github.com/Rdatatable/data.table/issues/2462), [#1826](https://github.com/Rdatatable/data.table/issues/1826) and [StackOverflow](https://stackoverflow.com/questions/47599934/why-doesnt-r-data-table-support-well-for-non-ascii-keys-on-windows). Thanks to @shrektan for reporting and fixing. + + #### NOTES 0. The license has been changed from GPL to MPL (Mozilla Public License). All contributors were consulted and approved. [PR#2456](https://github.com/Rdatatable/data.table/pull/2456) details the reasons for the change. diff --git a/inst/tests/issue_2566.csv b/inst/tests/issue_2566.csv new file mode 100644 index 0000000000..a9f5448d3d --- /dev/null +++ b/inst/tests/issue_2566.csv @@ -0,0 +1,6 @@ +x,y,z +公允价值变动损益,公允价值变动损益,1 +红利收入,红利收入,2 +价差收入,价差收入,3 +其他业务支出,其他业务支出,4 +资产减值损失,资产减值损失,5 diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index a524246114..c7310bf562 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -11258,6 +11258,18 @@ test(1863.1, DT[, median(y), by=x], data.table(x=1, V1=0, key="x")) DT = data.table(col1 = c(1,1,1, 2,2,2), col2 = c(2,2,2,1,1,1), ID = c(rep(1,3), rep(2,3)), key="ID") test(1863.2, DT[, lapply(.SD, var), by=ID], data.table(ID=c(1,2), col1=0, col2=0, key="ID")) +# Test the fix for keys that contain non-UTF-8 strings, #2566 #2462 #1826 +# This might fail only on Windows, because other platforms use UTF-8 as the native encoding. 
+DT <- fread(file = testDir("issue_2566.csv"), encoding = "UTF-8") +# `fread` returns UTF-8 encoded data here, so x is converted to the native encoding. +# The conversion is applied only when the native encoding can represent these Chinese characters, i.e. the round trip through enc2native() and enc2utf8() is lossless. +# Otherwise the conversion would damage the strings and the test would fail for that reason. +if (identical(enc2utf8(enc2native(DT$x)), DT$x)) DT[, x:= enc2native(x)] +setkey(DT, x) +test(1864.1, DT[J("\u516c\u5141\u4ef7\u503c\u53d8\u52a8\u635f\u76ca"), z], 1L) +setkey(DT, y) +test(1864.2, DT[J("\u516c\u5141\u4ef7\u503c\u53d8\u52a8\u635f\u76ca"), z], 1L) + ########################## diff --git a/src/forder.c b/src/forder.c index 19172221fe..470ac322bd 100644 --- a/src/forder.c +++ b/src/forder.c @@ -866,7 +866,7 @@ static void csort(SEXP *x, int *o, int n) /* can't use otmp, since iradix might be called here and that uses otmp (and xtmp). alloc_csort_otmp(n) is called from forder for either n=nrow if 1st column, or n=maxgrpn if onwards columns */ - for(i=0; i0) { // Save any of R's own usage of tl (assumed positive, so we can both count and save in one scan), to restore savetl(s); // afterwards. From R 2.14.0, tl is initialized to 0, prior to that it was random so this step saved too much.