From c9664d5b148451c11f8ee6ec25d1bfb354365796 Mon Sep 17 00:00:00 2001 From: Xianying Tan Date: Sat, 13 Jan 2018 23:05:43 +0800 Subject: [PATCH 1/7] fix the bug when keys contain non UTF8 strings --- src/forder.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/forder.c b/src/forder.c index 19172221fe..470ac322bd 100644 --- a/src/forder.c +++ b/src/forder.c @@ -866,7 +866,7 @@ static void csort(SEXP *x, int *o, int n) /* can't use otmp, since iradix might be called here and that uses otmp (and xtmp). alloc_csort_otmp(n) is called from forder for either n=nrow if 1st column, or n=maxgrpn if onwards columns */ - for(i=0; i0) { // Save any of R's own usage of tl (assumed positive, so we can both count and save in one scan), to restore savetl(s); // afterwards. From R 2.14.0, tl is initialized to 0, prior to that it was random so this step saved too much. From 6f65cd168d3c3c632f2d2537660782b9bb40954e Mon Sep 17 00:00:00 2001 From: shrektan Date: Sun, 14 Jan 2018 00:11:26 +0800 Subject: [PATCH 2/7] add news and tests --- NEWS.md | 1 + inst/tests/issue_2566.csv | 6 ++++++ inst/tests/tests.Rraw | 7 +++++++ 3 files changed, 14 insertions(+) create mode 100644 inst/tests/issue_2566.csv diff --git a/NEWS.md b/NEWS.md index 7ca93a8ad6..4ddcccef8f 100644 --- a/NEWS.md +++ b/NEWS.md @@ -122,6 +122,7 @@ 29. `:=` assignment of one vector to two or more columns, e.g. `DT[, c("x", "y") := 1:10]`, failed to copy the `1:10` data causing errors later if and when those columns were updated by reference, [#2540](https://github.com/Rdatatable/data.table/issues/2540). This is an old issue ([#185](https://github.com/Rdatatable/data.table/issues/185)) that had been fixed but reappeared when code was refactored. Thanks to @patrickhowerter for the detailed report with reproducible example and to @MarkusBonsch for fixing and strengthening tests so it doesn't reappear again. +30. on Windows, `data.table` contain non-UTF8 strings in keys can be correctly ordered now, [#2462](https://github.com/Rdatatable/data.table/issues/2462), [#1826](https://github.com/Rdatatable/data.table/issues/1826) and [StackOverflow](https://stackoverflow.com/questions/47599934/why-doesnt-r-data-table-support-well-for-non-ascii-keys-on-windows). Thanks to @shrektan for reporting and fixing, [PR#2566](https://github.com/Rdatatable/data.table/pull/2566)" #### NOTES diff --git a/inst/tests/issue_2566.csv b/inst/tests/issue_2566.csv new file mode 100644 index 0000000000..a9f5448d3d --- /dev/null +++ b/inst/tests/issue_2566.csv @@ -0,0 +1,6 @@ +x,y,z +公允价值变动损益,公允价值变动损益,1 +红利收入,红利收入,2 +价差收入,价差收入,3 +其他业务支出,其他业务支出,4 +资产减值损失,资产减值损失,5 diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index db76e96a89..e1bcd6696a 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -11254,6 +11254,13 @@ test(1862.1, unique(DT), data.table(A=1:2, B=3:4, key="A"), warning="deprecated options(datatable.old.unique.by.key=NULL) test(1862.2, unique(DT,by=key(DT)), data.table(A=1:2, B=3:4, key="A")) +# Fix the bug when keys contain non UTF8 strings #2566 #2462 #1826 +# Only on Windows platform it might fail, because other platforms use UTF8 as the native encoding. +DT <- fread(file = testDir("inst/tests/issue_2566.csv"), encoding = "UTF-8") +setkey(DT, x) +test(1863.1, DT[J("\u516c\u5141\u4ef7\u503c\u53d8\u52a8\u635f\u76ca"), z], 1L) +setkey(DT, y) +test(1863.2, DT[J("\u516c\u5141\u4ef7\u503c\u53d8\u52a8\u635f\u76ca"), z], 1L) ########################## From cf601828c7292e984b7e1aefe1a9448bf6520ee3 Mon Sep 17 00:00:00 2001 From: shrektan Date: Sun, 14 Jan 2018 00:18:33 +0800 Subject: [PATCH 3/7] correct the test csv's path --- inst/tests/tests.Rraw | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index e1bcd6696a..0a8be50099 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -11256,7 +11256,7 @@ test(1862.2, unique(DT,by=key(DT)), data.table(A=1:2, B=3:4, key="A")) # Fix the bug when keys contain non UTF8 strings #2566 #2462 #1826 # Only on Windows platform it might fail, because other platforms use UTF8 as the native encoding. -DT <- fread(file = testDir("inst/tests/issue_2566.csv"), encoding = "UTF-8") +DT <- fread(file = testDir("issue_2566.csv"), encoding = "UTF-8") setkey(DT, x) test(1863.1, DT[J("\u516c\u5141\u4ef7\u503c\u53d8\u52a8\u635f\u76ca"), z], 1L) setkey(DT, y) From 8697453b2abab11e1ca8da8c32ad9114e80bf0f0 Mon Sep 17 00:00:00 2001 From: shrektan Date: Sun, 14 Jan 2018 00:38:29 +0800 Subject: [PATCH 4/7] improve the test --- inst/tests/tests.Rraw | 1 + 1 file changed, 1 insertion(+) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 0a8be50099..73762f235b 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -11257,6 +11257,7 @@ test(1862.2, unique(DT,by=key(DT)), data.table(A=1:2, B=3:4, key="A")) # Fix the bug when keys contain non UTF8 strings #2566 #2462 #1826 # Only on Windows platform it might fail, because other platforms use UTF8 as the native encoding. DT <- fread(file = testDir("issue_2566.csv"), encoding = "UTF-8") +DT[, x := enc2native(x)] # because fread return's UTF-8 encoding column even on windows setkey(DT, x) test(1863.1, DT[J("\u516c\u5141\u4ef7\u503c\u53d8\u52a8\u635f\u76ca"), z], 1L) setkey(DT, y) From 454b2320cb460d52652748b4c846042417a3bbf7 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Sat, 13 Jan 2018 11:50:21 -0500 Subject: [PATCH 5/7] grammar --- NEWS.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/NEWS.md b/NEWS.md index 4ddcccef8f..0a5d66356e 100644 --- a/NEWS.md +++ b/NEWS.md @@ -122,7 +122,7 @@ 29. `:=` assignment of one vector to two or more columns, e.g. `DT[, c("x", "y") := 1:10]`, failed to copy the `1:10` data causing errors later if and when those columns were updated by reference, [#2540](https://github.com/Rdatatable/data.table/issues/2540). This is an old issue ([#185](https://github.com/Rdatatable/data.table/issues/185)) that had been fixed but reappeared when code was refactored. Thanks to @patrickhowerter for the detailed report with reproducible example and to @MarkusBonsch for fixing and strengthening tests so it doesn't reappear again. -30. on Windows, `data.table` contain non-UTF8 strings in keys can be correctly ordered now, [#2462](https://github.com/Rdatatable/data.table/issues/2462), [#1826](https://github.com/Rdatatable/data.table/issues/1826) and [StackOverflow](https://stackoverflow.com/questions/47599934/why-doesnt-r-data-table-support-well-for-non-ascii-keys-on-windows). Thanks to @shrektan for reporting and fixing, [PR#2566](https://github.com/Rdatatable/data.table/pull/2566)" +30. Fixed a bug on Windows where `data.table`s containing non-UTF8 strings in `key`s were not properly sorted, [#2462](https://github.com/Rdatatable/data.table/issues/2462), [#1826](https://github.com/Rdatatable/data.table/issues/1826) and [StackOverflow](https://stackoverflow.com/questions/47599934/why-doesnt-r-data-table-support-well-for-non-ascii-keys-on-windows). Thanks to @shrektan for reporting and fixing. #### NOTES From 516a88cce87b02399c1665e2571ad1fc9af83e5e Mon Sep 17 00:00:00 2001 From: shrektan Date: Sun, 14 Jan 2018 00:55:00 +0800 Subject: [PATCH 6/7] enc2native may cause info loss so should use read.csv and then enc2utf8 --- inst/tests/tests.Rraw | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 73762f235b..e88c79e86c 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -11256,8 +11256,8 @@ test(1862.2, unique(DT,by=key(DT)), data.table(A=1:2, B=3:4, key="A")) # Fix the bug when keys contain non UTF8 strings #2566 #2462 #1826 # Only on Windows platform it might fail, because other platforms use UTF8 as the native encoding. -DT <- fread(file = testDir("issue_2566.csv"), encoding = "UTF-8") -DT[, x := enc2native(x)] # because fread return's UTF-8 encoding column even on windows +DT <- setDT(read.csv(file = testDir("issue_2566.csv"), fileEncoding = "UTF-8", as.is = TRUE)) +DT[, y := enc2utf8(y)] setkey(DT, x) test(1863.1, DT[J("\u516c\u5141\u4ef7\u503c\u53d8\u52a8\u635f\u76ca"), z], 1L) setkey(DT, y) From 7a39e8c51abd4c25500dc6c9e08510388942edae Mon Sep 17 00:00:00 2001 From: shrektan Date: Sun, 14 Jan 2018 01:29:36 +0800 Subject: [PATCH 7/7] final shoot for tests on a windows machine with an USA locale where Chinese strings can't be expressed in native encoding... --- inst/tests/tests.Rraw | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index e88c79e86c..30ff72b989 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -11256,8 +11256,11 @@ test(1862.2, unique(DT,by=key(DT)), data.table(A=1:2, B=3:4, key="A")) # Fix the bug when keys contain non UTF8 strings #2566 #2462 #1826 # Only on Windows platform it might fail, because other platforms use UTF8 as the native encoding. -DT <- setDT(read.csv(file = testDir("issue_2566.csv"), fileEncoding = "UTF-8", as.is = TRUE)) -DT[, y := enc2utf8(y)] +DT <- fread(file = testDir("issue_2566.csv"), encoding = "UTF-8") +# `fread` return a utf-8 encoded data, we should convert x to native encoding. +# However, we need this condition to ensure the native encoding can be used for Chinese characters. +# Otherwise, the test will fail because the strings have been damaged. +if (identical(enc2utf8(enc2native(DT$x)), DT$x)) DT[, x:= enc2native(x)] setkey(DT, x) test(1863.1, DT[J("\u516c\u5141\u4ef7\u503c\u53d8\u52a8\u635f\u76ca"), z], 1L) setkey(DT, y)