From c9664d5b148451c11f8ee6ec25d1bfb354365796 Mon Sep 17 00:00:00 2001
From: Xianying Tan <shrektan@126.com>
Date: Sat, 13 Jan 2018 23:05:43 +0800
Subject: [PATCH 1/7] fix the bug when keys contain non UTF8 strings

---
 src/forder.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/forder.c b/src/forder.c
index 19172221fe..470ac322bd 100644
--- a/src/forder.c
+++ b/src/forder.c
@@ -866,7 +866,7 @@ static void csort(SEXP *x, int *o, int n)
   /* can't use otmp, since iradix might be called here and that uses otmp (and xtmp).
      alloc_csort_otmp(n) is called from forder for either n=nrow if 1st column,
      or n=maxgrpn if onwards columns */
-  for(i=0; i<n; i++) csort_otmp[i] = (x[i] == NA_STRING) ? NA_INTEGER : -TRUELENGTH(x[i]);
+  for(i=0; i<n; i++) csort_otmp[i] = (x[i] == NA_STRING) ? NA_INTEGER : -TRUELENGTH(ENC2UTF8(x[i]));
   if (nalast == 0 && n == 2) {                        // special case for nalast==0. n==1 is handled inside forder. at least 1 will be NA here
     if (o[0] == -1) for (i=0; i<n; i++) o[i] = i+1;    // else use o from caller directly (not 1st column)
     for (int i=0; i<n; i++) if (csort_otmp[i] == NA_INTEGER) o[i] = 0;
@@ -899,7 +899,7 @@ static void csort_pre(SEXP *x, int n)
   // savetl_init() is called once at the start of forder
   old_un = ustr_n;
   for(i=0; i<n; i++) {
-    s = x[i];
+    s = ENC2UTF8(x[i]);
     if (TRUELENGTH(s)<0) continue;   // this case first as it's the most frequent. Already in ustr, this negative is its ordering.
     if (TRUELENGTH(s)>0) {  // Save any of R's own usage of tl (assumed positive, so we can both count and save in one scan), to restore
       savetl(s);          // afterwards. From R 2.14.0, tl is initialized to 0, prior to that it was random so this step saved too much.

From 6f65cd168d3c3c632f2d2537660782b9bb40954e Mon Sep 17 00:00:00 2001
From: shrektan <shrektan@126.com>
Date: Sun, 14 Jan 2018 00:11:26 +0800
Subject: [PATCH 2/7] add news and tests

---
 NEWS.md                   | 1 +
 inst/tests/issue_2566.csv | 6 ++++++
 inst/tests/tests.Rraw     | 7 +++++++
 3 files changed, 14 insertions(+)
 create mode 100644 inst/tests/issue_2566.csv

diff --git a/NEWS.md b/NEWS.md
index 7ca93a8ad6..4ddcccef8f 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -122,6 +122,7 @@
 
 29. `:=` assignment of one vector to two or more columns, e.g. `DT[, c("x", "y") := 1:10]`, failed to copy the `1:10` data causing errors later if and when those columns were updated by reference, [#2540](https://github.com/Rdatatable/data.table/issues/2540). This is an old issue ([#185](https://github.com/Rdatatable/data.table/issues/185)) that had been fixed but reappeared when code was refactored. Thanks to @patrickhowerter for the detailed report with reproducible example and to @MarkusBonsch for fixing and strengthening tests so it doesn't reappear again.
 
+30. on Windows, `data.table` contain non-UTF8 strings in keys can be correctly ordered now, [#2462](https://github.com/Rdatatable/data.table/issues/2462), [#1826](https://github.com/Rdatatable/data.table/issues/1826)  and [StackOverflow](https://stackoverflow.com/questions/47599934/why-doesnt-r-data-table-support-well-for-non-ascii-keys-on-windows). Thanks to @shrektan for reporting and fixing, [PR#2566](https://github.com/Rdatatable/data.table/pull/2566)"
 
 #### NOTES
 
diff --git a/inst/tests/issue_2566.csv b/inst/tests/issue_2566.csv
new file mode 100644
index 0000000000..a9f5448d3d
--- /dev/null
+++ b/inst/tests/issue_2566.csv
@@ -0,0 +1,6 @@
+x,y,z
+公允价值变动损益,公允价值变动损益,1
+红利收入,红利收入,2
+价差收入,价差收入,3
+其他业务支出,其他业务支出,4
+资产减值损失,资产减值损失,5
diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw
index db76e96a89..e1bcd6696a 100644
--- a/inst/tests/tests.Rraw
+++ b/inst/tests/tests.Rraw
@@ -11254,6 +11254,13 @@ test(1862.1, unique(DT), data.table(A=1:2, B=3:4, key="A"), warning="deprecated
 options(datatable.old.unique.by.key=NULL)
 test(1862.2, unique(DT,by=key(DT)), data.table(A=1:2, B=3:4, key="A"))
 
+# Fix the bug when keys contain non UTF8 strings #2566 #2462 #1826
+# Only on Windows platform it might fail, because other platforms use UTF8 as the native encoding.
+DT <- fread(file = testDir("inst/tests/issue_2566.csv"), encoding = "UTF-8")
+setkey(DT, x)
+test(1863.1, DT[J("\u516c\u5141\u4ef7\u503c\u53d8\u52a8\u635f\u76ca"), z], 1L)
+setkey(DT, y)
+test(1863.2, DT[J("\u516c\u5141\u4ef7\u503c\u53d8\u52a8\u635f\u76ca"), z], 1L)
 
 ##########################
 

From cf601828c7292e984b7e1aefe1a9448bf6520ee3 Mon Sep 17 00:00:00 2001
From: shrektan <shrektan@126.com>
Date: Sun, 14 Jan 2018 00:18:33 +0800
Subject: [PATCH 3/7] correct the test csv's path

---
 inst/tests/tests.Rraw | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw
index e1bcd6696a..0a8be50099 100644
--- a/inst/tests/tests.Rraw
+++ b/inst/tests/tests.Rraw
@@ -11256,7 +11256,7 @@ test(1862.2, unique(DT,by=key(DT)), data.table(A=1:2, B=3:4, key="A"))
 
 # Fix the bug when keys contain non UTF8 strings #2566 #2462 #1826
 # Only on Windows platform it might fail, because other platforms use UTF8 as the native encoding.
-DT <- fread(file = testDir("inst/tests/issue_2566.csv"), encoding = "UTF-8")
+DT <- fread(file = testDir("issue_2566.csv"), encoding = "UTF-8")
 setkey(DT, x)
 test(1863.1, DT[J("\u516c\u5141\u4ef7\u503c\u53d8\u52a8\u635f\u76ca"), z], 1L)
 setkey(DT, y)

From 8697453b2abab11e1ca8da8c32ad9114e80bf0f0 Mon Sep 17 00:00:00 2001
From: shrektan <shrektan@126.com>
Date: Sun, 14 Jan 2018 00:38:29 +0800
Subject: [PATCH 4/7] improve the test

---
 inst/tests/tests.Rraw | 1 +
 1 file changed, 1 insertion(+)

diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw
index 0a8be50099..73762f235b 100644
--- a/inst/tests/tests.Rraw
+++ b/inst/tests/tests.Rraw
@@ -11257,6 +11257,7 @@ test(1862.2, unique(DT,by=key(DT)), data.table(A=1:2, B=3:4, key="A"))
 # Fix the bug when keys contain non UTF8 strings #2566 #2462 #1826
 # Only on Windows platform it might fail, because other platforms use UTF8 as the native encoding.
 DT <- fread(file = testDir("issue_2566.csv"), encoding = "UTF-8")
+DT[, x := enc2native(x)] # because fread return's UTF-8 encoding column even on windows
 setkey(DT, x)
 test(1863.1, DT[J("\u516c\u5141\u4ef7\u503c\u53d8\u52a8\u635f\u76ca"), z], 1L)
 setkey(DT, y)

From 454b2320cb460d52652748b4c846042417a3bbf7 Mon Sep 17 00:00:00 2001
From: Michael Chirico <michaelchirico4@gmail.com>
Date: Sat, 13 Jan 2018 11:50:21 -0500
Subject: [PATCH 5/7] grammar

---
 NEWS.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/NEWS.md b/NEWS.md
index 4ddcccef8f..0a5d66356e 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -122,7 +122,7 @@
 
 29. `:=` assignment of one vector to two or more columns, e.g. `DT[, c("x", "y") := 1:10]`, failed to copy the `1:10` data causing errors later if and when those columns were updated by reference, [#2540](https://github.com/Rdatatable/data.table/issues/2540). This is an old issue ([#185](https://github.com/Rdatatable/data.table/issues/185)) that had been fixed but reappeared when code was refactored. Thanks to @patrickhowerter for the detailed report with reproducible example and to @MarkusBonsch for fixing and strengthening tests so it doesn't reappear again.
 
-30. on Windows, `data.table` contain non-UTF8 strings in keys can be correctly ordered now, [#2462](https://github.com/Rdatatable/data.table/issues/2462), [#1826](https://github.com/Rdatatable/data.table/issues/1826)  and [StackOverflow](https://stackoverflow.com/questions/47599934/why-doesnt-r-data-table-support-well-for-non-ascii-keys-on-windows). Thanks to @shrektan for reporting and fixing, [PR#2566](https://github.com/Rdatatable/data.table/pull/2566)"
+30. Fixed a bug on Windows where `data.table`s containing non-UTF8 strings in `key`s were not properly sorted, [#2462](https://github.com/Rdatatable/data.table/issues/2462), [#1826](https://github.com/Rdatatable/data.table/issues/1826)  and [StackOverflow](https://stackoverflow.com/questions/47599934/why-doesnt-r-data-table-support-well-for-non-ascii-keys-on-windows). Thanks to @shrektan for reporting and fixing.
 
 #### NOTES
 

From 516a88cce87b02399c1665e2571ad1fc9af83e5e Mon Sep 17 00:00:00 2001
From: shrektan <shrektan@126.com>
Date: Sun, 14 Jan 2018 00:55:00 +0800
Subject: [PATCH 6/7] enc2native may cause info loss so should use read.csv and
 then enc2utf8

---
 inst/tests/tests.Rraw | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw
index 73762f235b..e88c79e86c 100644
--- a/inst/tests/tests.Rraw
+++ b/inst/tests/tests.Rraw
@@ -11256,8 +11256,8 @@ test(1862.2, unique(DT,by=key(DT)), data.table(A=1:2, B=3:4, key="A"))
 
 # Fix the bug when keys contain non UTF8 strings #2566 #2462 #1826
 # Only on Windows platform it might fail, because other platforms use UTF8 as the native encoding.
-DT <- fread(file = testDir("issue_2566.csv"), encoding = "UTF-8")
-DT[, x := enc2native(x)] # because fread return's UTF-8 encoding column even on windows
+DT <- setDT(read.csv(file = testDir("issue_2566.csv"), fileEncoding = "UTF-8", as.is = TRUE))
+DT[, y := enc2utf8(y)]
 setkey(DT, x)
 test(1863.1, DT[J("\u516c\u5141\u4ef7\u503c\u53d8\u52a8\u635f\u76ca"), z], 1L)
 setkey(DT, y)

From 7a39e8c51abd4c25500dc6c9e08510388942edae Mon Sep 17 00:00:00 2001
From: shrektan <shrektan@126.com>
Date: Sun, 14 Jan 2018 01:29:36 +0800
Subject: [PATCH 7/7] final shoot for tests on a windows machine with an USA
 locale where Chinese strings can't be expressed in native encoding...

---
 inst/tests/tests.Rraw | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw
index e88c79e86c..30ff72b989 100644
--- a/inst/tests/tests.Rraw
+++ b/inst/tests/tests.Rraw
@@ -11256,8 +11256,11 @@ test(1862.2, unique(DT,by=key(DT)), data.table(A=1:2, B=3:4, key="A"))
 
 # Fix the bug when keys contain non UTF8 strings #2566 #2462 #1826
 # Only on Windows platform it might fail, because other platforms use UTF8 as the native encoding.
-DT <- setDT(read.csv(file = testDir("issue_2566.csv"), fileEncoding = "UTF-8", as.is = TRUE))
-DT[, y := enc2utf8(y)]
+DT <- fread(file = testDir("issue_2566.csv"), encoding = "UTF-8")
+# `fread` return a utf-8 encoded data, we should convert x to native encoding.
+# However, we need this condition to ensure the native encoding can be used for Chinese characters.
+# Otherwise, the test will fail because the strings have been damaged.
+if (identical(enc2utf8(enc2native(DT$x)), DT$x)) DT[, x:= enc2native(x)]
 setkey(DT, x)
 test(1863.1, DT[J("\u516c\u5141\u4ef7\u503c\u53d8\u52a8\u635f\u76ca"), z], 1L)
 setkey(DT, y)