From 2129494213803eb08040724738ff174a49cb0a0d Mon Sep 17 00:00:00 2001 From: Daniel Possenriede Date: Sun, 11 Nov 2018 12:09:42 +0100 Subject: [PATCH 1/5] workaround for fwrite file path encoding issue #3078 --- R/fwrite.R | 1 + 1 file changed, 1 insertion(+) diff --git a/R/fwrite.R b/R/fwrite.R index 37ebf6c874..155b750861 100644 --- a/R/fwrite.R +++ b/R/fwrite.R @@ -64,6 +64,7 @@ fwrite <- function(x, file="", append=FALSE, quote="auto", return(invisible()) } } + file <- enc2native(file) # CfwriteR cannot handle UTF-8 if that is not the native encoding, see #3078. .Call(CfwriteR, x, file, sep, sep2, eol, na, dec, quote, qmethod=="escape", append, row.names, col.names, logical01, dateTimeAs, buffMB, nThread, showProgress, verbose) From 905b6267dfdc977bf0f14b4ec048648eaed4ab2c Mon Sep 17 00:00:00 2001 From: Daniel Possenriede Date: Sun, 11 Nov 2018 21:10:52 +0100 Subject: [PATCH 2/5] workaround for fread file path encoding issue #3078 --- R/fread.R | 2 ++ 1 file changed, 2 insertions(+) diff --git a/R/fread.R b/R/fread.R index 214bb866ae..8371594b14 100644 --- a/R/fread.R +++ b/R/fread.R @@ -106,6 +106,8 @@ fread <- function(input="",file=NULL,text=NULL,cmd=NULL,sep="auto",sep2="auto",d file = decompFile # don't use 'tmpFile' symbol again, as tmpFile might be the http://domain.org/file.csv.gz download on.exit(unlink(decompFile), add=TRUE) } + file = enc2native(file) # CfreadR cannot handle UTF-8 if that is not the native encoding, see #3078. + input = file } if (!missing(autostart)) warning("'autostart' is now deprecated and ignored. Consider skip='string' or skip=n"); From 018490ade81185098832d9e751feee7272b45c95 Mon Sep 17 00:00:00 2001 From: Daniel Possenriede Date: Sun, 2 Dec 2018 19:22:24 +0100 Subject: [PATCH 3/5] add tests --- inst/tests/tests.Rraw | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 08858e7e5d..c42853adb7 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -12354,6 +12354,34 @@ test(1953.4, melt.data.table(DT, id.vars = 'id', measure.vars = 'a'), DT = data.table(A=INT(1,3,2,3,2), B=1:5) # respect groups in 1st column (3's and 2's) test(1954, forderv(DT, sort=FALSE, retGrp=TRUE), structure(INT(1,2,4,3,5), starts=1:5, maxgrpn=1L)) +# fread can handle file name in native and utf-8 encoding, #3078, pr#3141 +if (.Platform$OS.type=="windows") { + f = tempfile("\u00f6"); cat("3.14", file = f); + fn = enc2native(f); f8 = enc2utf8(f); + data.table:::test(1960.1, fread(fn), data.table(V1=3.14)); + data.table:::test(1960.2, fread(f8), data.table(V1=3.14)); + unlink(c(fn, f8)) +} + +# fwrite can handle file names and paths in native and utf-8 encoding, #3078, pr#3141 +if (.Platform$OS.type=="windows") { + DT = data.table("a"); pth = tempdir(); + f = "\u00f6.csv"; fp = file.path(pth, f); + fpn = enc2native(fp); fp8 = enc2utf8(fp); + fwrite(DT, fpn); + data.table:::test(1961.1, list.files(path = pth, pattern = "\\.csv$"), f); + unlink(c(fp, file.path(pth, "\u00c3\u00b6.csv"))); + fwrite(DT, fp8); + data.table:::test(1961.2, list.files(path = pth, pattern = "\\.csv$"), f); + unlink(c(fp, file.path(pth, "\u00c3\u00b6.csv"))); + p = file.path(pth, "\u00fc"); dir.create(p); f = tempfile(tmpdir = p); + data.table:::test(1961.3, fwrite(DT, enc2native(f)), NULL) + unlink(f); + data.table:::test(1961.4, fwrite(DT, enc2utf8(f)), NULL) + unlink(p, recursive = TRUE) +} + + ################################### # Add new tests above this line # From 4d1cdd6fa79593c93d25c4d686b16fc273c998e0 Mon Sep 17 00:00:00 2001 From: Daniel Possenriede Date: Sun, 2 Dec 2018 19:33:40 +0100 Subject: [PATCH 4/5] add myself to NEWS.md --- NEWS.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/NEWS.md b/NEWS.md index 7ebd055486..dbf6b59cca 100644 --- a/NEWS.md +++ b/NEWS.md @@ -17,6 +17,8 @@ 3. Unmatched `patterns` in `measure.vars` fail early and with feedback, [#3106](https://github.com/Rdatatable/data.table/issues/3092). +4. `fread()` and `fwrite()` can now handle file names and paths in native and utf-8 encoding. Thanks to Daniel Possenriede (@dpprdan) for reporting, [#3078](https://github.com/Rdatatable/data.table/issues/3078), and fixing, [PR#3141](https://github.com/Rdatatable/data.table/pull/3141). + #### NOTES 1. When data.table first loads it now checks the DLL's MD5. This is to detect installation issues on Windows when you upgrade and i) the DLL is in use by another R session and ii) the CRAN source version > CRAN binary binary which happens just after a new release (R prompts users to install from source until the CRAN binary is available). This situation can lead to a state where the package's new R code calls old C code in the old DLL; [R#17478](https://bugs.r-project.org/bugzilla/show_bug.cgi?id=17478), [#3056](https://github.com/Rdatatable/data.table/issues/3056). This broken state can persist until, hopefully, you experience a strange error caused by the mismatch. Otherwise, wrong results may occur silently. This situation applies to any R package with compiled code not just data.table, is Windows-only, and is long-standing. It has only recently been understood as it typically only occurs during the few days after each new release until binaries are available on CRAN. Thanks to Gabor Csardi for the suggestion to use `tools::checkMD5sums()`. From 7888ccd83610aa6ea9202907a99b2790b21e87ef Mon Sep 17 00:00:00 2001 From: Daniel Possenriede Date: Sun, 2 Dec 2018 20:29:53 +0100 Subject: [PATCH 5/5] fine tuning --- NEWS.md | 2 +- inst/tests/tests.Rraw | 2 -- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/NEWS.md b/NEWS.md index 7aca5f244e..0cf9e56ac5 100644 --- a/NEWS.md +++ b/NEWS.md @@ -22,7 +22,7 @@ 4. `fread(..., skip=)` now skips non-standard `\r` and `\n\r` line endings properly again, [#3006](https://github.com/Rdatatable/data.table/issues/3006). Standard line endings (`\n` Linux/Mac and `\r\n` Windows) were skipped ok. Thanks to @brattono and @tbrycekelly for providing reproducible examples, and @st-pasha for fixing. -4. `fread()` and `fwrite()` can now handle file names and paths in native and utf-8 encoding. Thanks to Daniel Possenriede (@dpprdan) for reporting, [#3078](https://github.com/Rdatatable/data.table/issues/3078), and fixing, [PR#3141](https://github.com/Rdatatable/data.table/pull/3141). +5. `fread()` and `fwrite()` can now handle file names and paths in native and utf-8 encoding. Thanks to Daniel Possenriede (@dpprdan) for reporting, [#3078](https://github.com/Rdatatable/data.table/issues/3078), and fixing, [PR#3141](https://github.com/Rdatatable/data.table/pull/3141). #### NOTES diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index a5112b36ea..287820d4ce 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -12455,8 +12455,6 @@ if (.Platform$OS.type=="windows") { unlink(p, recursive = TRUE) } - - ################################### # Add new tests above this line # ###################################