From 9eca5d0de0bcecb4e81111269185f2b5d3c49460 Mon Sep 17 00:00:00 2001 From: Philippe Chataignon Date: Tue, 21 May 2019 22:20:25 +0200 Subject: [PATCH 01/27] Implements with_bom in fwrite Add a new with_bom parameter to create a UTF-8 file with BOM. By default, with_bom is FALSE and fwrite creates a UTF-8 file without BOM. When with_bom is TRUE, BOM sequence (EF BB BF) is added at the beginning of the file but only when col.names is TRUE (default). --- R/fwrite.R | 5 ++++- src/fwrite.c | 16 ++++++++++++++++ src/fwrite.h | 1 + src/fwriteR.c | 2 ++ 4 files changed, 23 insertions(+), 1 deletion(-) diff --git a/R/fwrite.R b/R/fwrite.R index 542ca4b75d..bba8687025 100644 --- a/R/fwrite.R +++ b/R/fwrite.R @@ -9,6 +9,7 @@ fwrite <- function(x, file="", append=FALSE, quote="auto", showProgress=getOption("datatable.showProgress", interactive()), compress = c("auto", "none", "gzip"), yaml = FALSE, + with_bom = FALSE, verbose=getOption("datatable.verbose", FALSE)) { na = as.character(na[1L]) # fix for #1725 if (missing(qmethod)) qmethod = qmethod[1L] @@ -43,6 +44,7 @@ fwrite <- function(x, file="", append=FALSE, quote="auto", length(compress) == 1L && compress %chin% c("auto", "none", "gzip"), isTRUEorFALSE(col.names), isTRUEorFALSE(append), isTRUEorFALSE(row.names), isTRUEorFALSE(verbose), isTRUEorFALSE(showProgress), isTRUEorFALSE(logical01), + isTRUEorFALSE(with_bom), length(na) == 1L, #1725, handles NULL or character(0) input is.character(file) && length(file)==1L && !is.na(file), length(buffMB)==1L && !is.na(buffMB) && 1L<=buffMB && buffMB<=1024, @@ -54,6 +56,7 @@ fwrite <- function(x, file="", append=FALSE, quote="auto", file <- path.expand(file) # "~/foo/bar" if (append && missing(col.names) && (file=="" || file.exists(file))) col.names = FALSE # test 1658.16 checks this + if (with_bom && !col.names) stop("with_bom can be TRUE only if col.names is TRUE") if (identical(quote,"auto")) quote=NA # logical NA if (file=="") { # console output which it seems isn't thread safe on Windows even when one-batch-at-a-time @@ -109,6 +112,6 @@ fwrite <- function(x, file="", append=FALSE, quote="auto", file <- enc2native(file) # CfwriteR cannot handle UTF-8 if that is not the native encoding, see #3078. .Call(CfwriteR, x, file, sep, sep2, eol, na, dec, quote, qmethod=="escape", append, row.names, col.names, logical01, dateTimeAs, buffMB, nThread, - showProgress, is_gzip, verbose) + showProgress, is_gzip, with_bom, verbose) invisible() } diff --git a/src/fwrite.c b/src/fwrite.c index df6addb96c..1b1f1eb86f 100644 --- a/src/fwrite.c +++ b/src/fwrite.c @@ -533,6 +533,16 @@ void writeCategString(void *col, int64_t row, char **pch) write_string(getCategString(col, row), pch); } +void writeBom(char **pch) +{ + char *ch = *pch; + *ch++ = 0xEF; + *ch++ = 0xBB; + *ch++ = 0xBF; + *pch = ch; +} + + int compressbuff(void* dest, size_t *destLen, const void* source, size_t sourceLen) { z_stream stream; @@ -672,6 +682,10 @@ void fwriteMain(fwriteMainArgs args) } } + if (args.verbose) { + DTPRINT("Writing BOM ..."); + if (f==-1) DTPRINT("\n"); + } if (args.verbose) { DTPRINT("Writing column names ... "); if (f==-1) DTPRINT("\n"); @@ -683,6 +697,8 @@ void fwriteMain(fwriteMainArgs args) char *buff = malloc(headerLen); if (!buff) STOP("Unable to allocate %d MiB for header: %s", headerLen / 1024 / 1024, strerror(errno)); char *ch = buff; + if (args.with_bom) + writeBom(&ch); if (args.doRowNames) { // Unusual: the extra blank column name when row_names are added as the first column if (doQuote!=0/*'auto'(NA) or true*/) { *ch++='"'; *ch++='"'; } // to match write.csv diff --git a/src/fwrite.h b/src/fwrite.h index 406cdf9d7d..dc1004584b 100644 --- a/src/fwrite.h +++ b/src/fwrite.h @@ -98,6 +98,7 @@ typedef struct fwriteMainArgs int nth; bool showProgress; bool is_gzip; + bool with_bom; bool verbose; } fwriteMainArgs; diff --git a/src/fwriteR.c b/src/fwriteR.c index ddcf7e29cf..b661336999 100644 --- a/src/fwriteR.c +++ b/src/fwriteR.c @@ -158,12 +158,14 @@ SEXP fwriteR( SEXP nThread_Arg, SEXP showProgress_Arg, SEXP is_gzip_Arg, + SEXP with_bom_Arg, SEXP verbose_Arg ) { if (!isNewList(DF)) error("fwrite must be passed an object of type list; e.g. data.frame, data.table"); fwriteMainArgs args; args.is_gzip = LOGICAL(is_gzip_Arg)[0]; + args.with_bom = LOGICAL(with_bom_Arg)[0]; args.verbose = LOGICAL(verbose_Arg)[0]; args.filename = CHAR(STRING_ELT(filename_Arg, 0)); args.ncol = length(DF); From 18ff4a0812288c240226f8956c494427d7998ab6 Mon Sep 17 00:00:00 2001 From: Philippe Chataignon Date: Tue, 21 May 2019 23:19:37 +0200 Subject: [PATCH 02/27] with_bom is compatible with yaml --- R/fwrite.R | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/R/fwrite.R b/R/fwrite.R index bba8687025..470a640081 100644 --- a/R/fwrite.R +++ b/R/fwrite.R @@ -104,8 +104,16 @@ fwrite <- function(x, file="", append=FALSE, quote="auto", header = col.names, sep = sep, sep2 = sep2, eol = eol, na.strings = na, dec = dec, qmethod = qmethod, logical01 = logical01 ) + if (with_bom) { + bom <- raw(3) + bom[1] <- as.raw(0xEF) + bom[2] <- as.raw(0xBB) + bom[3] <- as.raw(0xBF) + writeBin(bom, file) + with_bom <- FALSE + } # NB: as.yaml adds trailing newline - cat('---', yaml::as.yaml(yaml_header, line.sep = eol), '---', sep = eol, file = file) + cat('---', yaml::as.yaml(yaml_header, line.sep = eol), '---', sep = eol, file = file, append = TRUE) append = TRUE } } From dd17a423d8452a2d490d1e47f186b9b86d5a144b Mon Sep 17 00:00:00 2001 From: Philippe Chataignon Date: Tue, 21 May 2019 23:20:26 +0200 Subject: [PATCH 03/27] Add with_bom in documentation --- man/fwrite.Rd | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/man/fwrite.Rd b/man/fwrite.Rd index a4c6bbb703..76dbdad056 100644 --- a/man/fwrite.Rd +++ b/man/fwrite.Rd @@ -19,6 +19,7 @@ fwrite(x, file = "", append = FALSE, quote = "auto", showProgress = getOption("datatable.showProgress", interactive()), compress = c("auto", "none", "gzip"), yaml = FALSE, + with_bom = FALSE, verbose = getOption("datatable.verbose", FALSE)) } \arguments{ @@ -55,7 +56,8 @@ fwrite(x, file = "", append = FALSE, quote = "auto", \item{nThread}{The number of threads to use. Experiment to see what works best for your data on your hardware.} \item{showProgress}{ Display a progress meter on the console? Ignored when \code{file==""}. } \item{compress}{If \code{compress = "auto"} and if \code{file} ends in \code{.gz} then output format is gzipped csv else csv. If \code{compress = "none"}, output format is always csv. If \code{compress = "gzip"} then format is gzipped csv. Output to the console is never gzipped even if \code{compress = "gzip"}. By default, \code{compress = "auto"}.} - \item{yaml}{ If \code{TRUE}, \code{fwrite} will output a CSVY file, that is, a CSV file with metadata stored as a YAML header, using \code{\link[yaml]{as.yaml}}. Incompatible with \code{append = TRUE} or \code{gzip} compression. See \code{Details}. } + \item{yaml}{If \code{TRUE}, \code{fwrite} will output a CSVY file, that is, a CSV file with metadata stored as a YAML header, using \code{\link[yaml]{as.yaml}}. Incompatible with \code{append = TRUE} or \code{gzip} compression. See \code{Details}. } + \item{with_bom}{If \code{TRUE} and if \code{col.names} is \code{TRUE}, a BOM sequence (EF BB BF) is added at the beginning of the file and \code{fwrite} will output a file in format 'UTF-8 with BOM'.} \item{verbose}{Be chatty and report timings?} } \details{ From e8d92c242318f95d3b536f96822e6617728a7931 Mon Sep 17 00:00:00 2001 From: Philippe Chataignon Date: Wed, 22 May 2019 11:24:51 +0200 Subject: [PATCH 04/27] Add tests for fwrite UTF-8 with bom --- inst/tests/tests.Rraw | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 812a970c87..b073874ac1 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -9505,6 +9505,23 @@ test(1658.46, fwrite(DT), error="Row 3 of list column is type 'complex'") DT[3,b:=factor(letters[1:3])] test(1658.47, fwrite(DT), error="Row 3 of list column is type 'factor'") +# fwrite bom +DT <- data.table(l=letters, n=1:26) + +fwrite(DT, f1 <- tempfile(), with_bom=T) +test(1658.48, identical(readBin(f1, raw(), 6), as.raw(c(0xef, 0xbb, 0xbf, 0x6c, 0x2c, 0x6e))), TRUE) + +fwrite(DT, f2 <- tempfile(), with_bom=T, yaml=T) +test(1658.49, identical(readBin(f2, raw(), 6), as.raw(c(0xef, 0xbb, 0xbf, 0x2d, 0x2d, 0x2d))), TRUE) + +fwrite(DT, f3 <- tempfile(), with_bom=F) +test(1658.50, identical(readBin(f3, raw(), 3), as.raw(c(0x6c, 0x2c, 0x6e))), TRUE) + +fwrite(DT, f4 <- tempfile(), yaml=T) +test(1658.51, identical(readBin(f4, raw(), 3), as.raw(c(0x2d, 0x2d, 0x2d))), TRUE) + +unlink(c(f1, f2, f3, f4)) + ## End fwrite tests # tests for #679, inrange(), FR #707 From 0d9a272d2152326d74f5c216c41187413ce69f3c Mon Sep 17 00:00:00 2001 From: Philippe Chataignon Date: Wed, 22 May 2019 11:35:49 +0200 Subject: [PATCH 05/27] Remove useless debug --- src/fwrite.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/fwrite.c b/src/fwrite.c index 1b1f1eb86f..97984e27df 100644 --- a/src/fwrite.c +++ b/src/fwrite.c @@ -682,10 +682,6 @@ void fwriteMain(fwriteMainArgs args) } } - if (args.verbose) { - DTPRINT("Writing BOM ..."); - if (f==-1) DTPRINT("\n"); - } if (args.verbose) { DTPRINT("Writing column names ... "); if (f==-1) DTPRINT("\n"); From 322aa4194b7b72223b3f6093feb9085a236509fa Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Wed, 22 May 2019 18:05:28 +0800 Subject: [PATCH 06/27] using integers --- R/fwrite.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/fwrite.R b/R/fwrite.R index 470a640081..ddf4236c38 100644 --- a/R/fwrite.R +++ b/R/fwrite.R @@ -105,7 +105,7 @@ fwrite <- function(x, file="", append=FALSE, quote="auto", dec = dec, qmethod = qmethod, logical01 = logical01 ) if (with_bom) { - bom <- raw(3) + bom <- raw(3L) bom[1] <- as.raw(0xEF) bom[2] <- as.raw(0xBB) bom[3] <- as.raw(0xBF) From 4e28d619c1765e790c92ff41443561ba01f7d505 Mon Sep 17 00:00:00 2001 From: Philippe Chataignon Date: Wed, 22 May 2019 14:27:06 +0200 Subject: [PATCH 07/27] Rename 'with_bom' parameter in 'bom' --- R/fwrite.R | 12 ++++++------ inst/tests/tests.Rraw | 6 +++--- man/fwrite.Rd | 4 ++-- src/fwrite.c | 2 +- src/fwrite.h | 2 +- src/fwriteR.c | 4 ++-- 6 files changed, 15 insertions(+), 15 deletions(-) diff --git a/R/fwrite.R b/R/fwrite.R index ddf4236c38..ce23cd91e8 100644 --- a/R/fwrite.R +++ b/R/fwrite.R @@ -9,7 +9,7 @@ fwrite <- function(x, file="", append=FALSE, quote="auto", showProgress=getOption("datatable.showProgress", interactive()), compress = c("auto", "none", "gzip"), yaml = FALSE, - with_bom = FALSE, + bom = FALSE, verbose=getOption("datatable.verbose", FALSE)) { na = as.character(na[1L]) # fix for #1725 if (missing(qmethod)) qmethod = qmethod[1L] @@ -44,7 +44,7 @@ fwrite <- function(x, file="", append=FALSE, quote="auto", length(compress) == 1L && compress %chin% c("auto", "none", "gzip"), isTRUEorFALSE(col.names), isTRUEorFALSE(append), isTRUEorFALSE(row.names), isTRUEorFALSE(verbose), isTRUEorFALSE(showProgress), isTRUEorFALSE(logical01), - isTRUEorFALSE(with_bom), + isTRUEorFALSE(bom), length(na) == 1L, #1725, handles NULL or character(0) input is.character(file) && length(file)==1L && !is.na(file), length(buffMB)==1L && !is.na(buffMB) && 1L<=buffMB && buffMB<=1024, @@ -56,7 +56,7 @@ fwrite <- function(x, file="", append=FALSE, quote="auto", file <- path.expand(file) # "~/foo/bar" if (append && missing(col.names) && (file=="" || file.exists(file))) col.names = FALSE # test 1658.16 checks this - if (with_bom && !col.names) stop("with_bom can be TRUE only if col.names is TRUE") + if (bom && !col.names) stop("bom can be TRUE only if col.names is TRUE") if (identical(quote,"auto")) quote=NA # logical NA if (file=="") { # console output which it seems isn't thread safe on Windows even when one-batch-at-a-time @@ -104,13 +104,13 @@ fwrite <- function(x, file="", append=FALSE, quote="auto", header = col.names, sep = sep, sep2 = sep2, eol = eol, na.strings = na, dec = dec, qmethod = qmethod, logical01 = logical01 ) - if (with_bom) { + if (bom) { bom <- raw(3L) bom[1] <- as.raw(0xEF) bom[2] <- as.raw(0xBB) bom[3] <- as.raw(0xBF) writeBin(bom, file) - with_bom <- FALSE + bom <- FALSE } # NB: as.yaml adds trailing newline cat('---', yaml::as.yaml(yaml_header, line.sep = eol), '---', sep = eol, file = file, append = TRUE) @@ -120,6 +120,6 @@ fwrite <- function(x, file="", append=FALSE, quote="auto", file <- enc2native(file) # CfwriteR cannot handle UTF-8 if that is not the native encoding, see #3078. .Call(CfwriteR, x, file, sep, sep2, eol, na, dec, quote, qmethod=="escape", append, row.names, col.names, logical01, dateTimeAs, buffMB, nThread, - showProgress, is_gzip, with_bom, verbose) + showProgress, is_gzip, bom, verbose) invisible() } diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index b073874ac1..d436d351ee 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -9508,13 +9508,13 @@ test(1658.47, fwrite(DT), error="Row 3 of list column is type 'factor'") # fwrite bom DT <- data.table(l=letters, n=1:26) -fwrite(DT, f1 <- tempfile(), with_bom=T) +fwrite(DT, f1 <- tempfile(), bom=T) test(1658.48, identical(readBin(f1, raw(), 6), as.raw(c(0xef, 0xbb, 0xbf, 0x6c, 0x2c, 0x6e))), TRUE) -fwrite(DT, f2 <- tempfile(), with_bom=T, yaml=T) +fwrite(DT, f2 <- tempfile(), bom=T, yaml=T) test(1658.49, identical(readBin(f2, raw(), 6), as.raw(c(0xef, 0xbb, 0xbf, 0x2d, 0x2d, 0x2d))), TRUE) -fwrite(DT, f3 <- tempfile(), with_bom=F) +fwrite(DT, f3 <- tempfile(), bom=F) test(1658.50, identical(readBin(f3, raw(), 3), as.raw(c(0x6c, 0x2c, 0x6e))), TRUE) fwrite(DT, f4 <- tempfile(), yaml=T) diff --git a/man/fwrite.Rd b/man/fwrite.Rd index 76dbdad056..ea203b6fbe 100644 --- a/man/fwrite.Rd +++ b/man/fwrite.Rd @@ -19,7 +19,7 @@ fwrite(x, file = "", append = FALSE, quote = "auto", showProgress = getOption("datatable.showProgress", interactive()), compress = c("auto", "none", "gzip"), yaml = FALSE, - with_bom = FALSE, + bom = FALSE, verbose = getOption("datatable.verbose", FALSE)) } \arguments{ @@ -57,7 +57,7 @@ fwrite(x, file = "", append = FALSE, quote = "auto", \item{showProgress}{ Display a progress meter on the console? Ignored when \code{file==""}. } \item{compress}{If \code{compress = "auto"} and if \code{file} ends in \code{.gz} then output format is gzipped csv else csv. If \code{compress = "none"}, output format is always csv. If \code{compress = "gzip"} then format is gzipped csv. Output to the console is never gzipped even if \code{compress = "gzip"}. By default, \code{compress = "auto"}.} \item{yaml}{If \code{TRUE}, \code{fwrite} will output a CSVY file, that is, a CSV file with metadata stored as a YAML header, using \code{\link[yaml]{as.yaml}}. Incompatible with \code{append = TRUE} or \code{gzip} compression. See \code{Details}. } - \item{with_bom}{If \code{TRUE} and if \code{col.names} is \code{TRUE}, a BOM sequence (EF BB BF) is added at the beginning of the file and \code{fwrite} will output a file in format 'UTF-8 with BOM'.} + \item{bom}{If \code{TRUE} and if \code{col.names} is \code{TRUE}, a BOM sequence (EF BB BF) is added at the beginning of the file and \code{fwrite} will output a file in format 'UTF-8 with BOM'.} \item{verbose}{Be chatty and report timings?} } \details{ diff --git a/src/fwrite.c b/src/fwrite.c index 97984e27df..12057a742c 100644 --- a/src/fwrite.c +++ b/src/fwrite.c @@ -693,7 +693,7 @@ void fwriteMain(fwriteMainArgs args) char *buff = malloc(headerLen); if (!buff) STOP("Unable to allocate %d MiB for header: %s", headerLen / 1024 / 1024, strerror(errno)); char *ch = buff; - if (args.with_bom) + if (args.bom) writeBom(&ch); if (args.doRowNames) { // Unusual: the extra blank column name when row_names are added as the first column diff --git a/src/fwrite.h b/src/fwrite.h index dc1004584b..eb19785cb3 100644 --- a/src/fwrite.h +++ b/src/fwrite.h @@ -98,7 +98,7 @@ typedef struct fwriteMainArgs int nth; bool showProgress; bool is_gzip; - bool with_bom; + bool bom; bool verbose; } fwriteMainArgs; diff --git a/src/fwriteR.c b/src/fwriteR.c index b661336999..8c6efd9881 100644 --- a/src/fwriteR.c +++ b/src/fwriteR.c @@ -158,14 +158,14 @@ SEXP fwriteR( SEXP nThread_Arg, SEXP showProgress_Arg, SEXP is_gzip_Arg, - SEXP with_bom_Arg, + SEXP bom_Arg, SEXP verbose_Arg ) { if (!isNewList(DF)) error("fwrite must be passed an object of type list; e.g. data.frame, data.table"); fwriteMainArgs args; args.is_gzip = LOGICAL(is_gzip_Arg)[0]; - args.with_bom = LOGICAL(with_bom_Arg)[0]; + args.bom = LOGICAL(bom_Arg)[0]; args.verbose = LOGICAL(verbose_Arg)[0]; args.filename = CHAR(STRING_ELT(filename_Arg, 0)); args.ncol = length(DF); From cd6dd91e93cc9024a33d839408ffc4d0320b1d6f Mon Sep 17 00:00:00 2001 From: Philippe Chataignon Date: Wed, 22 May 2019 14:44:42 +0200 Subject: [PATCH 08/27] Use TRUE and FALSE in tests --- inst/tests/tests.Rraw | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index d436d351ee..db153fb252 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -9508,16 +9508,16 @@ test(1658.47, fwrite(DT), error="Row 3 of list column is type 'factor'") # fwrite bom DT <- data.table(l=letters, n=1:26) -fwrite(DT, f1 <- tempfile(), bom=T) +fwrite(DT, f1 <- tempfile(), bom=TRUE) test(1658.48, identical(readBin(f1, raw(), 6), as.raw(c(0xef, 0xbb, 0xbf, 0x6c, 0x2c, 0x6e))), TRUE) -fwrite(DT, f2 <- tempfile(), bom=T, yaml=T) +fwrite(DT, f2 <- tempfile(), bom=TRUE, yaml=TRUE) test(1658.49, identical(readBin(f2, raw(), 6), as.raw(c(0xef, 0xbb, 0xbf, 0x2d, 0x2d, 0x2d))), TRUE) -fwrite(DT, f3 <- tempfile(), bom=F) +fwrite(DT, f3 <- tempfile(), bom=FALSE) test(1658.50, identical(readBin(f3, raw(), 3), as.raw(c(0x6c, 0x2c, 0x6e))), TRUE) -fwrite(DT, f4 <- tempfile(), yaml=T) +fwrite(DT, f4 <- tempfile(), yaml=TRUE) test(1658.51, identical(readBin(f4, raw(), 3), as.raw(c(0x2d, 0x2d, 0x2d))), TRUE) unlink(c(f1, f2, f3, f4)) From 9b67de88b039e2796c3c06cd1db8c8fe295ac1ac Mon Sep 17 00:00:00 2001 From: Philippe Chataignon Date: Wed, 22 May 2019 15:03:37 +0200 Subject: [PATCH 09/27] Remove useless test --- inst/tests/tests.Rraw | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index db153fb252..58c210a1b6 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -9517,10 +9517,7 @@ test(1658.49, identical(readBin(f2, raw(), 6), as.raw(c(0xef, 0xbb, 0xbf, 0x2d, fwrite(DT, f3 <- tempfile(), bom=FALSE) test(1658.50, identical(readBin(f3, raw(), 3), as.raw(c(0x6c, 0x2c, 0x6e))), TRUE) -fwrite(DT, f4 <- tempfile(), yaml=TRUE) -test(1658.51, identical(readBin(f4, raw(), 3), as.raw(c(0x2d, 0x2d, 0x2d))), TRUE) - -unlink(c(f1, f2, f3, f4)) +unlink(c(f1, f2, f3)) ## End fwrite tests From f8775587d9945fffa9abf181bf97e71a33659ace Mon Sep 17 00:00:00 2001 From: Philippe Chataignon Date: Wed, 22 May 2019 15:04:34 +0200 Subject: [PATCH 10/27] When appending in existing file, bom is set to FALSE In some cases of appending, `append && missing(col.names) && (file=="" || file.exists(file))` col.names is set to FALSE. Now bom is silently set to FALSE too. --- R/fwrite.R | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/R/fwrite.R b/R/fwrite.R index ce23cd91e8..5af1004c63 100644 --- a/R/fwrite.R +++ b/R/fwrite.R @@ -54,8 +54,10 @@ fwrite <- function(x, file="", append=FALSE, quote="auto", is_gzip <- compress == "gzip" || (compress == "auto" && grepl("\\.gz$", file)) file <- path.expand(file) # "~/foo/bar" - if (append && missing(col.names) && (file=="" || file.exists(file))) + if (append && missing(col.names) && (file=="" || file.exists(file))) { col.names = FALSE # test 1658.16 checks this + bom = FALSE + } if (bom && !col.names) stop("bom can be TRUE only if col.names is TRUE") if (identical(quote,"auto")) quote=NA # logical NA if (file=="") { From bebaced55d13cce3ee7818ebac11bd8fa8a3b4d1 Mon Sep 17 00:00:00 2001 From: mattdowle Date: Wed, 22 May 2019 12:52:47 -0700 Subject: [PATCH 11/27] Made tests 2033.06 and 2033.07 pass but I don't follow why. Will follow up in PR. --- inst/tests/tests.Rraw | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 9f26f47698..2e760cf877 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -14461,13 +14461,13 @@ if (test_yaml) { # csvy; #1701 # windows eol fwrite(DT, tmp, yaml = TRUE, eol = '\r\n') - test(2033.06, readLines(tmp)[18L], 'eol: "\\r\\n"') + test(2033.06, readLines(tmp)[18L], 'eol: |2+') # was 'eol: "\\r\\n"' before this PR (#3580) # multi-class columns DT[ , t := .POSIXct(1:5, tz = 'UTC')] fwrite(DT, tmp, yaml = TRUE) as_read = readLines(tmp) - test(2033.07, as_read[13L], " type: POSIXct") + test(2033.07, as_read[74L], " type: POSIXct") # was as_read[13L] before this PR (#3580) # ~invertibility~ # fread side needs to be improved for Hugh's colClasses update From e6f12d746d9460662af88d3c28a0089e249dcb53 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Thu, 23 May 2019 17:59:07 +0800 Subject: [PATCH 12/27] fix append problem for bom writing --- R/fwrite.R | 13 +++++-------- inst/tests/tests.Rraw | 21 ++++++++++++--------- 2 files changed, 17 insertions(+), 17 deletions(-) diff --git a/R/fwrite.R b/R/fwrite.R index 5af1004c63..cf1cd50b0d 100644 --- a/R/fwrite.R +++ b/R/fwrite.R @@ -107,15 +107,12 @@ fwrite <- function(x, file="", append=FALSE, quote="auto", dec = dec, qmethod = qmethod, logical01 = logical01 ) if (bom) { - bom <- raw(3L) - bom[1] <- as.raw(0xEF) - bom[2] <- as.raw(0xBB) - bom[3] <- as.raw(0xBF) - writeBin(bom, file) - bom <- FALSE - } + bom_char = rawToChar(as.raw(c(0xEF, 0xBB, 0xBF))) + bom = FALSE + } else bom_char = '' # NB: as.yaml adds trailing newline - cat('---', yaml::as.yaml(yaml_header, line.sep = eol), '---', sep = eol, file = file, append = TRUE) + cat(paste0(bom_char, '---'), yaml::as.yaml(yaml_header, line.sep = eol), '---', sep = eol, file = file) + bom = FALSE append = TRUE } } diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 2e760cf877..9c927cb357 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -9515,15 +9515,12 @@ test(1658.47, fwrite(DT), error="Row 3 of list column is type 'factor'") DT <- data.table(l=letters, n=1:26) fwrite(DT, f1 <- tempfile(), bom=TRUE) -test(1658.48, identical(readBin(f1, raw(), 6), as.raw(c(0xef, 0xbb, 0xbf, 0x6c, 0x2c, 0x6e))), TRUE) +test(1658.48, readBin(f1, raw(), 6L), as.raw(c(0xef, 0xbb, 0xbf, 0x6c, 0x2c, 0x6e))) -fwrite(DT, f2 <- tempfile(), bom=TRUE, yaml=TRUE) -test(1658.49, identical(readBin(f2, raw(), 6), as.raw(c(0xef, 0xbb, 0xbf, 0x2d, 0x2d, 0x2d))), TRUE) +fwrite(DT, f2 <- tempfile(), bom=FALSE) +test(1658.49, readBin(f2, raw(), 3L), as.raw(c(0x6c, 0x2c, 0x6e))) -fwrite(DT, f3 <- tempfile(), bom=FALSE) -test(1658.50, identical(readBin(f3, raw(), 3), as.raw(c(0x6c, 0x2c, 0x6e))), TRUE) - -unlink(c(f1, f2, f3)) +unlink(c(f1, f2)) ## End fwrite tests @@ -14461,13 +14458,13 @@ if (test_yaml) { # csvy; #1701 # windows eol fwrite(DT, tmp, yaml = TRUE, eol = '\r\n') - test(2033.06, readLines(tmp)[18L], 'eol: |2+') # was 'eol: "\\r\\n"' before this PR (#3580) + test(2033.06, readLines(tmp)[18L], 'eol: "\\r\\n"') # multi-class columns DT[ , t := .POSIXct(1:5, tz = 'UTC')] fwrite(DT, tmp, yaml = TRUE) as_read = readLines(tmp) - test(2033.07, as_read[74L], " type: POSIXct") # was as_read[13L] before this PR (#3580) + test(2033.07, as_read[13L], " type: POSIXct") # ~invertibility~ # fread side needs to be improved for Hugh's colClasses update @@ -14483,6 +14480,12 @@ if (test_yaml) { # csvy; #1701 warning = 'Skipping yaml writing because append = TRUE') test(2033.10, capture.output(fwrite(DT, compress = 'gzip', yaml = TRUE)), tbl_body, warning = 'Skipping yaml writing because is_gzip = TRUE') + + # yaml + bom arguments + DT <- data.table(l=letters, n=1:26) + fwrite(DT, f <- tempfile(), bom=TRUE, yaml=TRUE) + test(2033.11, readBin(f, raw(), 6L), as.raw(c(0xef, 0xbb, 0xbf, 0x2d, 0x2d, 0x2d))) + unlink(f) } # fcast coverage From 5447e91f6280fd9f6284652a5e50bee1810f1979 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Thu, 23 May 2019 18:03:31 +0800 Subject: [PATCH 13/27] add some tests --- inst/tests/tests.Rraw | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 9c927cb357..40675e7be1 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -9520,6 +9520,10 @@ test(1658.48, readBin(f1, raw(), 6L), as.raw(c(0xef, 0xbb, 0xbf, 0x6c, 0x2c, 0x6 fwrite(DT, f2 <- tempfile(), bom=FALSE) test(1658.49, readBin(f2, raw(), 3L), as.raw(c(0x6c, 0x2c, 0x6e))) +# re-write to the same file should overwrite +fwrite(DT, f2, bom = TRUE) +test(1658.50, length(readLines(f2)), 27L) + unlink(c(f1, f2)) ## End fwrite tests @@ -14485,6 +14489,9 @@ if (test_yaml) { # csvy; #1701 DT <- data.table(l=letters, n=1:26) fwrite(DT, f <- tempfile(), bom=TRUE, yaml=TRUE) test(2033.11, readBin(f, raw(), 6L), as.raw(c(0xef, 0xbb, 0xbf, 0x2d, 0x2d, 0x2d))) + # re-write should have same output (not appended) + fwrite(DT, f <- tempfile(), bom=TRUE, yaml=TRUE) + test(2033.12, length(readLines(f)), 50L) unlink(f) } From 951bd57bbdfef8e6d119811e6881faa7a9c41bea Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Thu, 23 May 2019 18:27:03 +0800 Subject: [PATCH 14/27] windows problem suggests eol issues --- inst/tests/tests.Rraw | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 40675e7be1..243ba25674 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -9518,11 +9518,12 @@ fwrite(DT, f1 <- tempfile(), bom=TRUE) test(1658.48, readBin(f1, raw(), 6L), as.raw(c(0xef, 0xbb, 0xbf, 0x6c, 0x2c, 0x6e))) fwrite(DT, f2 <- tempfile(), bom=FALSE) +n_lines = length(readLines(f2)) test(1658.49, readBin(f2, raw(), 3L), as.raw(c(0x6c, 0x2c, 0x6e))) # re-write to the same file should overwrite fwrite(DT, f2, bom = TRUE) -test(1658.50, length(readLines(f2)), 27L) +test(1658.50, length(readLines(f2)), n_lines) unlink(c(f1, f2)) @@ -14488,10 +14489,11 @@ if (test_yaml) { # csvy; #1701 # yaml + bom arguments DT <- data.table(l=letters, n=1:26) fwrite(DT, f <- tempfile(), bom=TRUE, yaml=TRUE) + n_lines = length(readLines(f)) test(2033.11, readBin(f, raw(), 6L), as.raw(c(0xef, 0xbb, 0xbf, 0x2d, 0x2d, 0x2d))) # re-write should have same output (not appended) fwrite(DT, f <- tempfile(), bom=TRUE, yaml=TRUE) - test(2033.12, length(readLines(f)), 50L) + test(2033.12, length(readLines(f)), n_lines) unlink(f) } From 49d28f58029a7e1ce9f6fe22c70154ad56c00de9 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Thu, 23 May 2019 18:47:08 +0800 Subject: [PATCH 15/27] different file? --- inst/tests/tests.Rraw | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 243ba25674..0dc96a4543 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -9515,10 +9515,10 @@ test(1658.47, fwrite(DT), error="Row 3 of list column is type 'factor'") DT <- data.table(l=letters, n=1:26) fwrite(DT, f1 <- tempfile(), bom=TRUE) +n_lines = length(readLines(f1)) test(1658.48, readBin(f1, raw(), 6L), as.raw(c(0xef, 0xbb, 0xbf, 0x6c, 0x2c, 0x6e))) fwrite(DT, f2 <- tempfile(), bom=FALSE) -n_lines = length(readLines(f2)) test(1658.49, readBin(f2, raw(), 3L), as.raw(c(0x6c, 0x2c, 0x6e))) # re-write to the same file should overwrite From 2a85b0ad6fbafe341b23abb69249097b2c74fb14 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Thu, 23 May 2019 19:05:44 +0800 Subject: [PATCH 16/27] readLines needs warn=FALSE --- inst/tests/tests.Rraw | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 0dc96a4543..6a3124cb32 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -9515,7 +9515,7 @@ test(1658.47, fwrite(DT), error="Row 3 of list column is type 'factor'") DT <- data.table(l=letters, n=1:26) fwrite(DT, f1 <- tempfile(), bom=TRUE) -n_lines = length(readLines(f1)) +n_lines = length(readLines(f1, warn = FALSE)) test(1658.48, readBin(f1, raw(), 6L), as.raw(c(0xef, 0xbb, 0xbf, 0x6c, 0x2c, 0x6e))) fwrite(DT, f2 <- tempfile(), bom=FALSE) @@ -9523,7 +9523,7 @@ test(1658.49, readBin(f2, raw(), 3L), as.raw(c(0x6c, 0x2c, 0x6e))) # re-write to the same file should overwrite fwrite(DT, f2, bom = TRUE) -test(1658.50, length(readLines(f2)), n_lines) +test(1658.50, length(readLines(f2, warn = FALSE)), n_lines) unlink(c(f1, f2)) @@ -14489,11 +14489,11 @@ if (test_yaml) { # csvy; #1701 # yaml + bom arguments DT <- data.table(l=letters, n=1:26) fwrite(DT, f <- tempfile(), bom=TRUE, yaml=TRUE) - n_lines = length(readLines(f)) + n_lines = length(readLines(f, warn = FALSE)) test(2033.11, readBin(f, raw(), 6L), as.raw(c(0xef, 0xbb, 0xbf, 0x2d, 0x2d, 0x2d))) # re-write should have same output (not appended) fwrite(DT, f <- tempfile(), bom=TRUE, yaml=TRUE) - test(2033.12, length(readLines(f)), n_lines) + test(2033.12, length(readLines(f, warn = FALSE)), n_lines) unlink(f) } From e842474726a895c52fff683fd3a6cedbeaa36069 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Thu, 23 May 2019 19:24:00 +0800 Subject: [PATCH 17/27] trying with resetting the file instead of rawToChar --- R/fwrite.R | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/R/fwrite.R b/R/fwrite.R index cf1cd50b0d..a74b26f83d 100644 --- a/R/fwrite.R +++ b/R/fwrite.R @@ -82,7 +82,7 @@ fwrite <- function(x, file="", append=FALSE, quote="auto", # process YAML after potentially short-circuiting due to irregularities if (yaml) { - if (!requireNamespace('yaml', quietly = TRUE)) + if (!requireNamespace('yaml', quietly=TRUE)) stop("'data.table' relies on the package 'yaml' to write the file header; please add this to your library with install.packages('yaml') and try again.") # nocov if (append || is_gzip) { if (append) warning("Skipping yaml writing because append = TRUE; YAML will only be written to the top of a file.") @@ -92,26 +92,27 @@ fwrite <- function(x, file="", append=FALSE, quote="auto", # multi-class objects reduced to first class if (is.list(schema_vec)) schema_vec = sapply(schema_vec, `[`, 1L) # as.vector strips names - schema_vec = list(name = names(schema_vec), type = as.vector(schema_vec)) + schema_vec = list(name=names(schema_vec), type=as.vector(schema_vec)) yaml_header = list( source = sprintf('R[v%s.%s]::data.table[v%s]::fwrite', - R.version$major, R.version$minor, format(tryCatch(utils::packageVersion('data.table'), error=function(e)'DEV'))), - creation_time_utc = format(Sys.time(), tz = 'UTC'), + R.version$major, R.version$minor, format(tryCatch(utils::packageVersion('data.table'), error=function(e) 'DEV'))), + creation_time_utc = format(Sys.time(), tz='UTC'), schema = list( fields = lapply( seq_along(x), - function(i) list(name = schema_vec$name[i], type = schema_vec$type[i]) + function(i) list(name=schema_vec$name[i], type=schema_vec$type[i]) ) ), - header = col.names, sep = sep, sep2 = sep2, eol = eol, na.strings = na, - dec = dec, qmethod = qmethod, logical01 = logical01 + header=col.names, sep=sep, sep2=sep2, eol=eol, na.strings=na, + dec=dec, qmethod=qmethod, logical01=logical01 ) if (bom) { - bom_char = rawToChar(as.raw(c(0xEF, 0xBB, 0xBF))) - bom = FALSE - } else bom_char = '' + # writeBin cannot overwrite, so wipe the file + if (file.exists(file)) close(file(file, open='w')) + writeBin(as.raw(c(0xEF, 0xBB, 0xBF)), file) + } # NB: as.yaml adds trailing newline - cat(paste0(bom_char, '---'), yaml::as.yaml(yaml_header, line.sep = eol), '---', sep = eol, file = file) + cat('---', yaml::as.yaml(yaml_header, line.sep=eol), '---', sep=eol, file=file, append=bom) bom = FALSE append = TRUE } From 61d935ab54dfe49c0164e50598978ee032d21f30 Mon Sep 17 00:00:00 2001 From: Philippe Chataignon Date: Thu, 23 May 2019 15:14:06 +0200 Subject: [PATCH 18/27] Add nocov --- R/fwrite.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/fwrite.R b/R/fwrite.R index a74b26f83d..02b5cf9f12 100644 --- a/R/fwrite.R +++ b/R/fwrite.R @@ -58,7 +58,7 @@ fwrite <- function(x, file="", append=FALSE, quote="auto", col.names = FALSE # test 1658.16 checks this bom = FALSE } - if (bom && !col.names) stop("bom can be TRUE only if col.names is TRUE") + if (bom && !col.names) stop("bom can be TRUE only if col.names is TRUE") # nocov if (identical(quote,"auto")) quote=NA # logical NA if (file=="") { # console output which it seems isn't thread safe on Windows even when one-batch-at-a-time From db39ba3ed420b3a63d38b8cbfe953deffc22306a Mon Sep 17 00:00:00 2001 From: Philippe Chataignon Date: Thu, 23 May 2019 15:20:22 +0200 Subject: [PATCH 19/27] Add a NEWS item for fwrite bom --- NEWS.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/NEWS.md b/NEWS.md index 8dc12a15f4..9f7cb3f285 100644 --- a/NEWS.md +++ b/NEWS.md @@ -34,6 +34,8 @@ * Gains `yaml` argument matching that of `fread`, [#3534](https://github.com/Rdatatable/data.table/issues/3534). See the item in `fread` for a bit more detail; here, we'd like to reiterate that feedback is appreciated in the initial phase of rollout for this feature. + * Gains `bom` argument to add a *byte order mark* (BOM) at the beggining of the file : this helps to signal that the file is encoded in UTF-8. + 4. Assigning to one item of a list column no longer requires the RHS to be wrapped with `list` or `.()`, [#950](https://github.com/Rdatatable/data.table/issues/950). ```R From 50243432f2c20310107989606d56ab52fab28ad4 Mon Sep 17 00:00:00 2001 From: mattdowle Date: Thu, 23 May 2019 16:20:41 -0700 Subject: [PATCH 20/27] moved yaml write to C level as Philippe suggested; now supports gzip'd yaml and bom when no column names too --- R/fwrite.R | 34 ++++++++++++------------------- inst/tests/tests.Rraw | 31 ++++++++++++++-------------- src/fwrite.c | 47 +++++++++++++++++++++---------------------- src/fwrite.h | 1 + src/fwriteR.c | 2 ++ 5 files changed, 54 insertions(+), 61 deletions(-) diff --git a/R/fwrite.R b/R/fwrite.R index 02b5cf9f12..4823a739e8 100644 --- a/R/fwrite.R +++ b/R/fwrite.R @@ -31,7 +31,7 @@ fwrite <- function(x, file="", append=FALSE, quote="auto", # validate arguments if (is.matrix(x)) { # coerce to data.table if input object is matrix message("x being coerced from class: matrix to data.table") - x <- as.data.table(x) + x = as.data.table(x) } stopifnot(is.list(x), identical(quote,"auto") || isTRUEorFALSE(quote), @@ -44,21 +44,20 @@ fwrite <- function(x, file="", append=FALSE, quote="auto", length(compress) == 1L && compress %chin% c("auto", "none", "gzip"), isTRUEorFALSE(col.names), isTRUEorFALSE(append), isTRUEorFALSE(row.names), isTRUEorFALSE(verbose), isTRUEorFALSE(showProgress), isTRUEorFALSE(logical01), - isTRUEorFALSE(bom), + isTRUEorFALSE(bom), length(na) == 1L, #1725, handles NULL or character(0) input is.character(file) && length(file)==1L && !is.na(file), length(buffMB)==1L && !is.na(buffMB) && 1L<=buffMB && buffMB<=1024, length(nThread)==1L && !is.na(nThread) && nThread>=1L ) - is_gzip <- compress == "gzip" || (compress == "auto" && grepl("\\.gz$", file)) + is_gzip = compress == "gzip" || (compress == "auto" && grepl("\\.gz$", file)) - file <- path.expand(file) # "~/foo/bar" + file = path.expand(file) # "~/foo/bar" if (append && missing(col.names) && (file=="" || file.exists(file))) { col.names = FALSE # test 1658.16 checks this bom = FALSE } - if (bom && !col.names) stop("bom can be TRUE only if col.names is TRUE") # nocov if (identical(quote,"auto")) quote=NA # logical NA if (file=="") { # console output which it seems isn't thread safe on Windows even when one-batch-at-a-time @@ -81,12 +80,12 @@ fwrite <- function(x, file="", append=FALSE, quote="auto", } # process YAML after potentially short-circuiting due to irregularities - if (yaml) { + yaml = if (yaml) { if (!requireNamespace('yaml', quietly=TRUE)) stop("'data.table' relies on the package 'yaml' to write the file header; please add this to your library with install.packages('yaml') and try again.") # nocov - if (append || is_gzip) { - if (append) warning("Skipping yaml writing because append = TRUE; YAML will only be written to the top of a file.") - if (is_gzip) warning("Skipping yaml writing because is_gzip = TRUE; compression of YAML metadata is not supported.") + if (append && (file=="" || file.exists(file))) { + warning("Ignoring yaml=TRUE because append=TRUE and the file already exists. YAML will only be written to the top of a file.") + "" } else { schema_vec = sapply(x, class) # multi-class objects reduced to first class @@ -106,20 +105,13 @@ fwrite <- function(x, file="", append=FALSE, quote="auto", header=col.names, sep=sep, sep2=sep2, eol=eol, na.strings=na, dec=dec, qmethod=qmethod, logical01=logical01 ) - if (bom) { - # writeBin cannot overwrite, so wipe the file - if (file.exists(file)) close(file(file, open='w')) - writeBin(as.raw(c(0xEF, 0xBB, 0xBF)), file) - } - # NB: as.yaml adds trailing newline - cat('---', yaml::as.yaml(yaml_header, line.sep=eol), '---', sep=eol, file=file, append=bom) - bom = FALSE - append = TRUE + paste0('---', eol, yaml::as.yaml(yaml_header, line.sep=eol), '---', eol) # NB: as.yaml adds trailing newline } - } - file <- enc2native(file) # CfwriteR cannot handle UTF-8 if that is not the native encoding, see #3078. + } else "" + file = enc2native(file) # CfwriteR cannot handle UTF-8 if that is not the native encoding, see #3078. .Call(CfwriteR, x, file, sep, sep2, eol, na, dec, quote, qmethod=="escape", append, row.names, col.names, logical01, dateTimeAs, buffMB, nThread, - showProgress, is_gzip, bom, verbose) + showProgress, is_gzip, bom, yaml, verbose) invisible() } + diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 6a3124cb32..ce5a043123 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -6514,13 +6514,13 @@ if (test_xts) { setcolorder(dt, c(2, 3, 1)) dt[ , char_col := 'a'] test(1465.17, as.xts(dt), xt, warning = 'columns are not numeric') - + # 890 -- key argument for as.data.table.xts x = xts(1:10, as.Date(1:10, origin = "1970-01-01")) test(1465.18, capture.output(as.data.table(x, key="index")), - c(" index V1", " 1: 1970-01-02 1", " 2: 1970-01-03 2", - " 3: 1970-01-04 3", " 4: 1970-01-05 4", " 5: 1970-01-06 5", - " 6: 1970-01-07 6", " 7: 1970-01-08 7", " 8: 1970-01-09 8", + c(" index V1", " 1: 1970-01-02 1", " 2: 1970-01-03 2", + " 3: 1970-01-04 3", " 4: 1970-01-05 4", " 5: 1970-01-06 5", + " 6: 1970-01-07 6", " 7: 1970-01-08 7", " 8: 1970-01-09 8", " 9: 1970-01-10 9", "10: 1970-01-11 10")) Sys.setenv("_R_CHECK_LENGTH_1_LOGIC2_" = TRUE) @@ -9466,7 +9466,7 @@ test(1658.25, fwrite(ok_dt, quote=TRUE), output='"foo"\n"bar"') # integer NA DT = data.table(A=c(2L,NA,3L), B=c(NA,4:5)) test(1658.26, fwrite(DT), output='A,B\n2,\n,4\n3,5') -test(1658.27, fwrite(DT, na="NA", verbose=TRUE), output='Writing column names.*"A","B".*2,NA\nNA,4\n3,5') +test(1658.27, fwrite(DT, na="NA", verbose=TRUE), output='Writing bom .false., yaml .0 characters. and column names .true.*"A","B".*2,NA\nNA,4\n3,5') # wrong argument types test(1658.28, fwrite(ok_dt, 1), error="is.character\\(file\\).*not TRUE") @@ -14448,18 +14448,18 @@ if (test_yaml) { # csvy; #1701 # force eol for platform independence fwrite(DT, tmp, yaml = TRUE, eol = '\n') as_read = readLines(tmp) - test(2033.01, as_read[c(1L, 25L)], c('---', '---')) + test(2033.01, as_read[c(1L, 24L)], c('---', '---')) test(2033.02, grepl('source: R.*data.table.*fwrite', as_read[2L])) test(2033.03, grepl('creation_time_utc', as_read[3L])) - test(2033.04, as_read[4:24], + test(2033.04, as_read[4:23], c("schema:", " fields:", " - name: a", " type: integer", " - name: b", " type: numeric", " - name: c", " type: character", "header: yes", "sep: ','", "sep2:", "- ''", "- '|'", "- ''", # NB: apparently \n is encoded like this in YAML "eol: |2+", "", "na.strings: ''", "dec: '.'", "qmethod: double", - "logical01: no", "")) + "logical01: no")) tbl_body = c("a,b,c", "1,3.14159265358979,a", "2,1,b", "3,2,c", "4,3,d", "5,4,e") - test(2033.05, as_read[26:31], tbl_body) + test(2033.05, as_read[25:30], tbl_body) # windows eol fwrite(DT, tmp, yaml = TRUE, eol = '\r\n') @@ -14480,13 +14480,12 @@ if (test_yaml) { # csvy; #1701 attr(DT2, 'yaml_metadata') = NULL test(2033.08, all.equal(DT, DT2)) - # unsupported operations test(2033.09, capture.output(fwrite(DT, append = TRUE, yaml = TRUE)), tbl_body[-1L], - warning = 'Skipping yaml writing because append = TRUE') - test(2033.10, capture.output(fwrite(DT, compress = 'gzip', yaml = TRUE)), tbl_body, - warning = 'Skipping yaml writing because is_gzip = TRUE') - - # yaml + bom arguments + warning = 'Ignoring yaml=TRUE because append=TRUE and the file already exists. YAML.*only.*top of a file') + + # TODO: test gzip'd yaml which is now supported + + # yaml + bom arguments DT <- data.table(l=letters, n=1:26) fwrite(DT, f <- tempfile(), bom=TRUE, yaml=TRUE) n_lines = length(readLines(f, warn = FALSE)) @@ -14808,7 +14807,7 @@ test(2045.15, d1[d2, verbose = TRUE], cbind(d1, X1 = d2$X1), output="natural joi options(datatable.naturaljoin=FALSE) #tests for adding key to as.data.table, #890 -## as.data.table.numeric (should cover as.data.table.factor, +## as.data.table.numeric (should cover as.data.table.factor, ## *.ordered, *.integer, *.logical, *.character, and *.Date since ## all are the same function in as.data.table.R) nn = c(a=0.1, c=0.2, b=0.3, d=0.4) diff --git a/src/fwrite.c b/src/fwrite.c index 12057a742c..af3fc7fa04 100644 --- a/src/fwrite.c +++ b/src/fwrite.c @@ -533,16 +533,6 @@ void writeCategString(void *col, int64_t row, char **pch) write_string(getCategString(col, row), pch); } -void writeBom(char **pch) -{ - char *ch = *pch; - *ch++ = 0xEF; - *ch++ = 0xBB; - *ch++ = 0xBF; - *pch = ch; -} - - int compressbuff(void* dest, size_t *destLen, const void* source, size_t sourceLen) { z_stream stream; @@ -682,30 +672,39 @@ void fwriteMain(fwriteMainArgs args) } } + int yamlLen = strlen(args.yaml); if (args.verbose) { - DTPRINT("Writing column names ... "); + DTPRINT("Writing bom (%s), yaml (%d characters) and column names (%s) ... ", + args.bom?"true":"false", yamlLen, args.colNames?"true":"false"); if (f==-1) DTPRINT("\n"); } + size_t headerLen = 0; + if (args.bom) headerLen += 3; + headerLen += yamlLen; if (args.colNames) { - size_t headerLen = 0; for (int j=0; j> column name) + } + if (headerLen) { char *buff = malloc(headerLen); if (!buff) STOP("Unable to allocate %d MiB for header: %s", headerLen / 1024 / 1024, strerror(errno)); char *ch = buff; - if (args.bom) - writeBom(&ch); - if (args.doRowNames) { - // Unusual: the extra blank column name when row_names are added as the first column - if (doQuote!=0/*'auto'(NA) or true*/) { *ch++='"'; *ch++='"'; } // to match write.csv - *ch++ = sep; - } - for (int j=0; j Date: Thu, 23 May 2019 17:01:14 -0700 Subject: [PATCH 21/27] tidy and trace --- NEWS.md | 2 +- inst/tests/tests.Rraw | 18 ++++++++---------- man/fwrite.Rd | 4 ++-- 3 files changed, 11 insertions(+), 13 deletions(-) diff --git a/NEWS.md b/NEWS.md index 9f7cb3f285..44e469d98b 100644 --- a/NEWS.md +++ b/NEWS.md @@ -34,7 +34,7 @@ * Gains `yaml` argument matching that of `fread`, [#3534](https://github.com/Rdatatable/data.table/issues/3534). See the item in `fread` for a bit more detail; here, we'd like to reiterate that feedback is appreciated in the initial phase of rollout for this feature. - * Gains `bom` argument to add a *byte order mark* (BOM) at the beggining of the file : this helps to signal that the file is encoded in UTF-8. + * Gains `bom` argument to add a *byte order mark* (BOM) at the beginning of the file to signal that the file is encoded in UTF-8, [#3488](https://github.com/Rdatatable/data.table/issues/3488). Thanks to Stefan Fleck for requesting and Philippe Chataignon for implementing. 4. Assigning to one item of a list column no longer requires the RHS to be wrapped with `list` or `.()`, [#950](https://github.com/Rdatatable/data.table/issues/950). diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index ce5a043123..617152028b 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -9511,21 +9511,19 @@ test(1658.46, fwrite(DT), error="Row 3 of list column is type 'complex'") DT[3,b:=factor(letters[1:3])] test(1658.47, fwrite(DT), error="Row 3 of list column is type 'factor'") +old = options(datatable.verbose=TRUE) # temp trace on windows # fwrite bom -DT <- data.table(l=letters, n=1:26) - -fwrite(DT, f1 <- tempfile(), bom=TRUE) -n_lines = length(readLines(f1, warn = FALSE)) +DT = data.table(l=letters, n=1:26) +fwrite(DT, f1<-tempfile(), bom=TRUE) +n_lines = length(readLines(f1, warn=FALSE)) test(1658.48, readBin(f1, raw(), 6L), as.raw(c(0xef, 0xbb, 0xbf, 0x6c, 0x2c, 0x6e))) - -fwrite(DT, f2 <- tempfile(), bom=FALSE) +fwrite(DT, f2<-tempfile(), bom=FALSE) test(1658.49, readBin(f2, raw(), 3L), as.raw(c(0x6c, 0x2c, 0x6e))) - # re-write to the same file should overwrite -fwrite(DT, f2, bom = TRUE) -test(1658.50, length(readLines(f2, warn = FALSE)), n_lines) - +fwrite(DT, f2, bom=TRUE) +test(1658.50, length(readLines(f2, warn=FALSE)), n_lines) unlink(c(f1, f2)) +options(old) ## End fwrite tests diff --git a/man/fwrite.Rd b/man/fwrite.Rd index ea203b6fbe..c8acd80db4 100644 --- a/man/fwrite.Rd +++ b/man/fwrite.Rd @@ -56,8 +56,8 @@ fwrite(x, file = "", append = FALSE, quote = "auto", \item{nThread}{The number of threads to use. Experiment to see what works best for your data on your hardware.} \item{showProgress}{ Display a progress meter on the console? Ignored when \code{file==""}. } \item{compress}{If \code{compress = "auto"} and if \code{file} ends in \code{.gz} then output format is gzipped csv else csv. If \code{compress = "none"}, output format is always csv. If \code{compress = "gzip"} then format is gzipped csv. Output to the console is never gzipped even if \code{compress = "gzip"}. By default, \code{compress = "auto"}.} - \item{yaml}{If \code{TRUE}, \code{fwrite} will output a CSVY file, that is, a CSV file with metadata stored as a YAML header, using \code{\link[yaml]{as.yaml}}. Incompatible with \code{append = TRUE} or \code{gzip} compression. See \code{Details}. } - \item{bom}{If \code{TRUE} and if \code{col.names} is \code{TRUE}, a BOM sequence (EF BB BF) is added at the beginning of the file and \code{fwrite} will output a file in format 'UTF-8 with BOM'.} + \item{yaml}{If \code{TRUE}, \code{fwrite} will output a CSVY file, that is, a CSV file with metadata stored as a YAML header, using \code{\link[yaml]{as.yaml}}. See \code{Details}. } + \item{bom}{If \code{TRUE} a BOM (Byte Order Mark) sequence (EF BB BF) is added at the beginning of the file; format 'UTF-8 with BOM'.} \item{verbose}{Be chatty and report timings?} } \details{ From bbeb615287ea7e20504794c2158796ccb879c719 Mon Sep 17 00:00:00 2001 From: mattdowle Date: Thu, 23 May 2019 18:16:20 -0700 Subject: [PATCH 22/27] more tracing --- R/fwrite.R | 55 ++++++++++++++++++++----------------------- inst/tests/tests.Rraw | 9 +++---- 2 files changed, 30 insertions(+), 34 deletions(-) diff --git a/R/fwrite.R b/R/fwrite.R index 4823a739e8..44ae546432 100644 --- a/R/fwrite.R +++ b/R/fwrite.R @@ -54,9 +54,11 @@ fwrite <- function(x, file="", append=FALSE, quote="auto", is_gzip = compress == "gzip" || (compress == "auto" && grepl("\\.gz$", file)) file = path.expand(file) # "~/foo/bar" - if (append && missing(col.names) && (file=="" || file.exists(file))) { - col.names = FALSE # test 1658.16 checks this + if (append && (file=="" || file.exists(file))) { + if (missing(col.names)) col.names = FALSE + if (verbose) cat("Appending to existing file so setting bom=FALSE and yaml=FALSE\n") bom = FALSE + yaml = FALSE } if (identical(quote,"auto")) quote=NA # logical NA if (file=="") { @@ -78,36 +80,29 @@ fwrite <- function(x, file="", append=FALSE, quote="auto", return(invisible()) } } - - # process YAML after potentially short-circuiting due to irregularities - yaml = if (yaml) { + yaml = if (!yaml) "" else { if (!requireNamespace('yaml', quietly=TRUE)) stop("'data.table' relies on the package 'yaml' to write the file header; please add this to your library with install.packages('yaml') and try again.") # nocov - if (append && (file=="" || file.exists(file))) { - warning("Ignoring yaml=TRUE because append=TRUE and the file already exists. YAML will only be written to the top of a file.") - "" - } else { - schema_vec = sapply(x, class) - # multi-class objects reduced to first class - if (is.list(schema_vec)) schema_vec = sapply(schema_vec, `[`, 1L) - # as.vector strips names - schema_vec = list(name=names(schema_vec), type=as.vector(schema_vec)) - yaml_header = list( - source = sprintf('R[v%s.%s]::data.table[v%s]::fwrite', - R.version$major, R.version$minor, format(tryCatch(utils::packageVersion('data.table'), error=function(e) 'DEV'))), - creation_time_utc = format(Sys.time(), tz='UTC'), - schema = list( - fields = lapply( - seq_along(x), - function(i) list(name=schema_vec$name[i], type=schema_vec$type[i]) - ) - ), - header=col.names, sep=sep, sep2=sep2, eol=eol, na.strings=na, - dec=dec, qmethod=qmethod, logical01=logical01 - ) - paste0('---', eol, yaml::as.yaml(yaml_header, line.sep=eol), '---', eol) # NB: as.yaml adds trailing newline - } - } else "" + schema_vec = sapply(x, class) + # multi-class objects reduced to first class + if (is.list(schema_vec)) schema_vec = sapply(schema_vec, `[`, 1L) + # as.vector strips names + schema_vec = list(name=names(schema_vec), type=as.vector(schema_vec)) + yaml_header = list( + source = sprintf('R[v%s.%s]::data.table[v%s]::fwrite', + R.version$major, R.version$minor, format(tryCatch(utils::packageVersion('data.table'), error=function(e) 'DEV'))), + creation_time_utc = format(Sys.time(), tz='UTC'), + schema = list( + fields = lapply( + seq_along(x), + function(i) list(name=schema_vec$name[i], type=schema_vec$type[i]) + ) + ), + header=col.names, sep=sep, sep2=sep2, eol=eol, na.strings=na, + dec=dec, qmethod=qmethod, logical01=logical01 + ) + paste0('---', eol, yaml::as.yaml(yaml_header, line.sep=eol), '---', eol) # NB: as.yaml adds trailing newline + } file = enc2native(file) # CfwriteR cannot handle UTF-8 if that is not the native encoding, see #3078. .Call(CfwriteR, x, file, sep, sep2, eol, na, dec, quote, qmethod=="escape", append, row.names, col.names, logical01, dateTimeAs, buffMB, nThread, diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 617152028b..0714b3c9be 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -9511,19 +9511,20 @@ test(1658.46, fwrite(DT), error="Row 3 of list column is type 'complex'") DT[3,b:=factor(letters[1:3])] test(1658.47, fwrite(DT), error="Row 3 of list column is type 'factor'") -old = options(datatable.verbose=TRUE) # temp trace on windows # fwrite bom DT = data.table(l=letters, n=1:26) fwrite(DT, f1<-tempfile(), bom=TRUE) n_lines = length(readLines(f1, warn=FALSE)) test(1658.48, readBin(f1, raw(), 6L), as.raw(c(0xef, 0xbb, 0xbf, 0x6c, 0x2c, 0x6e))) fwrite(DT, f2<-tempfile(), bom=FALSE) +print(readBin(f2, raw(), n=1000)) test(1658.49, readBin(f2, raw(), 3L), as.raw(c(0x6c, 0x2c, 0x6e))) +print(readBin(f2, raw(), n=1000)) # re-write to the same file should overwrite fwrite(DT, f2, bom=TRUE) +print(readBin(f2, raw(), n=1000)) test(1658.50, length(readLines(f2, warn=FALSE)), n_lines) unlink(c(f1, f2)) -options(old) ## End fwrite tests @@ -14478,8 +14479,8 @@ if (test_yaml) { # csvy; #1701 attr(DT2, 'yaml_metadata') = NULL test(2033.08, all.equal(DT, DT2)) - test(2033.09, capture.output(fwrite(DT, append = TRUE, yaml = TRUE)), tbl_body[-1L], - warning = 'Ignoring yaml=TRUE because append=TRUE and the file already exists. YAML.*only.*top of a file') + test(2033.09, fwrite(DT, append=TRUE, yaml=TRUE, verbose=TRUE), + output = paste0(c('Appending to existing file so setting bom=FALSE and yaml=FALSE', tbl_body[-1L]), collapse=".*")) # TODO: test gzip'd yaml which is now supported From 8181ea1e3abdccbbfb5ea57812558733b66f800e Mon Sep 17 00:00:00 2001 From: mattdowle Date: Thu, 23 May 2019 18:35:25 -0700 Subject: [PATCH 23/27] more tracing --- inst/tests/tests.Rraw | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 0714b3c9be..da9b8f066f 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -9514,7 +9514,9 @@ test(1658.47, fwrite(DT), error="Row 3 of list column is type 'factor'") # fwrite bom DT = data.table(l=letters, n=1:26) fwrite(DT, f1<-tempfile(), bom=TRUE) +print(readBin(f1, raw(), n=1000)) n_lines = length(readLines(f1, warn=FALSE)) +cat("n_lines =", n_lines, "\n") test(1658.48, readBin(f1, raw(), 6L), as.raw(c(0xef, 0xbb, 0xbf, 0x6c, 0x2c, 0x6e))) fwrite(DT, f2<-tempfile(), bom=FALSE) print(readBin(f2, raw(), n=1000)) @@ -9523,6 +9525,8 @@ print(readBin(f2, raw(), n=1000)) # re-write to the same file should overwrite fwrite(DT, f2, bom=TRUE) print(readBin(f2, raw(), n=1000)) +n_lines2 = length(readLines(f2, warn=FALSE)) +cat("n_lines2 =", n_lines2, "\n") test(1658.50, length(readLines(f2, warn=FALSE)), n_lines) unlink(c(f1, f2)) From 1f0fbb4276dec249d1442fb3673a36ab55aee7af Mon Sep 17 00:00:00 2001 From: mattdowle Date: Thu, 23 May 2019 19:24:54 -0700 Subject: [PATCH 24/27] one down one to go, hopefully --- inst/tests/tests.Rraw | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index da9b8f066f..54bdb21d9d 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -9514,20 +9514,19 @@ test(1658.47, fwrite(DT), error="Row 3 of list column is type 'factor'") # fwrite bom DT = data.table(l=letters, n=1:26) fwrite(DT, f1<-tempfile(), bom=TRUE) -print(readBin(f1, raw(), n=1000)) -n_lines = length(readLines(f1, warn=FALSE)) -cat("n_lines =", n_lines, "\n") -test(1658.48, readBin(f1, raw(), 6L), as.raw(c(0xef, 0xbb, 0xbf, 0x6c, 0x2c, 0x6e))) +f1con = file(f1, encoding="UTF-8") # Windows needs to be told otherwise it thinks n_lines==1 +test(1658.48, length(readLines(f1con)), 27L) +test(1658.49, readBin(f1, raw(), 6L), as.raw(c(0xef, 0xbb, 0xbf, 0x6c, 0x2c, 0x6e))) +close(f1con) fwrite(DT, f2<-tempfile(), bom=FALSE) -print(readBin(f2, raw(), n=1000)) -test(1658.49, readBin(f2, raw(), 3L), as.raw(c(0x6c, 0x2c, 0x6e))) -print(readBin(f2, raw(), n=1000)) -# re-write to the same file should overwrite +test(1658.50, readBin(f2, raw(), 3L), as.raw(c(0x6c, 0x2c, 0x6e))) +# re-write to the same file should overwrite. +# Windows seems to cache the connection to f2 and fails on a subsequent read, hence using file(,encoding="UTF-8") fwrite(DT, f2, bom=TRUE) -print(readBin(f2, raw(), n=1000)) -n_lines2 = length(readLines(f2, warn=FALSE)) -cat("n_lines2 =", n_lines2, "\n") -test(1658.50, length(readLines(f2, warn=FALSE)), n_lines) +f2con = file(f2, encoding="UTF-8") +test(1658.51, length(readLines(f2con)), 27L) +close(f2con) +test(1658.52, file.info(f1)$size, file.info(f2)$size) unlink(c(f1, f2)) ## End fwrite tests From 28d6b444d9417df8784e5f57e19a8cb6580c9860 Mon Sep 17 00:00:00 2001 From: mattdowle Date: Thu, 23 May 2019 19:49:28 -0700 Subject: [PATCH 25/27] same for-Windows-only fix applied to 2nd test --- inst/tests/tests.Rraw | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 54bdb21d9d..aa21f325e9 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -9514,7 +9514,7 @@ test(1658.47, fwrite(DT), error="Row 3 of list column is type 'factor'") # fwrite bom DT = data.table(l=letters, n=1:26) fwrite(DT, f1<-tempfile(), bom=TRUE) -f1con = file(f1, encoding="UTF-8") # Windows needs to be told otherwise it thinks n_lines==1 +f1con = file(f1, encoding="UTF-8") # Windows readLines needs to be told otherwise it thinks n_lines==1 test(1658.48, length(readLines(f1con)), 27L) test(1658.49, readBin(f1, raw(), 6L), as.raw(c(0xef, 0xbb, 0xbf, 0x6c, 0x2c, 0x6e))) close(f1con) @@ -14488,13 +14488,18 @@ if (test_yaml) { # csvy; #1701 # TODO: test gzip'd yaml which is now supported # yaml + bom arguments - DT <- data.table(l=letters, n=1:26) - fwrite(DT, f <- tempfile(), bom=TRUE, yaml=TRUE) - n_lines = length(readLines(f, warn = FALSE)) - test(2033.11, readBin(f, raw(), 6L), as.raw(c(0xef, 0xbb, 0xbf, 0x2d, 0x2d, 0x2d))) + DT = data.table(l=letters, n=1:26) + fwrite(DT, f<-tempfile(), bom=TRUE, yaml=TRUE) + fcon = file(f, encoding="UTF-8") # Windows readLines needs to be told; see also test 1658.50 + test(2033.11, length(readLines(fcon)), 49L) + close(fcon) + test(2033.12, readBin(f, raw(), 6L), as.raw(c(0xef, 0xbb, 0xbf, 0x2d, 0x2d, 0x2d))) # re-write should have same output (not appended) - fwrite(DT, f <- tempfile(), bom=TRUE, yaml=TRUE) - test(2033.12, length(readLines(f, warn = FALSE)), n_lines) + fwrite(DT, f<-tempfile(), bom=TRUE, yaml=TRUE) + fcon = file(f, encoding="UTF-8") + test(2033.13, length(readLines(fcon)), 49L) + close(fcon) + test(2033.14, fread(f), DT) unlink(f) } From 737600bc48d81227b1976456ed0ab3d6eb061723 Mon Sep 17 00:00:00 2001 From: mattdowle Date: Thu, 23 May 2019 20:03:47 -0700 Subject: [PATCH 26/27] trace 48 vs 49 difference on Windows for test 2033.11 and 2033.13 --- inst/tests/tests.Rraw | 1 + 1 file changed, 1 insertion(+) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index aa21f325e9..d16d0651f1 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -14491,6 +14491,7 @@ if (test_yaml) { # csvy; #1701 DT = data.table(l=letters, n=1:26) fwrite(DT, f<-tempfile(), bom=TRUE, yaml=TRUE) fcon = file(f, encoding="UTF-8") # Windows readLines needs to be told; see also test 1658.50 + print(readLines(fcon)) test(2033.11, length(readLines(fcon)), 49L) close(fcon) test(2033.12, readBin(f, raw(), 6L), as.raw(c(0xef, 0xbb, 0xbf, 0x2d, 0x2d, 0x2d))) From 14b6e23b38a2ccdd774c9f5058a18e76b5575820 Mon Sep 17 00:00:00 2001 From: mattdowle Date: Thu, 23 May 2019 20:32:44 -0700 Subject: [PATCH 27/27] deal with blank line difference after 'eol: |2+' --- inst/tests/tests.Rraw | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index d16d0651f1..aad07e273f 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -14491,14 +14491,18 @@ if (test_yaml) { # csvy; #1701 DT = data.table(l=letters, n=1:26) fwrite(DT, f<-tempfile(), bom=TRUE, yaml=TRUE) fcon = file(f, encoding="UTF-8") # Windows readLines needs to be told; see also test 1658.50 - print(readLines(fcon)) - test(2033.11, length(readLines(fcon)), 49L) + lines = readLines(fcon) + lines = lines[lines!=""] # an extra "" after "eol: |2+" (line 16) on Linux but not Windows + # remove the blank here so we don't need to change this test if/when that changes in yaml package + test(2033.11, length(lines), 48L) close(fcon) test(2033.12, readBin(f, raw(), 6L), as.raw(c(0xef, 0xbb, 0xbf, 0x2d, 0x2d, 0x2d))) # re-write should have same output (not appended) fwrite(DT, f<-tempfile(), bom=TRUE, yaml=TRUE) fcon = file(f, encoding="UTF-8") - test(2033.13, length(readLines(fcon)), 49L) + lines = readLines(fcon) + lines = lines[lines!=""] + test(2033.13, length(lines), 48L) close(fcon) test(2033.14, fread(f), DT) unlink(f)