diff --git a/NEWS.md b/NEWS.md index 8dc12a15f4..44e469d98b 100644 --- a/NEWS.md +++ b/NEWS.md @@ -34,6 +34,8 @@ * Gains `yaml` argument matching that of `fread`, [#3534](https://github.com/Rdatatable/data.table/issues/3534). See the item in `fread` for a bit more detail; here, we'd like to reiterate that feedback is appreciated in the initial phase of rollout for this feature. + * Gains `bom` argument to add a *byte order mark* (BOM) at the beginning of the file to signal that the file is encoded in UTF-8, [#3488](https://github.com/Rdatatable/data.table/issues/3488). Thanks to Stefan Fleck for requesting and Philippe Chataignon for implementing. + 4. Assigning to one item of a list column no longer requires the RHS to be wrapped with `list` or `.()`, [#950](https://github.com/Rdatatable/data.table/issues/950). ```R diff --git a/R/fwrite.R b/R/fwrite.R index 542ca4b75d..44ae546432 100644 --- a/R/fwrite.R +++ b/R/fwrite.R @@ -9,6 +9,7 @@ fwrite <- function(x, file="", append=FALSE, quote="auto", showProgress=getOption("datatable.showProgress", interactive()), compress = c("auto", "none", "gzip"), yaml = FALSE, + bom = FALSE, verbose=getOption("datatable.verbose", FALSE)) { na = as.character(na[1L]) # fix for #1725 if (missing(qmethod)) qmethod = qmethod[1L] @@ -30,7 +31,7 @@ fwrite <- function(x, file="", append=FALSE, quote="auto", # validate arguments if (is.matrix(x)) { # coerce to data.table if input object is matrix message("x being coerced from class: matrix to data.table") - x <- as.data.table(x) + x = as.data.table(x) } stopifnot(is.list(x), identical(quote,"auto") || isTRUEorFALSE(quote), @@ -43,17 +44,22 @@ fwrite <- function(x, file="", append=FALSE, quote="auto", length(compress) == 1L && compress %chin% c("auto", "none", "gzip"), isTRUEorFALSE(col.names), isTRUEorFALSE(append), isTRUEorFALSE(row.names), isTRUEorFALSE(verbose), isTRUEorFALSE(showProgress), isTRUEorFALSE(logical01), + isTRUEorFALSE(bom), length(na) == 1L, #1725, handles NULL or 
character(0) input is.character(file) && length(file)==1L && !is.na(file), length(buffMB)==1L && !is.na(buffMB) && 1L<=buffMB && buffMB<=1024, length(nThread)==1L && !is.na(nThread) && nThread>=1L ) - is_gzip <- compress == "gzip" || (compress == "auto" && grepl("\\.gz$", file)) + is_gzip = compress == "gzip" || (compress == "auto" && grepl("\\.gz$", file)) - file <- path.expand(file) # "~/foo/bar" - if (append && missing(col.names) && (file=="" || file.exists(file))) - col.names = FALSE # test 1658.16 checks this + file = path.expand(file) # "~/foo/bar" + if (append && (file=="" || file.exists(file))) { + if (missing(col.names)) col.names = FALSE + if (verbose) cat("Appending to existing file so setting bom=FALSE and yaml=FALSE\n") + bom = FALSE + yaml = FALSE + } if (identical(quote,"auto")) quote=NA # logical NA if (file=="") { # console output which it seems isn't thread safe on Windows even when one-batch-at-a-time @@ -74,41 +80,33 @@ fwrite <- function(x, file="", append=FALSE, quote="auto", return(invisible()) } } - - # process YAML after potentially short-circuiting due to irregularities - if (yaml) { - if (!requireNamespace('yaml', quietly = TRUE)) + yaml = if (!yaml) "" else { + if (!requireNamespace('yaml', quietly=TRUE)) stop("'data.table' relies on the package 'yaml' to write the file header; please add this to your library with install.packages('yaml') and try again.") # nocov - if (append || is_gzip) { - if (append) warning("Skipping yaml writing because append = TRUE; YAML will only be written to the top of a file.") - if (is_gzip) warning("Skipping yaml writing because is_gzip = TRUE; compression of YAML metadata is not supported.") - } else { - schema_vec = sapply(x, class) - # multi-class objects reduced to first class - if (is.list(schema_vec)) schema_vec = sapply(schema_vec, `[`, 1L) - # as.vector strips names - schema_vec = list(name = names(schema_vec), type = as.vector(schema_vec)) - yaml_header = list( - source = 
sprintf('R[v%s.%s]::data.table[v%s]::fwrite', - R.version$major, R.version$minor, format(tryCatch(utils::packageVersion('data.table'), error=function(e)'DEV'))), - creation_time_utc = format(Sys.time(), tz = 'UTC'), - schema = list( - fields = lapply( - seq_along(x), - function(i) list(name = schema_vec$name[i], type = schema_vec$type[i]) - ) - ), - header = col.names, sep = sep, sep2 = sep2, eol = eol, na.strings = na, - dec = dec, qmethod = qmethod, logical01 = logical01 - ) - # NB: as.yaml adds trailing newline - cat('---', yaml::as.yaml(yaml_header, line.sep = eol), '---', sep = eol, file = file) - append = TRUE - } + schema_vec = sapply(x, class) + # multi-class objects reduced to first class + if (is.list(schema_vec)) schema_vec = sapply(schema_vec, `[`, 1L) + # as.vector strips names + schema_vec = list(name=names(schema_vec), type=as.vector(schema_vec)) + yaml_header = list( + source = sprintf('R[v%s.%s]::data.table[v%s]::fwrite', + R.version$major, R.version$minor, format(tryCatch(utils::packageVersion('data.table'), error=function(e) 'DEV'))), + creation_time_utc = format(Sys.time(), tz='UTC'), + schema = list( + fields = lapply( + seq_along(x), + function(i) list(name=schema_vec$name[i], type=schema_vec$type[i]) + ) + ), + header=col.names, sep=sep, sep2=sep2, eol=eol, na.strings=na, + dec=dec, qmethod=qmethod, logical01=logical01 + ) + paste0('---', eol, yaml::as.yaml(yaml_header, line.sep=eol), '---', eol) # NB: as.yaml adds trailing newline } - file <- enc2native(file) # CfwriteR cannot handle UTF-8 if that is not the native encoding, see #3078. + file = enc2native(file) # CfwriteR cannot handle UTF-8 if that is not the native encoding, see #3078. 
.Call(CfwriteR, x, file, sep, sep2, eol, na, dec, quote, qmethod=="escape", append, row.names, col.names, logical01, dateTimeAs, buffMB, nThread, - showProgress, is_gzip, verbose) + showProgress, is_gzip, bom, yaml, verbose) invisible() } + diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index d56ee7f1ba..aad07e273f 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -6514,13 +6514,13 @@ if (test_xts) { setcolorder(dt, c(2, 3, 1)) dt[ , char_col := 'a'] test(1465.17, as.xts(dt), xt, warning = 'columns are not numeric') - + # 890 -- key argument for as.data.table.xts x = xts(1:10, as.Date(1:10, origin = "1970-01-01")) test(1465.18, capture.output(as.data.table(x, key="index")), - c(" index V1", " 1: 1970-01-02 1", " 2: 1970-01-03 2", - " 3: 1970-01-04 3", " 4: 1970-01-05 4", " 5: 1970-01-06 5", - " 6: 1970-01-07 6", " 7: 1970-01-08 7", " 8: 1970-01-09 8", + c(" index V1", " 1: 1970-01-02 1", " 2: 1970-01-03 2", + " 3: 1970-01-04 3", " 4: 1970-01-05 4", " 5: 1970-01-06 5", + " 6: 1970-01-07 6", " 7: 1970-01-08 7", " 8: 1970-01-09 8", " 9: 1970-01-10 9", "10: 1970-01-11 10")) Sys.setenv("_R_CHECK_LENGTH_1_LOGIC2_" = TRUE) @@ -9466,7 +9466,7 @@ test(1658.25, fwrite(ok_dt, quote=TRUE), output='"foo"\n"bar"') # integer NA DT = data.table(A=c(2L,NA,3L), B=c(NA,4:5)) test(1658.26, fwrite(DT), output='A,B\n2,\n,4\n3,5') -test(1658.27, fwrite(DT, na="NA", verbose=TRUE), output='Writing column names.*"A","B".*2,NA\nNA,4\n3,5') +test(1658.27, fwrite(DT, na="NA", verbose=TRUE), output='Writing bom .false., yaml .0 characters. 
and column names .true.*"A","B".*2,NA\nNA,4\n3,5') # wrong argument types test(1658.28, fwrite(ok_dt, 1), error="is.character\\(file\\).*not TRUE") @@ -9511,6 +9511,24 @@ test(1658.46, fwrite(DT), error="Row 3 of list column is type 'complex'") DT[3,b:=factor(letters[1:3])] test(1658.47, fwrite(DT), error="Row 3 of list column is type 'factor'") +# fwrite bom +DT = data.table(l=letters, n=1:26) +fwrite(DT, f1<-tempfile(), bom=TRUE) +f1con = file(f1, encoding="UTF-8") # Windows readLines needs to be told otherwise it thinks n_lines==1 +test(1658.48, length(readLines(f1con)), 27L) +test(1658.49, readBin(f1, raw(), 6L), as.raw(c(0xef, 0xbb, 0xbf, 0x6c, 0x2c, 0x6e))) +close(f1con) +fwrite(DT, f2<-tempfile(), bom=FALSE) +test(1658.50, readBin(f2, raw(), 3L), as.raw(c(0x6c, 0x2c, 0x6e))) +# re-write to the same file should overwrite. +# Windows seems to cache the connection to f2 and fails on a subsequent read, hence using file(,encoding="UTF-8") +fwrite(DT, f2, bom=TRUE) +f2con = file(f2, encoding="UTF-8") +test(1658.51, length(readLines(f2con)), 27L) +close(f2con) +test(1658.52, file.info(f1)$size, file.info(f2)$size) +unlink(c(f1, f2)) + ## End fwrite tests # tests for #679, inrange(), FR #707 @@ -14432,18 +14450,18 @@ if (test_yaml) { # csvy; #1701 # force eol for platform independence fwrite(DT, tmp, yaml = TRUE, eol = '\n') as_read = readLines(tmp) - test(2033.01, as_read[c(1L, 25L)], c('---', '---')) + test(2033.01, as_read[c(1L, 24L)], c('---', '---')) test(2033.02, grepl('source: R.*data.table.*fwrite', as_read[2L])) test(2033.03, grepl('creation_time_utc', as_read[3L])) - test(2033.04, as_read[4:24], + test(2033.04, as_read[4:23], c("schema:", " fields:", " - name: a", " type: integer", " - name: b", " type: numeric", " - name: c", " type: character", "header: yes", "sep: ','", "sep2:", "- ''", "- '|'", "- ''", # NB: apparently \n is encoded like this in YAML "eol: |2+", "", "na.strings: ''", "dec: '.'", "qmethod: double", - "logical01: no", "")) + "logical01: 
no")) tbl_body = c("a,b,c", "1,3.14159265358979,a", "2,1,b", "3,2,c", "4,3,d", "5,4,e") - test(2033.05, as_read[26:31], tbl_body) + test(2033.05, as_read[25:30], tbl_body) # windows eol fwrite(DT, tmp, yaml = TRUE, eol = '\r\n') @@ -14464,11 +14482,30 @@ if (test_yaml) { # csvy; #1701 attr(DT2, 'yaml_metadata') = NULL test(2033.08, all.equal(DT, DT2)) - # unsupported operations - test(2033.09, capture.output(fwrite(DT, append = TRUE, yaml = TRUE)), tbl_body[-1L], - warning = 'Skipping yaml writing because append = TRUE') - test(2033.10, capture.output(fwrite(DT, compress = 'gzip', yaml = TRUE)), tbl_body, - warning = 'Skipping yaml writing because is_gzip = TRUE') + test(2033.09, fwrite(DT, append=TRUE, yaml=TRUE, verbose=TRUE), + output = paste0(c('Appending to existing file so setting bom=FALSE and yaml=FALSE', tbl_body[-1L]), collapse=".*")) + + # TODO: test gzip'd yaml which is now supported + + # yaml + bom arguments + DT = data.table(l=letters, n=1:26) + fwrite(DT, f<-tempfile(), bom=TRUE, yaml=TRUE) + fcon = file(f, encoding="UTF-8") # Windows readLines needs to be told; see also tests 1658.48 and 1658.51 + lines = readLines(fcon) + lines = lines[lines!=""] # an extra "" after "eol: |2+" (line 16) on Linux but not Windows + # remove the blank here so we don't need to change this test if/when that changes in yaml package + test(2033.11, length(lines), 48L) + close(fcon) + test(2033.12, readBin(f, raw(), 6L), as.raw(c(0xef, 0xbb, 0xbf, 0x2d, 0x2d, 0x2d))) + # re-write should have same output (not appended). NOTE(review): f<-tempfile() on the next line creates a new file instead of reusing f, so overwriting is not actually exercised and the first tempfile is never unlinked; confirm whether plain f was intended + fwrite(DT, f<-tempfile(), bom=TRUE, yaml=TRUE) + fcon = file(f, encoding="UTF-8") + lines = readLines(fcon) + lines = lines[lines!=""] + test(2033.13, length(lines), 48L) + close(fcon) + test(2033.14, fread(f), DT) + unlink(f) } # fcast coverage @@ -14782,7 +14819,7 @@ test(2045.15, d1[d2, verbose = TRUE], cbind(d1, X1 = d2$X1), output="natural joi options(datatable.naturaljoin=FALSE) #tests for adding key to as.data.table, #890 -## as.data.table.numeric (should cover 
as.data.table.factor, +## as.data.table.numeric (should cover as.data.table.factor, ## *.ordered, *.integer, *.logical, *.character, and *.Date since ## all are the same function in as.data.table.R) nn = c(a=0.1, c=0.2, b=0.3, d=0.4) diff --git a/man/fwrite.Rd b/man/fwrite.Rd index a4c6bbb703..c8acd80db4 100644 --- a/man/fwrite.Rd +++ b/man/fwrite.Rd @@ -19,6 +19,7 @@ fwrite(x, file = "", append = FALSE, quote = "auto", showProgress = getOption("datatable.showProgress", interactive()), compress = c("auto", "none", "gzip"), yaml = FALSE, + bom = FALSE, verbose = getOption("datatable.verbose", FALSE)) } \arguments{ @@ -55,7 +56,8 @@ fwrite(x, file = "", append = FALSE, quote = "auto", \item{nThread}{The number of threads to use. Experiment to see what works best for your data on your hardware.} \item{showProgress}{ Display a progress meter on the console? Ignored when \code{file==""}. } \item{compress}{If \code{compress = "auto"} and if \code{file} ends in \code{.gz} then output format is gzipped csv else csv. If \code{compress = "none"}, output format is always csv. If \code{compress = "gzip"} then format is gzipped csv. Output to the console is never gzipped even if \code{compress = "gzip"}. By default, \code{compress = "auto"}.} - \item{yaml}{ If \code{TRUE}, \code{fwrite} will output a CSVY file, that is, a CSV file with metadata stored as a YAML header, using \code{\link[yaml]{as.yaml}}. Incompatible with \code{append = TRUE} or \code{gzip} compression. See \code{Details}. } + \item{yaml}{If \code{TRUE}, \code{fwrite} will output a CSVY file, that is, a CSV file with metadata stored as a YAML header, using \code{\link[yaml]{as.yaml}}. See \code{Details}. 
} + \item{bom}{If \code{TRUE} a BOM (Byte Order Mark) sequence (EF BB BF) is added at the beginning of the file; format 'UTF-8 with BOM'.} \item{verbose}{Be chatty and report timings?} } \details{ diff --git a/src/fwrite.c b/src/fwrite.c index df6addb96c..af3fc7fa04 100644 --- a/src/fwrite.c +++ b/src/fwrite.c @@ -672,28 +672,39 @@ void fwriteMain(fwriteMainArgs args) } } + int yamlLen = strlen(args.yaml); if (args.verbose) { - DTPRINT("Writing column names ... "); + DTPRINT("Writing bom (%s), yaml (%d characters) and column names (%s) ... ", + args.bom?"true":"false", yamlLen, args.colNames?"true":"false"); if (f==-1) DTPRINT("\n"); } + size_t headerLen = 0; + if (args.bom) headerLen += 3; + headerLen += yamlLen; if (args.colNames) { - size_t headerLen = 0; for (int j=0; j> column name) + } + if (headerLen) { char *buff = malloc(headerLen); if (!buff) STOP("Unable to allocate %d MiB for header: %s", headerLen / 1024 / 1024, strerror(errno)); char *ch = buff; - if (args.doRowNames) { - // Unusual: the extra blank column name when row_names are added as the first column - if (doQuote!=0/*'auto'(NA) or true*/) { *ch++='"'; *ch++='"'; } // to match write.csv - *ch++ = sep; - } - for (int j=0; j