Rdatatable · mattdowle · May 24, 2019 · May 21, 2019 · May 21, 2019 · May 21, 2019
@@ -34,6 +34,8 @@
 
     * Gains `yaml` argument matching that of `fread`, [#3534](https://github.com/Rdatatable/data.table/issues/3534). See the item in `fread` for a bit more detail; here, we'd like to reiterate that feedback is appreciated in the initial phase of rollout for this feature.
 
+    * Gains `bom` argument to add a *byte order mark* (BOM) at the beginning of the file to signal that the file is encoded in UTF-8, [#3488](https://github.com/Rdatatable/data.table/issues/3488). Thanks to Stefan Fleck for requesting and Philippe Chataignon for implementing.
+
 4. Assigning to one item of a list column no longer requires the RHS to be wrapped with `list` or `.()`, [#950](https://github.com/Rdatatable/data.table/issues/950).
 
     ```R

@@ -9,6 +9,7 @@ fwrite <- function(x, file="", append=FALSE, quote="auto",
            showProgress=getOption("datatable.showProgress", interactive()),
            compress = c("auto", "none", "gzip"),
            yaml = FALSE,
+           bom = FALSE,
            verbose=getOption("datatable.verbose", FALSE)) {
   na = as.character(na[1L]) # fix for #1725
   if (missing(qmethod)) qmethod = qmethod[1L]
@@ -30,7 +31,7 @@ fwrite <- function(x, file="", append=FALSE, quote="auto",
   # validate arguments
   if (is.matrix(x)) { # coerce to data.table if input object is matrix
     message("x being coerced from class: matrix to data.table")
-    x <- as.data.table(x)
+    x = as.data.table(x)
   }
   stopifnot(is.list(x),
     identical(quote,"auto") || isTRUEorFALSE(quote),
@@ -43,17 +44,22 @@ fwrite <- function(x, file="", append=FALSE, quote="auto",
     length(compress) == 1L && compress %chin% c("auto", "none", "gzip"),
     isTRUEorFALSE(col.names), isTRUEorFALSE(append), isTRUEorFALSE(row.names),
     isTRUEorFALSE(verbose), isTRUEorFALSE(showProgress), isTRUEorFALSE(logical01),
+    isTRUEorFALSE(bom),
     length(na) == 1L, #1725, handles NULL or character(0) input
     is.character(file) && length(file)==1L && !is.na(file),
     length(buffMB)==1L && !is.na(buffMB) && 1L<=buffMB && buffMB<=1024,
     length(nThread)==1L && !is.na(nThread) && nThread>=1L
     )
 
-  is_gzip <- compress == "gzip" || (compress == "auto" && grepl("\\.gz$", file))
+  is_gzip = compress == "gzip" || (compress == "auto" && grepl("\\.gz$", file))
 
-  file <- path.expand(file)  # "~/foo/bar"
-  if (append && missing(col.names) && (file=="" || file.exists(file)))
-    col.names = FALSE  # test 1658.16 checks this
+  file = path.expand(file)  # "~/foo/bar"
+  if (append && (file=="" || file.exists(file))) {
+    if (missing(col.names)) col.names = FALSE
+    if (verbose) cat("Appending to existing file so setting bom=FALSE and yaml=FALSE\n")
+    bom = FALSE
+    yaml = FALSE
+  }
   if (identical(quote,"auto")) quote=NA  # logical NA
   if (file=="") {
     # console output which it seems isn't thread safe on Windows even when one-batch-at-a-time
@@ -74,41 +80,33 @@ fwrite <- function(x, file="", append=FALSE, quote="auto",
       return(invisible())
     }
   }
-
-  # process YAML after potentially short-circuiting due to irregularities
-  if (yaml) {
-    if (!requireNamespace('yaml', quietly = TRUE))
+  yaml = if (!yaml) "" else {
+    if (!requireNamespace('yaml', quietly=TRUE))
       stop("'data.table' relies on the package 'yaml' to write the file header; please add this to your library with install.packages('yaml') and try again.") # nocov
-    if (append || is_gzip) {
-      if (append) warning("Skipping yaml writing because append = TRUE; YAML will only be written to the top of a file.")
-      if (is_gzip) warning("Skipping yaml writing because is_gzip = TRUE; compression of YAML metadata is not supported.")
-    } else {
-      schema_vec = sapply(x, class)
-      # multi-class objects reduced to first class
-      if (is.list(schema_vec)) schema_vec = sapply(schema_vec, `[`, 1L)
-      # as.vector strips names
-      schema_vec = list(name = names(schema_vec), type = as.vector(schema_vec))
-      yaml_header = list(
-        source = sprintf('R[v%s.%s]::data.table[v%s]::fwrite',
-                         R.version$major, R.version$minor, format(tryCatch(utils::packageVersion('data.table'), error=function(e)'DEV'))),
-        creation_time_utc = format(Sys.time(), tz = 'UTC'),
-        schema = list(
-          fields = lapply(
-            seq_along(x),
-            function(i) list(name = schema_vec$name[i], type = schema_vec$type[i])
-          )
-        ),
-        header = col.names, sep = sep, sep2 = sep2, eol = eol, na.strings = na,
-        dec = dec, qmethod = qmethod, logical01 = logical01
-      )
-      # NB: as.yaml adds trailing newline
-      cat('---', yaml::as.yaml(yaml_header, line.sep = eol), '---', sep = eol, file = file)
-      append = TRUE
-    }
+    schema_vec = sapply(x, class)
+    # multi-class objects reduced to first class
+    if (is.list(schema_vec)) schema_vec = sapply(schema_vec, `[`, 1L)
+    # as.vector strips names
+    schema_vec = list(name=names(schema_vec), type=as.vector(schema_vec))
+    yaml_header = list(
+      source = sprintf('R[v%s.%s]::data.table[v%s]::fwrite',
+                       R.version$major, R.version$minor, format(tryCatch(utils::packageVersion('data.table'), error=function(e) 'DEV'))),
+      creation_time_utc = format(Sys.time(), tz='UTC'),
+      schema = list(
+        fields = lapply(
+          seq_along(x),
+          function(i) list(name=schema_vec$name[i], type=schema_vec$type[i])
+        )
+      ),
+      header=col.names, sep=sep, sep2=sep2, eol=eol, na.strings=na,
+      dec=dec, qmethod=qmethod, logical01=logical01
+    )
+    paste0('---', eol, yaml::as.yaml(yaml_header, line.sep=eol), '---', eol) # NB: as.yaml adds trailing newline
   }
-  file <- enc2native(file) # CfwriteR cannot handle UTF-8 if that is not the native encoding, see #3078.
+  file = enc2native(file) # CfwriteR cannot handle UTF-8 if that is not the native encoding, see #3078.
   .Call(CfwriteR, x, file, sep, sep2, eol, na, dec, quote, qmethod=="escape", append,
         row.names, col.names, logical01, dateTimeAs, buffMB, nThread,
-        showProgress, is_gzip, verbose)
+        showProgress, is_gzip, bom, yaml, verbose)
   invisible()
 }
+
@@ -6514,13 +6514,13 @@ if (test_xts) {
   setcolorder(dt, c(2, 3, 1))
   dt[ , char_col := 'a']
   test(1465.17, as.xts(dt), xt, warning = 'columns are not numeric')
-  
+
   # 890 -- key argument for as.data.table.xts
   x = xts(1:10, as.Date(1:10, origin = "1970-01-01"))
   test(1465.18, capture.output(as.data.table(x, key="index")),
-       c("         index V1", " 1: 1970-01-02  1", " 2: 1970-01-03  2", 
-         " 3: 1970-01-04  3", " 4: 1970-01-05  4", " 5: 1970-01-06  5", 
-         " 6: 1970-01-07  6", " 7: 1970-01-08  7", " 8: 1970-01-09  8", 
+       c("         index V1", " 1: 1970-01-02  1", " 2: 1970-01-03  2",
+         " 3: 1970-01-04  3", " 4: 1970-01-05  4", " 5: 1970-01-06  5",
+         " 6: 1970-01-07  6", " 7: 1970-01-08  7", " 8: 1970-01-09  8",
          " 9: 1970-01-10  9", "10: 1970-01-11 10"))
 
   Sys.setenv("_R_CHECK_LENGTH_1_LOGIC2_" = TRUE)
@@ -9466,7 +9466,7 @@ test(1658.25, fwrite(ok_dt, quote=TRUE), output='"foo"\n"bar"')
 # integer NA
 DT = data.table(A=c(2L,NA,3L), B=c(NA,4:5))
 test(1658.26, fwrite(DT), output='A,B\n2,\n,4\n3,5')
-test(1658.27, fwrite(DT, na="NA", verbose=TRUE), output='Writing column names.*"A","B".*2,NA\nNA,4\n3,5')
+test(1658.27, fwrite(DT, na="NA", verbose=TRUE), output='Writing bom .false., yaml .0 characters. and column names .true.*"A","B".*2,NA\nNA,4\n3,5')
 
 # wrong argument types
 test(1658.28, fwrite(ok_dt, 1), error="is.character\\(file\\).*not TRUE")
@@ -9511,6 +9511,24 @@ test(1658.46, fwrite(DT), error="Row 3 of list column is type 'complex'")
 DT[3,b:=factor(letters[1:3])]
 test(1658.47, fwrite(DT), error="Row 3 of list column is type 'factor'")
 
+# fwrite bom
+DT = data.table(l=letters, n=1:26)
+fwrite(DT, f1<-tempfile(), bom=TRUE)
+f1con = file(f1, encoding="UTF-8")  # Windows readLines needs to be told otherwise it thinks n_lines==1
+test(1658.48, length(readLines(f1con)), 27L)
+test(1658.49, readBin(f1, raw(), 6L), as.raw(c(0xef, 0xbb, 0xbf, 0x6c, 0x2c, 0x6e)))
+close(f1con)
+fwrite(DT, f2<-tempfile(), bom=FALSE)
+test(1658.50, readBin(f2, raw(), 3L), as.raw(c(0x6c, 0x2c, 0x6e)))
+# re-write to the same file should overwrite.
+# Windows seems to cache the connection to f2 and fails on a subsequent read, hence using file(,encoding="UTF-8")
+fwrite(DT, f2, bom=TRUE)
+f2con = file(f2, encoding="UTF-8")
+test(1658.51, length(readLines(f2con)), 27L)
+close(f2con)
+test(1658.52, file.info(f1)$size, file.info(f2)$size)
+unlink(c(f1, f2))
+
 ## End fwrite tests
 
 # tests for #679, inrange(), FR #707
@@ -14432,18 +14450,18 @@ if (test_yaml) {  # csvy; #1701
   # force eol for platform independence
   fwrite(DT, tmp, yaml = TRUE, eol = '\n')
   as_read = readLines(tmp)
-  test(2033.01, as_read[c(1L, 25L)], c('---', '---'))
+  test(2033.01, as_read[c(1L, 24L)], c('---', '---'))
   test(2033.02, grepl('source: R.*data.table.*fwrite', as_read[2L]))
   test(2033.03, grepl('creation_time_utc', as_read[3L]))
-  test(2033.04, as_read[4:24],
+  test(2033.04, as_read[4:23],
        c("schema:", "  fields:", "  - name: a", "    type: integer",
          "  - name: b", "    type: numeric", "  - name: c", "    type: character",
          "header: yes", "sep: ','", "sep2:", "- ''", "- '|'", "- ''",
          # NB: apparently \n is encoded like this in YAML
          "eol: |2+", "", "na.strings: ''", "dec: '.'", "qmethod: double",
-         "logical01: no", ""))
+         "logical01: no"))
   tbl_body = c("a,b,c", "1,3.14159265358979,a", "2,1,b", "3,2,c", "4,3,d", "5,4,e")
-  test(2033.05, as_read[26:31], tbl_body)
+  test(2033.05, as_read[25:30], tbl_body)
 
   # windows eol
   fwrite(DT, tmp, yaml = TRUE, eol = '\r\n')
@@ -14464,11 +14482,30 @@ if (test_yaml) {  # csvy; #1701
   attr(DT2, 'yaml_metadata') = NULL
   test(2033.08, all.equal(DT, DT2))
 
-  # unsupported operations
-  test(2033.09, capture.output(fwrite(DT, append = TRUE, yaml = TRUE)), tbl_body[-1L],
-       warning = 'Skipping yaml writing because append = TRUE')
-  test(2033.10, capture.output(fwrite(DT, compress = 'gzip', yaml = TRUE)), tbl_body,
-       warning = 'Skipping yaml writing because is_gzip = TRUE')
+  test(2033.09, fwrite(DT, append=TRUE, yaml=TRUE, verbose=TRUE),
+       output = paste0(c('Appending to existing file so setting bom=FALSE and yaml=FALSE', tbl_body[-1L]), collapse=".*"))
+
+  # TODO: test gzip'd yaml which is now supported
+
+  # yaml + bom arguments
+  DT = data.table(l=letters, n=1:26)
+  fwrite(DT, f<-tempfile(), bom=TRUE, yaml=TRUE)
+  fcon = file(f, encoding="UTF-8")  # Windows readLines needs to be told; see also test 1658.50
+  lines = readLines(fcon)
+  lines = lines[lines!=""]  # an extra "" after "eol: |2+" (line 16) on Linux but not Windows
+                            # remove the blank here so we don't need to change this test if/when that changes in yaml package
+  test(2033.11, length(lines), 48L)
+  close(fcon)
+  test(2033.12, readBin(f, raw(), 6L), as.raw(c(0xef, 0xbb, 0xbf, 0x2d, 0x2d, 0x2d)))
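+  # writing again should give the same output (not appended); NOTE(review): the next line assigns a NEW tempfile() to f, so the first file is not actually re-written (nor unlinked) -- confirm intent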
+  fwrite(DT, f<-tempfile(), bom=TRUE, yaml=TRUE)
+  fcon = file(f, encoding="UTF-8")
+  lines = readLines(fcon)
+  lines = lines[lines!=""]
+  test(2033.13, length(lines), 48L)
+  close(fcon)
+  test(2033.14, fread(f), DT)
+  unlink(f)
 }
 
 # fcast coverage
@@ -14782,7 +14819,7 @@ test(2045.15, d1[d2, verbose = TRUE], cbind(d1, X1 = d2$X1), output="natural joi
 options(datatable.naturaljoin=FALSE)
 
 #tests for adding key to as.data.table, #890
-## as.data.table.numeric (should cover as.data.table.factor, 
+## as.data.table.numeric (should cover as.data.table.factor,
 ## *.ordered, *.integer, *.logical, *.character, and *.Date since
 ## all are the same function in as.data.table.R)
 nn = c(a=0.1, c=0.2, b=0.3, d=0.4)

@@ -19,6 +19,7 @@ fwrite(x, file = "", append = FALSE, quote = "auto",
   showProgress = getOption("datatable.showProgress", interactive()),
   compress = c("auto", "none", "gzip"),
   yaml = FALSE,
+  bom = FALSE,
   verbose = getOption("datatable.verbose", FALSE))
 }
 \arguments{
@@ -55,7 +56,8 @@ fwrite(x, file = "", append = FALSE, quote = "auto",
   \item{nThread}{The number of threads to use. Experiment to see what works best for your data on your hardware.}
   \item{showProgress}{ Display a progress meter on the console? Ignored when \code{file==""}. }
   \item{compress}{If \code{compress = "auto"} and if \code{file} ends in \code{.gz} then output format is gzipped csv else csv. If \code{compress = "none"}, output format is always csv. If \code{compress = "gzip"} then format is gzipped csv. Output to the console is never gzipped even if \code{compress = "gzip"}. By default, \code{compress = "auto"}.}
-  \item{yaml}{ If \code{TRUE}, \code{fwrite} will output a CSVY file, that is, a CSV file with metadata stored as a YAML header, using \code{\link[yaml]{as.yaml}}. Incompatible with \code{append = TRUE} or \code{gzip} compression. See \code{Details}. }
+  \item{yaml}{If \code{TRUE}, \code{fwrite} will output a CSVY file, that is, a CSV file with metadata stored as a YAML header, using \code{\link[yaml]{as.yaml}}. See \code{Details}. }
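+  \item{bom}{If \code{TRUE}, a BOM (Byte Order Mark) sequence (EF BB BF) is added at the beginning of the file; the format is 'UTF-8 with BOM'. Set to \code{FALSE} internally when \code{append=TRUE} and the output is the console or an already-existing file.}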
   \item{verbose}{Be chatty and report timings?}
 }
 \details{

@@ -672,28 +672,39 @@ void fwriteMain(fwriteMainArgs args)
     }
   }
 
+  int yamlLen = strlen(args.yaml);
   if (args.verbose) {
-    DTPRINT("Writing column names ... ");
+    DTPRINT("Writing bom (%s), yaml (%d characters) and column names (%s) ... ",
+            args.bom?"true":"false", yamlLen, args.colNames?"true":"false");
     if (f==-1) DTPRINT("\n");
   }
+  size_t headerLen = 0;
+  if (args.bom) headerLen += 3;
+  headerLen += yamlLen;
   if (args.colNames) {
-    size_t headerLen = 0;
     for (int j=0; j<args.ncol; j++) headerLen += getStringLen(args.colNames, j)*2;  // *2 in case quotes are escaped or doubled
     headerLen += args.ncol*(1/*sep*/+(doQuote!=0)*2) + eolLen + 3;  // 3 in case doRowNames and doQuote (the first blank <<"",>> column name)
+  }
+  if (headerLen) {
     char *buff = malloc(headerLen);
     if (!buff) STOP("Unable to allocate %d MiB for header: %s", headerLen / 1024 / 1024, strerror(errno));
     char *ch = buff;
-    if (args.doRowNames) {
-      // Unusual: the extra blank column name when row_names are added as the first column
-      if (doQuote!=0/*'auto'(NA) or true*/) { *ch++='"'; *ch++='"'; } // to match write.csv
-      *ch++ = sep;
-    }
-    for (int j=0; j<args.ncol; j++) {
-      writeString(args.colNames, j, &ch);
-      *ch++ = sep;
+    if (args.bom) {*ch++=0xEF; *ch++=0xBB; *ch++=0xBF; }  // 3 appears above (search for "bom")
+    memcpy(ch, args.yaml, yamlLen);
+    ch += yamlLen;
+    if (args.colNames) {
+      if (args.doRowNames) {
+        // Unusual: the extra blank column name when row_names are added as the first column
+        if (doQuote!=0/*'auto'(NA) or true*/) { *ch++='"'; *ch++='"'; } // to match write.csv
+        *ch++ = sep;
+      }
+      for (int j=0; j<args.ncol; j++) {
+        writeString(args.colNames, j, &ch);
+        *ch++ = sep;
+      }
+      ch--; // backup over the last sep
+      write_chars(args.eol, &ch);
     }
-    ch--; // backup over the last sep
-    write_chars(args.eol, &ch);
     if (f==-1) {
       *ch = '\0';
       DTPRINT(buff);

@@ -98,6 +98,8 @@ typedef struct fwriteMainArgs
   int nth;
   bool showProgress;
   bool is_gzip;
+  bool bom;
+  const char *yaml;
   bool verbose;
 } fwriteMainArgs;
 

@@ -158,12 +158,16 @@ SEXP fwriteR(
   SEXP nThread_Arg,
   SEXP showProgress_Arg,
   SEXP is_gzip_Arg,
+  SEXP bom_Arg,
+  SEXP yaml_Arg,
   SEXP verbose_Arg
   )
 {
   if (!isNewList(DF)) error("fwrite must be passed an object of type list; e.g. data.frame, data.table");
   fwriteMainArgs args;
   args.is_gzip = LOGICAL(is_gzip_Arg)[0];
+  args.bom = LOGICAL(bom_Arg)[0];
+  args.yaml = CHAR(STRING_ELT(yaml_Arg, 0));
   args.verbose = LOGICAL(verbose_Arg)[0];
   args.filename = CHAR(STRING_ELT(filename_Arg, 0));
   args.ncol = length(DF);
-Original file line number
+Diff line change
@@ Expand Up / @@ -34,6 +34,8 @@ @@
         * Gains `yaml` argument matching that of `fread`, [#3534](https://github.com/Rdatatable/data.table/issues/3534). See the item in `fread` for a bit more detail; here, we'd like to reiterate that feedback is appreciated in the initial phase of rollout for this feature.
+        * Gains `bom` argument to add a *byte order mark* (BOM) at the beginning of the file to signal that the file is encoded in UTF-8, [#3488](https://github.com/Rdatatable/data.table/issues/3488). Thanks to Stefan Fleck for requesting and Philippe Chataignon for implementing.
 4. Assigning to one item of a list column no longer requires the RHS to be wrapped with `list` or `.()`, [#950](https://github.com/Rdatatable/data.table/issues/950).
         ```R
@@ Expand Down @@