Rdatatable · mattdowle · May 11, 2019 · May 3, 2019 · May 3, 2019 · May 3, 2019
@@ -28,6 +28,8 @@
     fwrite(DT, "data.csv.gz")   #   2MB; 1.6s
     identical(fread("data.csv.gz"), DT)
     ```
+
+    * Gains `yaml` argument matching that of `fread`, [#3534](https://github.com/Rdatatable/data.table/issues/3534). See the item in `fread` for a bit more detail; here, we'd like to reiterate that feedback is appreciated in the initial phase of rollout for this feature.
 
 4. Assigning to one item of a list column no longer requires the RHS to be wrapped with `list` or `.()`, [#950](https://github.com/Rdatatable/data.table/issues/950).
 

@@ -153,10 +153,10 @@ yaml=FALSE, autostart=NA)
     # whitespace at the beginning or end of na.strings is checked at C level and is an error there; test 1804
   }
   if (yaml) {
-    # for tracking which YAML elements may be overridden by being declared explicitly
-    call_args = names(match.call())
     if (!requireNamespace('yaml', quietly = TRUE))
       stop("'data.table' relies on the package 'yaml' to parse the file header; please add this to your library with install.packages('yaml') and try again.") # nocov
+    # for tracking which YAML elements may be overridden by being declared explicitly
+    call_args = names(match.call())
     if (is.character(skip))
       warning("Combining a search string as 'skip' and reading a YAML header may not work as expected -- currently, ",
               "reading will proceed to search for 'skip' from the beginning of the file, NOT from the end of ",

@@ -8,6 +8,7 @@ fwrite <- function(x, file="", append=FALSE, quote="auto",
            buffMB=8, nThread=getDTthreads(verbose),
            showProgress=getOption("datatable.showProgress", interactive()),
            compress = c("auto", "none", "gzip"), 
+           yaml = FALSE,
            verbose=getOption("datatable.verbose", FALSE)) {
   na = as.character(na[1L]) # fix for #1725
   if (missing(qmethod)) qmethod = qmethod[1L]
@@ -73,9 +74,41 @@ fwrite <- function(x, file="", append=FALSE, quote="auto",
       return(invisible())
     }
   }
+
+  # process YAML after potentially short-circuiting due to irregularities
+  if (yaml) {
+    if (!requireNamespace('yaml', quietly = TRUE))
+      stop("'data.table' relies on the package 'yaml' to write the file header; please add this to your library with install.packages('yaml') and try again.") # nocov
+    if (append || is_gzip) {
+      if (append) warning("Skipping yaml writing because append = TRUE; YAML will only be written to the top of a file.")
+      if (is_gzip) warning("Skipping yaml writing because is_gzip = TRUE; compression of YAML metadata is not supported.")
+    } else {
+      # multi-class objects reduced to first class. vapply is used because sapply(x, class)
+      # simplifies to a matrix when every column has the same number (>1) of classes
+      # (e.g. all POSIXct), which loses the column names and misaligns the types.
+      col_class = vapply(x, function(col) class(col)[1L], character(1L))
+      schema_vec = list(name = names(col_class), type = as.vector(col_class)) # as.vector strips names
+      yaml_header = list(
+        source = sprintf('R[v%s.%s]::data.table[v%s]::fwrite',
+                         R.version$major, R.version$minor, format(utils::packageVersion('data.table'))),
+        creation_time_utc = format(Sys.time(), tz = 'UTC'),
+        schema = list(
+          fields = lapply(
+            seq_along(x),
+            function(i) list(name = schema_vec$name[i], type = schema_vec$type[i])
+          )
+        ),
+        header = col.names, sep = sep, sep2 = sep2, eol = eol, na.strings = na,
+        dec = dec, qmethod = qmethod, logical01 = logical01
+      )
+      # NB: as.yaml adds trailing newline
+      cat('---', yaml::as.yaml(yaml_header, line.sep = eol), '---', sep = eol, file = file)
+      append = TRUE # passed on to the C writer below so the table follows the header just written
+    }
+  }
   file <- enc2native(file) # CfwriteR cannot handle UTF-8 if that is not the native encoding, see #3078.
   .Call(CfwriteR, x, file, sep, sep2, eol, na, dec, quote, qmethod=="escape", append,
-          row.names, col.names, logical01, dateTimeAs, buffMB, nThread,
-          showProgress, is_gzip, verbose)
+        row.names, col.names, logical01, dateTimeAs, buffMB, nThread,
+        showProgress, is_gzip, verbose)
   invisible()
 }
@@ -14401,18 +14401,63 @@ if (test_yaml) {  # csvy; #1701
   DT_yaml[ , var2 := as.integer(var2)]
   test(2032.22, fread(f, skip = 'var1,', yaml = TRUE),
        DT_yaml, warning = 'Combining a search.*YAML.*')
+
+
+  # fwrite csvy (yaml = TRUE writes a YAML header above the CSV body): #3534
+  tmp = tempfile()
+  DT = data.table(a = 1:5, b = c(pi, 1:4), c = letters[1:5])
+  # force eol for platform independence
+  fwrite(DT, tmp, yaml = TRUE, eol = '\n')
+  as_read = readLines(tmp)
+  test(2033.01, as_read[c(1L, 25L)], c('---', '---'))
+  test(2033.02, grepl('source: R.*data.table.*fwrite', as_read[2L]))
+  test(2033.03, grepl('creation_time_utc', as_read[3L]))
+  test(2033.04, as_read[4:24],
+       c("schema:", "  fields:", "  - name: a", "    type: integer", 
+         "  - name: b", "    type: numeric", "  - name: c", "    type: character", 
+         "header: yes", "sep: ','", "sep2:", "- ''", "- '|'", "- ''", 
+         # NB: eol = '\n' comes out as a block scalar ('|2+') followed by an empty line
+         "eol: |2+", "", "na.strings: ''", "dec: '.'", "qmethod: double", 
+         "logical01: no", ""))
+  tbl_body = c("a,b,c", "1,3.14159265358979,a", "2,1,b", "3,2,c", "4,3,d", "5,4,e")
+  test(2033.05, as_read[26:31], tbl_body)
+
+  # windows eol: '\r\n' is written as a double-quoted, escaped YAML string
+  fwrite(DT, tmp, yaml = TRUE, eol = '\r\n')
+  test(2033.06, readLines(tmp)[18L], 'eol: "\\r\\n"')
+
+  # multi-class columns: only the first class (POSIXct of POSIXct/POSIXt) is recorded
+  DT[ , t := .POSIXct(1:5, tz = 'UTC')]
+  fwrite(DT, tmp, yaml = TRUE)
+  as_read = readLines(tmp)
+  test(2033.07, as_read[13L], "    type: POSIXct")
+
+  # round trip: fread(yaml = TRUE) should recover the table fwrite(yaml = TRUE) wrote
+  # fread side needs to be improved for Hugh's colClasses update
+  DT[ , t := NULL]
+  fwrite(DT, tmp, yaml = TRUE)
+  DT2 = fread(tmp, yaml = TRUE)
+  # drop the yaml_metadata attribute that fread attaches so only the data is compared
+  attr(DT2, 'yaml_metadata') = NULL
+  test(2033.08, all.equal(DT, DT2))
+
+  # unsupported operations: a warning is raised and only the CSV is written (to the console here)
+  test(2033.09, capture.output(fwrite(DT, append = TRUE, yaml = TRUE)), tbl_body[-1L],
+       warning = 'Skipping yaml writing because append = TRUE')
+  test(2033.10, capture.output(fwrite(DT, compress = 'gzip', yaml = TRUE)), tbl_body,
+       warning = 'Skipping yaml writing because is_gzip = TRUE')
 }
 
 # fcast coverage
 DT = data.table(a = rep(1:2, each = 2), b = rep(1:2, 2), c = 4:1, d = 5:8)
-test(2033.1,
+test(2034.1,
      dcast(DT, a ~ b, value.var = list('c', 'd'), fun.aggregate = list(sum)),
      error = "When 'fun.aggregate' and 'value.var' are both lists")
 
 # fread no quote coverage
-test(2034.1, fread('A,B\n"foo","ba"r"', quote="''"), error='quote= must be a single character, blank "", or FALSE')
-test(2034.2, fread('A,B\n"foo","ba"r"', quote=FALSE), ans<-data.table(A='"foo"', B='"ba"r"'))
-test(2034.3, fread('A,B\n"foo","ba"r"', quote=""), ans)
+test(2035.1, fread('A,B\n"foo","ba"r"', quote="''"), error='quote= must be a single character, blank "", or FALSE')
+test(2035.2, fread('A,B\n"foo","ba"r"', quote=FALSE), ans<-data.table(A='"foo"', B='"ba"r"'))
+test(2035.3, fread('A,B\n"foo","ba"r"', quote=""), ans)
 
 
 ###################################

@@ -61,7 +61,7 @@ yaml=FALSE, autostart=NA
   \item{nThread}{The number of threads to use. Experiment to see what works best for your data on your hardware.}
   \item{logical01}{If TRUE a column containing only 0s and 1s will be read as logical, otherwise as integer.}
   \item{keepLeadingZeros}{If TRUE a column containing numeric data with leading zeros will be read as character, otherwise leading zeros will be removed and converted to numeric.}
-  \item{yaml}{ If \code{TRUE}, \code{fread} will attempt to parse (using \code{yaml::yaml.load}) the top of the input as YAML, and further to glean parameters relevant to improving the performance of \code{fread} on the data itself. The entire YAML section is returned as parsed into a \code{list} in the \code{yaml_metadata} attribute. See \code{Details}. }
+  \item{yaml}{ If \code{TRUE}, \code{fread} will attempt to parse (using \code{\link[yaml]{yaml.load}}) the top of the input as YAML, and further to glean parameters relevant to improving the performance of \code{fread} on the data itself. The entire YAML section is returned as parsed into a \code{list} in the \code{yaml_metadata} attribute. See \code{Details}. }
   \item{autostart}{ Deprecated and ignored with warning. Please use \code{skip} instead. }
 }
 \details{

@@ -18,6 +18,7 @@ fwrite(x, file = "", append = FALSE, quote = "auto",
   buffMB = 8L, nThread = getDTthreads(verbose),
   showProgress = getOption("datatable.showProgress", interactive()),
   compress = c("auto", "none", "gzip"),
+  yaml = FALSE,
   verbose = getOption("datatable.verbose", FALSE))
 }
 \arguments{
@@ -54,10 +55,30 @@ fwrite(x, file = "", append = FALSE, quote = "auto",
   \item{nThread}{The number of threads to use. Experiment to see what works best for your data on your hardware.}
   \item{showProgress}{ Display a progress meter on the console? Ignored when \code{file==""}. }
   \item{compress}{If \code{compress = "auto"} and if \code{file} ends in \code{.gz} then output format is gzipped csv else csv. If \code{compress = "none"}, output format is always csv. If \code{compress = "gzip"} then format is gzipped csv. Output to the console is never gzipped even if \code{compress = "gzip"}. By default, \code{compress = "auto"}.}
+  \item{yaml}{ If \code{TRUE}, \code{fwrite} will output a CSVY file, that is, a CSV file with metadata stored as a YAML header, using \code{\link[yaml]{as.yaml}}. Not supported together with \code{append = TRUE} or gzip compression; in either case the YAML header is skipped with a warning and only the CSV is written. See \code{Details}. }
   \item{verbose}{Be chatty and report timings?}
 }
 \details{
 \code{fwrite} began as a community contribution with \href{https://github.com/Rdatatable/data.table/pull/1613}{pull request #1613} by Otto Seiskari. This gave Matt Dowle the impetus to specialize the numeric formatting and to parallelize: \url{http://blog.h2o.ai/2016/04/fast-csv-writing-for-r/}. Final items were tracked in \href{https://github.com/Rdatatable/data.table/issues/1664}{issue #1664} such as automatic quoting, \code{bit64::integer64} support, decimal/scientific formatting exactly matching \code{write.csv} between 2.225074e-308 and 1.797693e+308 to 15 significant figures, \code{row.names}, dates (between 0000-03-01 and 9999-12-31), times and \code{sep2} for \code{list} columns where each cell can itself be a vector.
+
+\bold{CSVY Support:}
+
+The following fields will be written to the header of the file and surrounded by \code{---} on top and bottom:
+
+  \itemize{
+    \item{ \code{source} - Contains the R version and \code{data.table} version used to write the file }
+    \item{ \code{creation_time_utc} - Current timestamp in UTC, taken just before the header is written }
+    \item{ \code{schema} with element \code{fields} giving \code{name}-\code{type} (\code{class}) pairs for the table; multi-class objects (e.g. \code{c('POSIXct', 'POSIXt')}) will have their first class written. }
+    \item{ \code{header} - same as \code{col.names} (which is \code{header} on input) }
+    \item{ \code{sep} }
+    \item{ \code{sep2} }
+    \item{ \code{eol} }
+    \item{ \code{na.strings} - same as \code{na} }
+    \item{ \code{dec} }
+    \item{ \code{qmethod} }
+    \item{ \code{logical01} }
+  }
+
 }
 \seealso{
   \code{\link{setDTthreads}}, \code{\link{fread}}, \code{\link[utils:write.table]{write.csv}}, \code{\link[utils:write.table]{write.table}}, \href{https://CRAN.R-project.org/package=bit64}{\code{bit64::integer64}}