diff --git a/NEWS.md b/NEWS.md index 478ffb8556..732ad85680 100644 --- a/NEWS.md +++ b/NEWS.md @@ -28,6 +28,8 @@ fwrite(DT, "data.csv.gz") # 2MB; 1.6s identical(fread("data.csv.gz"), DT) ``` + + * Gains `yaml` argument matching that of `fread`, [#3534](https://github.com/Rdatatable/data.table/issues/3534). See the item in `fread` for a bit more detail; here, we'd like to reiterate that feedback is appreciated in the initial phase of rollout for this feature. 4. Assigning to one item of a list column no longer requires the RHS to be wrapped with `list` or `.()`, [#950](https://github.com/Rdatatable/data.table/issues/950). diff --git a/R/fread.R b/R/fread.R index 6a12625f79..3c262503c1 100644 --- a/R/fread.R +++ b/R/fread.R @@ -153,10 +153,10 @@ yaml=FALSE, autostart=NA) # whitespace at the beginning or end of na.strings is checked at C level and is an error there; test 1804 } if (yaml) { - # for tracking which YAML elements may be overridden by being declared explicitly - call_args = names(match.call()) if (!requireNamespace('yaml', quietly = TRUE)) stop("'data.table' relies on the package 'yaml' to parse the file header; please add this to your library with install.packages('yaml') and try again.") # nocov + # for tracking which YAML elements may be overridden by being declared explicitly + call_args = names(match.call()) if (is.character(skip)) warning("Combining a search string as 'skip' and reading a YAML header may not work as expected -- currently, ", "reading will proceed to search for 'skip' from the beginning of the file, NOT from the end of ", diff --git a/R/fwrite.R b/R/fwrite.R index 4e8c337e59..9b44ef8ec1 100644 --- a/R/fwrite.R +++ b/R/fwrite.R @@ -8,6 +8,7 @@ fwrite <- function(x, file="", append=FALSE, quote="auto", buffMB=8, nThread=getDTthreads(verbose), showProgress=getOption("datatable.showProgress", interactive()), compress = c("auto", "none", "gzip"), + yaml = FALSE, verbose=getOption("datatable.verbose", FALSE)) { na = as.character(na[1L]) # 
fix for #1725 if (missing(qmethod)) qmethod = qmethod[1L] @@ -73,9 +74,41 @@ fwrite <- function(x, file="", append=FALSE, quote="auto", return(invisible()) } } + + # process YAML after potentially short-circuiting due to irregularities + if (yaml) { + if (!requireNamespace('yaml', quietly = TRUE)) + stop("'data.table' relies on the package 'yaml' to write the file header; please add this to your library with install.packages('yaml') and try again.") # nocov + if (append || is_gzip) { + if (append) warning("Skipping yaml writing because append = TRUE; YAML will only be written to the top of a file.") + if (is_gzip) warning("Skipping yaml writing because is_gzip = TRUE; compression of YAML metadata is not supported.") + } else { + schema_vec = sapply(x, class) + # multi-class objects reduced to first class + if (is.list(schema_vec)) schema_vec = sapply(schema_vec, `[`, 1L) + # as.vector strips names + schema_vec = list(name = names(schema_vec), type = as.vector(schema_vec)) + yaml_header = list( + source = sprintf('R[v%s.%s]::data.table[v%s]::fwrite', + R.version$major, R.version$minor, format(utils::packageVersion('data.table'))), + creation_time_utc = format(Sys.time(), tz = 'UTC'), + schema = list( + fields = lapply( + seq_along(x), + function(i) list(name = schema_vec$name[i], type = schema_vec$type[i]) + ) + ), + header = col.names, sep = sep, sep2 = sep2, eol = eol, na.strings = na, + dec = dec, qmethod = qmethod, logical01 = logical01 + ) + # NB: as.yaml adds trailing newline + cat('---', yaml::as.yaml(yaml_header, line.sep = eol), '---', sep = eol, file = file) + append = TRUE + } + } file <- enc2native(file) # CfwriteR cannot handle UTF-8 if that is not the native encoding, see #3078. 
.Call(CfwriteR, x, file, sep, sep2, eol, na, dec, quote, qmethod=="escape", append, - row.names, col.names, logical01, dateTimeAs, buffMB, nThread, - showProgress, is_gzip, verbose) + row.names, col.names, logical01, dateTimeAs, buffMB, nThread, + showProgress, is_gzip, verbose) invisible() } diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 92178737d7..fa3ef6784b 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -14401,18 +14401,63 @@ if (test_yaml) { # csvy; #1701 DT_yaml[ , var2 := as.integer(var2)] test(2032.22, fread(f, skip = 'var1,', yaml = TRUE), DT_yaml, warning = 'Combining a search.*YAML.*') + + + # fwrite csvy: #3534 + tmp = tempfile() + DT = data.table(a = 1:5, b = c(pi, 1:4), c = letters[1:5]) + # force eol for platform independence + fwrite(DT, tmp, yaml = TRUE, eol = '\n') + as_read = readLines(tmp) + test(2033.01, as_read[c(1L, 25L)], c('---', '---')) + test(2033.02, grepl('source: R.*data.table.*fwrite', as_read[2L])) + test(2033.03, grepl('creation_time_utc', as_read[3L])) + test(2033.04, as_read[4:24], + c("schema:", " fields:", " - name: a", " type: integer", + " - name: b", " type: numeric", " - name: c", " type: character", + "header: yes", "sep: ','", "sep2:", "- ''", "- '|'", "- ''", + # NB: apparently \n is encoded like this in YAML + "eol: |2+", "", "na.strings: ''", "dec: '.'", "qmethod: double", + "logical01: no", "")) + tbl_body = c("a,b,c", "1,3.14159265358979,a", "2,1,b", "3,2,c", "4,3,d", "5,4,e") + test(2033.05, as_read[26:31], tbl_body) + + # windows eol + fwrite(DT, tmp, yaml = TRUE, eol = '\r\n') + test(2033.06, readLines(tmp)[18L], 'eol: "\\r\\n"') + + # multi-class columns + DT[ , t := .POSIXct(1:5, tz = 'UTC')] + fwrite(DT, tmp, yaml = TRUE) + as_read = readLines(tmp) + test(2033.07, as_read[13L], " type: POSIXct") + + # ~invertibility~ + # fread side needs to be improved for Hugh's colClasses update + DT[ , t := NULL] + fwrite(DT, tmp, yaml = TRUE) + DT2 = fread(tmp, yaml = TRUE) + # remove 
metadata to compare + attr(DT2, 'yaml_metadata') = NULL + test(2033.08, all.equal(DT, DT2)) + + # unsupported operations + test(2033.09, capture.output(fwrite(DT, append = TRUE, yaml = TRUE)), tbl_body[-1L], + warning = 'Skipping yaml writing because append = TRUE') + test(2033.10, capture.output(fwrite(DT, compress = 'gzip', yaml = TRUE)), tbl_body, + warning = 'Skipping yaml writing because is_gzip = TRUE') } # fcast coverage DT = data.table(a = rep(1:2, each = 2), b = rep(1:2, 2), c = 4:1, d = 5:8) -test(2033.1, +test(2034.1, dcast(DT, a ~ b, value.var = list('c', 'd'), fun.aggregate = list(sum)), error = "When 'fun.aggregate' and 'value.var' are both lists") # fread no quote coverage -test(2034.1, fread('A,B\n"foo","ba"r"', quote="''"), error='quote= must be a single character, blank "", or FALSE') -test(2034.2, fread('A,B\n"foo","ba"r"', quote=FALSE), ans<-data.table(A='"foo"', B='"ba"r"')) -test(2034.3, fread('A,B\n"foo","ba"r"', quote=""), ans) +test(2035.1, fread('A,B\n"foo","ba"r"', quote="''"), error='quote= must be a single character, blank "", or FALSE') +test(2035.2, fread('A,B\n"foo","ba"r"', quote=FALSE), ans<-data.table(A='"foo"', B='"ba"r"')) +test(2035.3, fread('A,B\n"foo","ba"r"', quote=""), ans) ################################### diff --git a/man/fread.Rd b/man/fread.Rd index 796234471f..88c2c0071f 100644 --- a/man/fread.Rd +++ b/man/fread.Rd @@ -61,7 +61,7 @@ yaml=FALSE, autostart=NA \item{nThread}{The number of threads to use. 
Experiment to see what works best for your data on your hardware.} \item{logical01}{If TRUE a column containing only 0s and 1s will be read as logical, otherwise as integer.} \item{keepLeadingZeros}{If TRUE a column containing numeric data with leading zeros will be read as character, otherwise leading zeros will be removed and converted to numeric.} - \item{yaml}{ If \code{TRUE}, \code{fread} will attempt to parse (using \code{yaml::yaml.load}) the top of the input as YAML, and further to glean parameters relevant to improving the performance of \code{fread} on the data itself. The entire YAML section is returned as parsed into a \code{list} in the \code{yaml_metadata} attribute. See \code{Details}. } + \item{yaml}{ If \code{TRUE}, \code{fread} will attempt to parse (using \code{\link[yaml]{yaml.load}}) the top of the input as YAML, and further to glean parameters relevant to improving the performance of \code{fread} on the data itself. The entire YAML section is returned as parsed into a \code{list} in the \code{yaml_metadata} attribute. See \code{Details}. } \item{autostart}{ Deprecated and ignored with warning. Please use \code{skip} instead. } } \details{ diff --git a/man/fwrite.Rd b/man/fwrite.Rd index f98314e798..a4c6bbb703 100644 --- a/man/fwrite.Rd +++ b/man/fwrite.Rd @@ -18,6 +18,7 @@ fwrite(x, file = "", append = FALSE, quote = "auto", buffMB = 8L, nThread = getDTthreads(verbose), showProgress = getOption("datatable.showProgress", interactive()), compress = c("auto", "none", "gzip"), + yaml = FALSE, verbose = getOption("datatable.verbose", FALSE)) } \arguments{ @@ -54,10 +55,30 @@ fwrite(x, file = "", append = FALSE, quote = "auto", \item{nThread}{The number of threads to use. Experiment to see what works best for your data on your hardware.} \item{showProgress}{ Display a progress meter on the console? Ignored when \code{file==""}. 
} \item{compress}{If \code{compress = "auto"} and if \code{file} ends in \code{.gz} then output format is gzipped csv else csv. If \code{compress = "none"}, output format is always csv. If \code{compress = "gzip"} then format is gzipped csv. Output to the console is never gzipped even if \code{compress = "gzip"}. By default, \code{compress = "auto"}.} + \item{yaml}{ If \code{TRUE}, \code{fwrite} will output a CSVY file, that is, a CSV file with metadata stored as a YAML header, using \code{\link[yaml]{as.yaml}}. The header is skipped, with a warning, when \code{append = TRUE} or when the output is gzip-compressed. See \code{Details}. } \item{verbose}{Be chatty and report timings?} } \details{ \code{fwrite} began as a community contribution with \href{https://github.com/Rdatatable/data.table/pull/1613}{pull request #1613} by Otto Seiskari. This gave Matt Dowle the impetus to specialize the numeric formatting and to parallelize: \url{http://blog.h2o.ai/2016/04/fast-csv-writing-for-r/}. Final items were tracked in \href{https://github.com/Rdatatable/data.table/issues/1664}{issue #1664} such as automatic quoting, \code{bit64::integer64} support, decimal/scientific formatting exactly matching \code{write.csv} between 2.225074e-308 and 1.797693e+308 to 15 significant figures, \code{row.names}, dates (between 0000-03-01 and 9999-12-31), times and \code{sep2} for \code{list} columns where each cell can itself be a vector. + +\bold{CSVY Support:} + +When \code{yaml = TRUE}, the following fields are written to the header of the file, which is delimited by a \code{---} line above and below: + + \itemize{ + \item{ \code{source} - Contains the R version and \code{data.table} version used to write the file } + \item{ \code{creation_time_utc} - Current timestamp in UTC time just before the header is written } + \item{ \code{schema} with element \code{fields} giving \code{name}-\code{type} (\code{class}) pairs for the table; multi-class objects (e.g. \code{c('POSIXct', 'POSIXt')}) will have their first class written. 
} + \item{ \code{header} - same as \code{col.names} (the corresponding \code{fread} argument is named \code{header}) } + \item{ \code{sep} } + \item{ \code{sep2} } + \item{ \code{eol} } + \item{ \code{na.strings} - same as \code{na} (the corresponding \code{fread} argument is named \code{na.strings}) } + \item{ \code{dec} } + \item{ \code{qmethod} } + \item{ \code{logical01} } + } + } \seealso{ \code{\link{setDTthreads}}, \code{\link{fread}}, \code{\link[utils:write.table]{write.csv}}, \code{\link[utils:write.table]{write.table}}, \href{https://CRAN.R-project.org/package=bit64}{\code{bit64::integer64}}