From f74240b0ea033096cef1d8b6040ae5044ad90758 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Fri, 3 May 2019 16:15:57 +0800 Subject: [PATCH 1/7] Closes #3534 -- adds csvy support for fwrite() --- R/fread.R | 4 ++-- R/fwrite.R | 35 +++++++++++++++++++++++++++++++++-- man/fread.Rd | 2 +- man/fwrite.Rd | 24 ++++++++++++++++++++++++ 4 files changed, 60 insertions(+), 5 deletions(-) diff --git a/R/fread.R b/R/fread.R index 80062c6b19..01895c8930 100644 --- a/R/fread.R +++ b/R/fread.R @@ -153,10 +153,10 @@ yaml=FALSE, autostart=NA) # whitespace at the beginning or end of na.strings is checked at C level and is an error there; test 1804 } if (yaml) { - # for tracking which YAML elements may be overridden by being declared explicitly - call_args = names(match.call()) if (!requireNamespace('yaml', quietly = TRUE)) stop("'data.table' relies on the package 'yaml' to parse the file header; please add this to your library with install.packages('yaml') and try again.") # nocov + # for tracking which YAML elements may be overridden by being declared explicitly + call_args = names(match.call()) if (is.character(skip)) warning("Combining a search string as 'skip' and reading a YAML header may not work as expected -- currently, ", "reading will proceed to search for 'skip' from the beginning of the file, NOT from the end of ", diff --git a/R/fwrite.R b/R/fwrite.R index 4e8c337e59..3f760bb59d 100644 --- a/R/fwrite.R +++ b/R/fwrite.R @@ -8,6 +8,7 @@ fwrite <- function(x, file="", append=FALSE, quote="auto", buffMB=8, nThread=getDTthreads(verbose), showProgress=getOption("datatable.showProgress", interactive()), compress = c("auto", "none", "gzip"), + yaml = FALSE, verbose=getOption("datatable.verbose", FALSE)) { na = as.character(na[1L]) # fix for #1725 if (missing(qmethod)) qmethod = qmethod[1L] @@ -73,9 +74,39 @@ fwrite <- function(x, file="", append=FALSE, quote="auto", return(invisible()) } } + + # process YAML after potentially short-circuiting due to irregularities + if (yaml) { + if (!requireNamespace('yaml', quietly = TRUE)) + stop("'data.table' relies on the package 'yaml' to write the file header; please add this to your library with install.packages('yaml') and try again.") # nocov + if (append || is_gzip) { + if (append) warning("Skipping yaml writing because append = TRUE; YAML will only be written to the top of a file.") + if (is_gzip) warning("Skipping yaml writing because is_gzip = TRUE; compression of YAML metadata is not supported.") + } else { + schema_vec = sapply(x, class) + # multi-class objects reduced to first class + if (is.list(schema_vec)) schema_vec = sapply(schema_vec, `[`, 1L) + # as.vector strips names + schema_vec = list(name = names(schema_vec), type = as.vector(schema_vec)) + yaml_header = list( + source = sprintf('R[v%s.%s]::data.table[v%s]::fwrite', + R.version$major, R.version$minor, format(utils::packageVersion('data.table'))), + creation_time_utc = format(Sys.time(), tz = 'UTC'), + schema = list( + fields = lapply( + seq_along(x), + function(i) list(name = schema_vec$name[i], type = schema_vec$type[i]) + ) + ), + header = col.names, sep = sep, sep2 = sep2, eol = eol, na.strings = na, + dec = dec, qmethod = qmethod, logical01 = logical01 + ) + cat('---', yaml::as.yaml(yaml_header, line.sep = eol), '---', sep = eol, file = file) + } + } file <- enc2native(file) # CfwriteR cannot handle UTF-8 if that is not the native encoding, see #3078. .Call(CfwriteR, x, file, sep, sep2, eol, na, dec, quote, qmethod=="escape", append, - row.names, col.names, logical01, dateTimeAs, buffMB, nThread, - showProgress, is_gzip, verbose) + row.names, col.names, logical01, dateTimeAs, buffMB, nThread, + showProgress, is_gzip, verbose) invisible() } diff --git a/man/fread.Rd b/man/fread.Rd index f0551ff56c..c5694cea9a 100644 --- a/man/fread.Rd +++ b/man/fread.Rd @@ -66,7 +66,7 @@ yaml=FALSE, autostart=NA \item{nThread}{The number of threads to use. Experiment to see what works best for your data on your hardware.} \item{logical01}{If TRUE a column containing only 0s and 1s will be read as logical, otherwise as integer.} \item{keepLeadingZeros}{If TRUE a column containing numeric data with leading zeros will be read as character, otherwise leading zeros will be removed and converted to numeric.} - \item{yaml}{ If \code{TRUE}, \code{fread} will attempt to parse (using \code{yaml::yaml.load}) the top of the input as YAML, and further to glean parameters relevant to improving the performance of \code{fread} on the data itself. The entire YAML section is returned as parsed into a \code{list} in the \code{yaml_metadata} attribute. See \code{Details}. } + \item{yaml}{ If \code{TRUE}, \code{fread} will attempt to parse (using \code{\link[yaml]{yaml.load}}) the top of the input as YAML, and further to glean parameters relevant to improving the performance of \code{fread} on the data itself. The entire YAML section is returned as parsed into a \code{list} in the \code{yaml_metadata} attribute. See \code{Details}. } \item{autostart}{ Deprecated and ignored with warning. Please use \code{skip} instead. } } \details{ diff --git a/man/fwrite.Rd b/man/fwrite.Rd index f98314e798..fe3b3b7d8a 100644 --- a/man/fwrite.Rd +++ b/man/fwrite.Rd @@ -18,6 +18,7 @@ fwrite(x, file = "", append = FALSE, quote = "auto", buffMB = 8L, nThread = getDTthreads(verbose), showProgress = getOption("datatable.showProgress", interactive()), compress = c("auto", "none", "gzip"), + yaml = FALSE, verbose = getOption("datatable.verbose", FALSE)) } \arguments{ @@ -54,10 +55,33 @@ fwrite(x, file = "", append = FALSE, quote = "auto", \item{nThread}{The number of threads to use. Experiment to see what works best for your data on your hardware.} \item{showProgress}{ Display a progress meter on the console? Ignored when \code{file==""}. } \item{compress}{If \code{compress = "auto"} and if \code{file} ends in \code{.gz} then output format is gzipped csv else csv. If \code{compress = "none"}, output format is always csv. If \code{compress = "gzip"} then format is gzipped csv. Output to the console is never gzipped even if \code{compress = "gzip"}. By default, \code{compress = "auto"}.} + \item{yaml}{ If \code{TRUE}, \code{fwrite} will output a CSVY file, that is, a CSV file with metadata stored as a YAML header, using \code{\link[yaml]{as.yaml}}. Incompatible with \code{append = TRUE}. See \code{Details}. } \item{verbose}{Be chatty and report timings?} } \details{ \code{fwrite} began as a community contribution with \href{https://github.com/Rdatatable/data.table/pull/1613}{pull request #1613} by Otto Seiskari. This gave Matt Dowle the impetus to specialize the numeric formatting and to parallelize: \url{http://blog.h2o.ai/2016/04/fast-csv-writing-for-r/}. Final items were tracked in \href{https://github.com/Rdatatable/data.table/issues/1664}{issue #1664} such as automatic quoting, \code{bit64::integer64} support, decimal/scientific formatting exactly matching \code{write.csv} between 2.225074e-308 and 1.797693e+308 to 15 significant figures, \code{row.names}, dates (between 0000-03-01 and 9999-12-31), times and \code{sep2} for \code{list} columns where each cell can itself be a vector. + +\bold{CSVY Support:} + +The following fields will be written to the header of the file and surrounded by \code{---} on top and bottom: +list( + source = sprintf('R[v%s.%s]::data.table[v%s]::fwrite', + R.version$major, R.version$minor, format(utils::packageVersion('data.table'))), + creation_time_utc = format(Sys.time(), tz = 'UTC'), + schema = list( + fields = lapply( + seq_along(x), + function(i) list(name = schema_vec$name[i], type = schema_vec$type[i]) + ) + ), + header = col.names, sep = sep, sep2 = sep2, eol = eol, na.strings = na, + dec = dec, qmethod = qmethod, logical01 = logical01 + ) + \itemize{ + \item{\code{source} - Contains the R version and \code{data.table} version used to write the file} + \item{\code{creation_time_utc} - Current timestamp in UTC time just before the header is written + } + } \seealso{ \code{\link{setDTthreads}}, \code{\link{fread}}, \code{\link[utils:write.table]{write.csv}}, \code{\link[utils:write.table]{write.table}}, \href{https://CRAN.R-project.org/package=bit64}{\code{bit64::integer64}} From c720b053794dedeb25eb849085b4bdcb459091bf Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Fri, 3 May 2019 18:06:25 +0800 Subject: [PATCH 2/7] first pass finished; now needs tests --- R/fwrite.R | 2 ++ man/fwrite.Rd | 29 +++++++++++++---------------- 2 files changed, 15 insertions(+), 16 deletions(-) diff --git a/R/fwrite.R b/R/fwrite.R index 3f760bb59d..9b44ef8ec1 100644 --- a/R/fwrite.R +++ b/R/fwrite.R @@ -101,7 +101,9 @@ fwrite <- function(x, file="", append=FALSE, quote="auto", header = col.names, sep = sep, sep2 = sep2, eol = eol, na.strings = na, dec = dec, qmethod = qmethod, logical01 = logical01 ) + # NB: as.yaml adds trailing newline cat('---', yaml::as.yaml(yaml_header, line.sep = eol), '---', sep = eol, file = file) + append = TRUE } } file <- enc2native(file) # CfwriteR cannot handle UTF-8 if that is not the native encoding, see #3078. diff --git a/man/fwrite.Rd b/man/fwrite.Rd index fe3b3b7d8a..a4c6bbb703 100644 --- a/man/fwrite.Rd +++ b/man/fwrite.Rd @@ -55,7 +55,7 @@ fwrite(x, file = "", append = FALSE, quote = "auto", \item{nThread}{The number of threads to use. Experiment to see what works best for your data on your hardware.} \item{showProgress}{ Display a progress meter on the console? Ignored when \code{file==""}. } \item{compress}{If \code{compress = "auto"} and if \code{file} ends in \code{.gz} then output format is gzipped csv else csv. If \code{compress = "none"}, output format is always csv. If \code{compress = "gzip"} then format is gzipped csv. Output to the console is never gzipped even if \code{compress = "gzip"}. By default, \code{compress = "auto"}.} - \item{yaml}{ If \code{TRUE}, \code{fwrite} will output a CSVY file, that is, a CSV file with metadata stored as a YAML header, using \code{\link[yaml]{as.yaml}}. Incompatible with \code{append = TRUE}. See \code{Details}. } + \item{yaml}{ If \code{TRUE}, \code{fwrite} will output a CSVY file, that is, a CSV file with metadata stored as a YAML header, using \code{\link[yaml]{as.yaml}}. Incompatible with \code{append = TRUE} or \code{gzip} compression. See \code{Details}. } \item{verbose}{Be chatty and report timings?} } \details{ @@ -64,22 +64,19 @@ fwrite(x, file = "", append = FALSE, quote = "auto", \bold{CSVY Support:} The following fields will be written to the header of the file and surrounded by \code{---} on top and bottom: -list( - source = sprintf('R[v%s.%s]::data.table[v%s]::fwrite', - R.version$major, R.version$minor, format(utils::packageVersion('data.table'))), - creation_time_utc = format(Sys.time(), tz = 'UTC'), - schema = list( - fields = lapply( - seq_along(x), - function(i) list(name = schema_vec$name[i], type = schema_vec$type[i]) - ) - ), - header = col.names, sep = sep, sep2 = sep2, eol = eol, na.strings = na, - dec = dec, qmethod = qmethod, logical01 = logical01 - ) + \itemize{ - \item{\code{source} - Contains the R version and \code{data.table} version used to write the file} - \item{\code{creation_time_utc} - Current timestamp in UTC time just before the header is written + \item{ \code{source} - Contains the R version and \code{data.table} version used to write the file } + \item{ \code{creation_time_utc} - Current timestamp in UTC time just before the header is written } + \item{ \code{schema} with element \code{fields} giving \code{name}-\code{type} (\code{class}) pairs for the table; multi-class objects (e.g. \code{c('POSIXct', 'POSIXt')}) will have their first class written. } + \item{ \code{header} - same as \code{col.names} (which is \code{header} on input) } + \item{ \code{sep} } + \item{ \code{sep2} } + \item{ \code{eol} } + \item{ \code{na.strings} - same as \code{na} } + \item{ \code{dec} } + \item{ \code{qmethod} } + \item{ \code{logical01} } } } From 36a2ccbe06ac55ca8c34d32c158a7eb736ea498e Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Fri, 3 May 2019 18:38:34 +0800 Subject: [PATCH 3/7] adding tests --- inst/tests/tests.Rraw | 41 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index cd97c0cfc2..38be9af813 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -14368,9 +14368,50 @@ if (test_yaml) { # csvy; #1701 DT_yaml[ , var2 := as.integer(var2)] test(2032.22, fread(f, skip = 'var1,', yaml = TRUE), DT_yaml, warning = 'Combining a search.*YAML.*') + + + # fwrite csvy: #3534 + tmp = tempfile() + DT = data.table(a = 1:5, b = c(pi, 1:4), c = letters[1:5]) + fwrite(DT, tmp, yaml = TRUE) + as_read = readLines(tmp) + test(2033.1, as_read[c(1L, 25L)], c('---', '---')) + test(2033.2, grepl('source: R.*data.table.*fwrite', as_read[2L])) + test(2033.3, grepl('creation_time_utc', as_read[3L])) + test(2033.4, as_read[4:24], + c("schema:", " fields:", " - name: a", " type: integer", + " - name: b", " type: numeric", " - name: c", " type: character", + "header: yes", "sep: ','", "sep2:", "- ''", "- '|'", "- ''", + # NB: apparently \n is encoded like this in YAML + "eol: |2+", "", "na.strings: ''", "dec: '.'", "qmethod: double", + "logical01: no", "")) + tbl_body = c("a,b,c", "1,3.14159265358979,a", "2,1,b", "3,2,c", "4,3,d", "5,4,e") + test(2033.5, as_read[26:31], tbl_body) + + # multi-class columns + DT[ , t := .POSIXct(1:5, tz = 'UTC')] + fwrite(DT, tmp, yaml = TRUE) + as_read = readLines(tmp) + test(2033.5, as_read[13L], " type: POSIXct") + + # ~invertibility~ + # fread side needs to be improved for Hugh's colClasses update + DT[ , t := NULL] + fwrite(DT, tmp, yaml = TRUE) + DT2 = fread(tmp, yaml = TRUE) + # remove metadata to compare + attr(DT2, 'yaml_metadata') = NULL + test(2033.6, all.equal(DT, DT2)) + + # unsupported operations + test(2033.7, capture.output(fwrite(DT, append = TRUE, yaml = TRUE)), tbl_body, + warning = 'Skipping yaml writing because append = TRUE') + test(2033.8, capture.output(fwrite(DT, compress = 'gzip', yaml = TRUE)), tbl_body, + warning = 'Skipping yaml writing because is_gzip = TRUE') } + ################################### # Add new tests above this line # ################################### From ade071bcf69e0dc257fa13a4e9fb188b2f2bb79e Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Sat, 4 May 2019 13:53:42 +0800 Subject: [PATCH 4/7] fix append test --- inst/tests/tests.Rraw | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 38be9af813..411bb5984b 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -14404,14 +14404,12 @@ if (test_yaml) { # csvy; #1701 test(2033.6, all.equal(DT, DT2)) # unsupported operations - test(2033.7, capture.output(fwrite(DT, append = TRUE, yaml = TRUE)), tbl_body, + test(2033.7, capture.output(fwrite(DT, append = TRUE, yaml = TRUE)), tbl_body[-1L], warning = 'Skipping yaml writing because append = TRUE') test(2033.8, capture.output(fwrite(DT, compress = 'gzip', yaml = TRUE)), tbl_body, warning = 'Skipping yaml writing because is_gzip = TRUE') } - - ################################### # Add new tests above this line # ################################### From c0532831d5407d4354ba5d4501259ad57d2112dc Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Sat, 4 May 2019 14:08:33 +0800 Subject: [PATCH 5/7] add NEWS item --- NEWS.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/NEWS.md b/NEWS.md index 521ab3907a..86e559e6c7 100644 --- a/NEWS.md +++ b/NEWS.md @@ -22,6 +22,8 @@ fwrite(DT, "data.csv.gz") # 2MB; 1.6s identical(fread("data.csv.gz"), DT) ``` + + * Gains `yaml` argument matching that of `fread`, [#3534](https://github.com/Rdatatable/data.table/issues/3534). See the item in `fread` for a bit more detail; here, we'd like to reiterate that feedback is appreciated in the initial phase of rollout for this feature. 4. Assigning to one item of a list column no longer requires the RHS to be wrapped with `list` or `.()`, [#950](https://github.com/Rdatatable/data.table/issues/950). From 3dbcd07ad6a0b5aea669ac24f1537329f2fe2d1c Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Sat, 4 May 2019 14:36:26 +0800 Subject: [PATCH 6/7] force eol for windows test of fwrite --- inst/tests/tests.Rraw | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 411bb5984b..90c6946c0f 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -14373,7 +14373,8 @@ if (test_yaml) { # csvy; #1701 # fwrite csvy: #3534 tmp = tempfile() DT = data.table(a = 1:5, b = c(pi, 1:4), c = letters[1:5]) - fwrite(DT, tmp, yaml = TRUE) + # force eol to compare correctly as.yaml's treatment thereof + fwrite(DT, tmp, yaml = TRUE, eol = '\n') as_read = readLines(tmp) test(2033.1, as_read[c(1L, 25L)], c('---', '---')) test(2033.2, grepl('source: R.*data.table.*fwrite', as_read[2L])) From 42b6df9d1d8011977acb5f15457bf81774e9ebe7 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Sat, 4 May 2019 18:52:05 +0800 Subject: [PATCH 7/7] new test with windows eol --- inst/tests/tests.Rraw | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 90c6946c0f..a5028c4cb8 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -14373,13 +14373,13 @@ if (test_yaml) { # csvy; #1701 # fwrite csvy: #3534 tmp = tempfile() DT = data.table(a = 1:5, b = c(pi, 1:4), c = letters[1:5]) - # force eol to compare correctly as.yaml's treatment thereof + # force eol for platform independence fwrite(DT, tmp, yaml = TRUE, eol = '\n') as_read = readLines(tmp) - test(2033.1, as_read[c(1L, 25L)], c('---', '---')) - test(2033.2, grepl('source: R.*data.table.*fwrite', as_read[2L])) - test(2033.3, grepl('creation_time_utc', as_read[3L])) - test(2033.4, as_read[4:24], + test(2033.01, as_read[c(1L, 25L)], c('---', '---')) + test(2033.02, grepl('source: R.*data.table.*fwrite', as_read[2L])) + test(2033.03, grepl('creation_time_utc', as_read[3L])) + test(2033.04, as_read[4:24], c("schema:", " fields:", " - name: a", " type: integer", " - name: b", " type: numeric", " - name: c", " type: character", "header: yes", "sep: ','", "sep2:", "- ''", "- '|'", "- ''", @@ -14387,13 +14387,17 @@ if (test_yaml) { # csvy; #1701 "eol: |2+", "", "na.strings: ''", "dec: '.'", "qmethod: double", "logical01: no", "")) tbl_body = c("a,b,c", "1,3.14159265358979,a", "2,1,b", "3,2,c", "4,3,d", "5,4,e") - test(2033.5, as_read[26:31], tbl_body) + test(2033.05, as_read[26:31], tbl_body) + + # windows eol + fwrite(DT, tmp, yaml = TRUE, eol = '\r\n') + test(2033.06, readLines(tmp)[18L], 'eol: "\\r\\n"') # multi-class columns DT[ , t := .POSIXct(1:5, tz = 'UTC')] fwrite(DT, tmp, yaml = TRUE) as_read = readLines(tmp) - test(2033.5, as_read[13L], " type: POSIXct") + test(2033.07, as_read[13L], " type: POSIXct") # ~invertibility~ # fread side needs to be improved for Hugh's colClasses update @@ -14402,12 +14406,12 @@ if (test_yaml) { # csvy; #1701 DT2 = fread(tmp, yaml = TRUE) # remove metadata to compare attr(DT2, 'yaml_metadata') = NULL - test(2033.6, all.equal(DT, DT2)) + test(2033.08, all.equal(DT, DT2)) # unsupported operations - test(2033.7, capture.output(fwrite(DT, append = TRUE, yaml = TRUE)), tbl_body[-1L], + test(2033.09, capture.output(fwrite(DT, append = TRUE, yaml = TRUE)), tbl_body[-1L], warning = 'Skipping yaml writing because append = TRUE') - test(2033.8, capture.output(fwrite(DT, compress = 'gzip', yaml = TRUE)), tbl_body, + test(2033.10, capture.output(fwrite(DT, compress = 'gzip', yaml = TRUE)), tbl_body, warning = 'Skipping yaml writing because is_gzip = TRUE') }