Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,8 @@
fwrite(DT, "data.csv.gz") # 2MB; 1.6s
identical(fread("data.csv.gz"), DT)
```

* Gains `yaml` argument matching that of `fread`, [#3534](https://github.com/Rdatatable/data.table/issues/3534). See the item in `fread` for a bit more detail; here, we'd like to reiterate that feedback is appreciated in the initial phase of rollout for this feature.

4. Assigning to one item of a list column no longer requires the RHS to be wrapped with `list` or `.()`, [#950](https://github.com/Rdatatable/data.table/issues/950).

Expand Down
4 changes: 2 additions & 2 deletions R/fread.R
Original file line number Diff line number Diff line change
Expand Up @@ -153,10 +153,10 @@ yaml=FALSE, autostart=NA)
# whitespace at the beginning or end of na.strings is checked at C level and is an error there; test 1804
}
if (yaml) {
# for tracking which YAML elements may be overridden by being declared explicitly
call_args = names(match.call())
if (!requireNamespace('yaml', quietly = TRUE))
stop("'data.table' relies on the package 'yaml' to parse the file header; please add this to your library with install.packages('yaml') and try again.") # nocov
# for tracking which YAML elements may be overridden by being declared explicitly
call_args = names(match.call())
if (is.character(skip))
warning("Combining a search string as 'skip' and reading a YAML header may not work as expected -- currently, ",
"reading will proceed to search for 'skip' from the beginning of the file, NOT from the end of ",
Expand Down
37 changes: 35 additions & 2 deletions R/fwrite.R
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ fwrite <- function(x, file="", append=FALSE, quote="auto",
buffMB=8, nThread=getDTthreads(verbose),
showProgress=getOption("datatable.showProgress", interactive()),
compress = c("auto", "none", "gzip"),
yaml = FALSE,
verbose=getOption("datatable.verbose", FALSE)) {
na = as.character(na[1L]) # fix for #1725
if (missing(qmethod)) qmethod = qmethod[1L]
Expand Down Expand Up @@ -73,9 +74,41 @@ fwrite <- function(x, file="", append=FALSE, quote="auto",
return(invisible())
}
}

# process YAML after potentially short-circuiting due to irregularities
if (yaml) {
  # Write a CSVY header (YAML metadata fenced by '---' lines) ahead of the CSV body.
  # The 'yaml' package is an optional dependency, so fail with an actionable message
  # when it is not installed.
  if (!requireNamespace('yaml', quietly = TRUE))
    stop("'data.table' relies on the package 'yaml' to write the file header; please add this to your library with install.packages('yaml') and try again.") # nocov
  if (append || is_gzip) {
    # The header is not written in these modes; warn and fall through so that the
    # CSV body is still written by the call that follows this block.
    if (append) warning("Skipping yaml writing because append = TRUE; YAML will only be written to the top of a file.")
    if (is_gzip) warning("Skipping yaml writing because is_gzip = TRUE; compression of YAML metadata is not supported.")
  } else {
    # Record only the first class of each column (e.g. 'POSIXct' for c('POSIXct', 'POSIXt')).
    # vapply() always yields an unnamed character vector with one entry per column.
    # sapply(x, class) would instead return a matrix when every column shares the same
    # multi-element class, which loses the column names and misaligns the types.
    col_types = vapply(x, function(col) class(col)[1L], character(1L), USE.NAMES = FALSE)
    col_names = names(x)
    yaml_header = list(
      source = sprintf('R[v%s.%s]::data.table[v%s]::fwrite',
                       R.version$major, R.version$minor, format(utils::packageVersion('data.table'))),
      creation_time_utc = format(Sys.time(), tz = 'UTC'),
      schema = list(
        # one list(name=, type=) entry per column, in column order
        fields = lapply(
          seq_along(x),
          function(i) list(name = col_names[i], type = col_types[i])
        )
      ),
      # remaining fields mirror the arguments used to write the body
      header = col.names, sep = sep, sep2 = sep2, eol = eol, na.strings = na,
      dec = dec, qmethod = qmethod, logical01 = logical01
    )
    # NB: as.yaml adds trailing newline
    cat('---', yaml::as.yaml(yaml_header, line.sep = eol), '---', sep = eol, file = file)
    # the header now occupies the top of the file, so the body must be appended after it
    append = TRUE
  }
}
file <- enc2native(file) # CfwriteR cannot handle UTF-8 if that is not the native encoding, see #3078.
.Call(CfwriteR, x, file, sep, sep2, eol, na, dec, quote, qmethod=="escape", append,
row.names, col.names, logical01, dateTimeAs, buffMB, nThread,
showProgress, is_gzip, verbose)
row.names, col.names, logical01, dateTimeAs, buffMB, nThread,
showProgress, is_gzip, verbose)
invisible()
}
53 changes: 49 additions & 4 deletions inst/tests/tests.Rraw
Original file line number Diff line number Diff line change
Expand Up @@ -14401,18 +14401,63 @@ if (test_yaml) { # csvy; #1701
DT_yaml[ , var2 := as.integer(var2)]
test(2032.22, fread(f, skip = 'var1,', yaml = TRUE),
DT_yaml, warning = 'Combining a search.*YAML.*')


# fwrite csvy: #3534
tmp = tempfile()
DT = data.table(a = 1:5, b = c(pi, 1:4), c = letters[1:5])
# force eol for platform independence
fwrite(DT, tmp, yaml = TRUE, eol = '\n')
as_read = readLines(tmp)
test(2033.01, as_read[c(1L, 25L)], c('---', '---'))
test(2033.02, grepl('source: R.*data.table.*fwrite', as_read[2L]))
test(2033.03, grepl('creation_time_utc', as_read[3L]))
test(2033.04, as_read[4:24],
c("schema:", " fields:", " - name: a", " type: integer",
" - name: b", " type: numeric", " - name: c", " type: character",
"header: yes", "sep: ','", "sep2:", "- ''", "- '|'", "- ''",
# NB: apparently \n is encoded like this in YAML
"eol: |2+", "", "na.strings: ''", "dec: '.'", "qmethod: double",
"logical01: no", ""))
tbl_body = c("a,b,c", "1,3.14159265358979,a", "2,1,b", "3,2,c", "4,3,d", "5,4,e")
test(2033.05, as_read[26:31], tbl_body)

# windows eol
fwrite(DT, tmp, yaml = TRUE, eol = '\r\n')
test(2033.06, readLines(tmp)[18L], 'eol: "\\r\\n"')

# multi-class columns
DT[ , t := .POSIXct(1:5, tz = 'UTC')]
fwrite(DT, tmp, yaml = TRUE)
as_read = readLines(tmp)
test(2033.07, as_read[13L], " type: POSIXct")

# ~invertibility~
# fread side needs to be improved for Hugh's colClasses update
DT[ , t := NULL]
fwrite(DT, tmp, yaml = TRUE)
DT2 = fread(tmp, yaml = TRUE)
# remove metadata to compare
attr(DT2, 'yaml_metadata') = NULL
test(2033.08, all.equal(DT, DT2))

# unsupported operations
test(2033.09, capture.output(fwrite(DT, append = TRUE, yaml = TRUE)), tbl_body[-1L],
warning = 'Skipping yaml writing because append = TRUE')
test(2033.10, capture.output(fwrite(DT, compress = 'gzip', yaml = TRUE)), tbl_body,
warning = 'Skipping yaml writing because is_gzip = TRUE')
}

# fcast coverage
DT = data.table(a = rep(1:2, each = 2), b = rep(1:2, 2), c = 4:1, d = 5:8)
test(2033.1,
test(2034.1,
dcast(DT, a ~ b, value.var = list('c', 'd'), fun.aggregate = list(sum)),
error = "When 'fun.aggregate' and 'value.var' are both lists")

# fread no quote coverage
test(2034.1, fread('A,B\n"foo","ba"r"', quote="''"), error='quote= must be a single character, blank "", or FALSE')
test(2034.2, fread('A,B\n"foo","ba"r"', quote=FALSE), ans<-data.table(A='"foo"', B='"ba"r"'))
test(2034.3, fread('A,B\n"foo","ba"r"', quote=""), ans)
test(2035.1, fread('A,B\n"foo","ba"r"', quote="''"), error='quote= must be a single character, blank "", or FALSE')
test(2035.2, fread('A,B\n"foo","ba"r"', quote=FALSE), ans<-data.table(A='"foo"', B='"ba"r"'))
test(2035.3, fread('A,B\n"foo","ba"r"', quote=""), ans)


###################################
Expand Down
2 changes: 1 addition & 1 deletion man/fread.Rd
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ yaml=FALSE, autostart=NA
\item{nThread}{The number of threads to use. Experiment to see what works best for your data on your hardware.}
\item{logical01}{If TRUE a column containing only 0s and 1s will be read as logical, otherwise as integer.}
\item{keepLeadingZeros}{If TRUE a column containing numeric data with leading zeros will be read as character, otherwise leading zeros will be removed and converted to numeric.}
\item{yaml}{ If \code{TRUE}, \code{fread} will attempt to parse (using \code{yaml::yaml.load}) the top of the input as YAML, and further to glean parameters relevant to improving the performance of \code{fread} on the data itself. The entire YAML section is returned as parsed into a \code{list} in the \code{yaml_metadata} attribute. See \code{Details}. }
\item{yaml}{ If \code{TRUE}, \code{fread} will attempt to parse (using \code{\link[yaml]{yaml.load}}) the top of the input as YAML, and further to glean parameters relevant to improving the performance of \code{fread} on the data itself. The entire YAML section is returned as parsed into a \code{list} in the \code{yaml_metadata} attribute. See \code{Details}. }
\item{autostart}{ Deprecated and ignored with warning. Please use \code{skip} instead. }
}
\details{
Expand Down
21 changes: 21 additions & 0 deletions man/fwrite.Rd
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ fwrite(x, file = "", append = FALSE, quote = "auto",
buffMB = 8L, nThread = getDTthreads(verbose),
showProgress = getOption("datatable.showProgress", interactive()),
compress = c("auto", "none", "gzip"),
yaml = FALSE,
verbose = getOption("datatable.verbose", FALSE))
}
\arguments{
Expand Down Expand Up @@ -54,10 +55,30 @@ fwrite(x, file = "", append = FALSE, quote = "auto",
\item{nThread}{The number of threads to use. Experiment to see what works best for your data on your hardware.}
\item{showProgress}{ Display a progress meter on the console? Ignored when \code{file==""}. }
\item{compress}{If \code{compress = "auto"} and if \code{file} ends in \code{.gz} then output format is gzipped csv else csv. If \code{compress = "none"}, output format is always csv. If \code{compress = "gzip"} then format is gzipped csv. Output to the console is never gzipped even if \code{compress = "gzip"}. By default, \code{compress = "auto"}.}
\item{yaml}{ If \code{TRUE}, \code{fwrite} will output a CSVY file, that is, a CSV file with metadata stored as a YAML header, using \code{\link[yaml]{as.yaml}}. Not supported together with \code{append = TRUE} or \code{gzip} compression; in either case the YAML header is skipped with a warning and only the CSV body is written. See \code{Details}. }
\item{verbose}{Be chatty and report timings?}
}
\details{
\code{fwrite} began as a community contribution with \href{https://github.com/Rdatatable/data.table/pull/1613}{pull request #1613} by Otto Seiskari. This gave Matt Dowle the impetus to specialize the numeric formatting and to parallelize: \url{http://blog.h2o.ai/2016/04/fast-csv-writing-for-r/}. Final items were tracked in \href{https://github.com/Rdatatable/data.table/issues/1664}{issue #1664} such as automatic quoting, \code{bit64::integer64} support, decimal/scientific formatting exactly matching \code{write.csv} between 2.225074e-308 and 1.797693e+308 to 15 significant figures, \code{row.names}, dates (between 0000-03-01 and 9999-12-31), times and \code{sep2} for \code{list} columns where each cell can itself be a vector.

\bold{CSVY Support:}

The following fields will be written to the header of the file and surrounded by \code{---} on top and bottom:

\itemize{
\item{ \code{source} - Contains the R version and \code{data.table} version used to write the file }
\item{ \code{creation_time_utc} - Current timestamp in UTC, taken just before the header is written }
\item{ \code{schema} with element \code{fields} giving \code{name}-\code{type} (\code{class}) pairs for the table; multi-class objects (e.g. \code{c('POSIXct', 'POSIXt')}) will have their first class written. }
\item{ \code{header} - same as \code{col.names} (which is \code{header} on input) }
\item{ \code{sep} }
\item{ \code{sep2} }
\item{ \code{eol} }
\item{ \code{na.strings} - same as \code{na} }
\item{ \code{dec} }
\item{ \code{qmethod} }
\item{ \code{logical01} }
}

}
\seealso{
\code{\link{setDTthreads}}, \code{\link{fread}}, \code{\link[utils:write.table]{write.csv}}, \code{\link[utils:write.table]{write.table}}, \href{https://CRAN.R-project.org/package=bit64}{\code{bit64::integer64}}
Expand Down