Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
9eca5d0
Implements with_bom in fwrite
May 21, 2019
18ff4a0
with_bom is compatible with yaml
May 21, 2019
dd17a42
Add with_bom in documentation
May 21, 2019
e8d92c2
Add tests for fwrite UTF-8 with bom
May 22, 2019
0d9a272
Remove useless debug
May 22, 2019
322aa41
using integers
MichaelChirico May 22, 2019
4e28d61
Rename 'with_bom' parameter in 'bom'
May 22, 2019
cd6dd91
Use TRUE and FALSE in tests
May 22, 2019
9b67de8
Remove useless test
May 22, 2019
f877558
When appending in existing file, bom is set to FALSE
May 22, 2019
38bd125
Merge branch 'master' into fwrite_withbom
mattdowle May 22, 2019
bebaced
Made tests 2033.06 and 2033.07 pass but I don't follow why. Will foll…
mattdowle May 22, 2019
e6f12d7
fix append problem for bom writing
May 23, 2019
5447e91
add some tests
May 23, 2019
951bd57
windows problem suggests eol issues
May 23, 2019
49d28f5
different file?
May 23, 2019
2a85b0a
readLines needs warn=FALSE
May 23, 2019
e842474
trying with resetting the file instead of rawToChar
May 23, 2019
61d935a
Add nocov
May 23, 2019
db39ba3
Add a NEWS item for fwrite bom
May 23, 2019
5024343
moved yaml write to C level as Philippe suggested; now supports gzip'…
mattdowle May 23, 2019
29216d3
tidy and trace
mattdowle May 24, 2019
bbeb615
more tracing
mattdowle May 24, 2019
8181ea1
more tracing
mattdowle May 24, 2019
1f0fbb4
one down one to go, hopefully
mattdowle May 24, 2019
28d6b44
same for-Windows-only fix applied to 2nd test
mattdowle May 24, 2019
737600b
trace 48 vs 49 difference on Windows for test 2033.11 and 2033.13
mattdowle May 24, 2019
14b6e23
deal with blank line difference after 'eol: |2+'
mattdowle May 24, 2019
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,8 @@

* Gains `yaml` argument matching that of `fread`, [#3534](https://github.com/Rdatatable/data.table/issues/3534). See the item in `fread` for a bit more detail; here, we'd like to reiterate that feedback is appreciated in the initial phase of rollout for this feature.

* Gains `bom` argument to add a *byte order mark* (BOM) at the beginning of the file to signal that the file is encoded in UTF-8, [#3488](https://github.com/Rdatatable/data.table/issues/3488). Thanks to Stefan Fleck for requesting and Philippe Chataignon for implementing.

4. Assigning to one item of a list column no longer requires the RHS to be wrapped with `list` or `.()`, [#950](https://github.com/Rdatatable/data.table/issues/950).

```R
Expand Down
72 changes: 35 additions & 37 deletions R/fwrite.R
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ fwrite <- function(x, file="", append=FALSE, quote="auto",
showProgress=getOption("datatable.showProgress", interactive()),
compress = c("auto", "none", "gzip"),
yaml = FALSE,
bom = FALSE,
verbose=getOption("datatable.verbose", FALSE)) {
na = as.character(na[1L]) # fix for #1725
if (missing(qmethod)) qmethod = qmethod[1L]
Expand All @@ -30,7 +31,7 @@ fwrite <- function(x, file="", append=FALSE, quote="auto",
# validate arguments
if (is.matrix(x)) { # coerce to data.table if input object is matrix
message("x being coerced from class: matrix to data.table")
x <- as.data.table(x)
x = as.data.table(x)
}
stopifnot(is.list(x),
identical(quote,"auto") || isTRUEorFALSE(quote),
Expand All @@ -43,17 +44,22 @@ fwrite <- function(x, file="", append=FALSE, quote="auto",
length(compress) == 1L && compress %chin% c("auto", "none", "gzip"),
isTRUEorFALSE(col.names), isTRUEorFALSE(append), isTRUEorFALSE(row.names),
isTRUEorFALSE(verbose), isTRUEorFALSE(showProgress), isTRUEorFALSE(logical01),
isTRUEorFALSE(bom),
length(na) == 1L, #1725, handles NULL or character(0) input
is.character(file) && length(file)==1L && !is.na(file),
length(buffMB)==1L && !is.na(buffMB) && 1L<=buffMB && buffMB<=1024,
length(nThread)==1L && !is.na(nThread) && nThread>=1L
)

is_gzip <- compress == "gzip" || (compress == "auto" && grepl("\\.gz$", file))
is_gzip = compress == "gzip" || (compress == "auto" && grepl("\\.gz$", file))

file <- path.expand(file) # "~/foo/bar"
if (append && missing(col.names) && (file=="" || file.exists(file)))
col.names = FALSE # test 1658.16 checks this
file = path.expand(file) # "~/foo/bar"
if (append && (file=="" || file.exists(file))) {
if (missing(col.names)) col.names = FALSE
if (verbose) cat("Appending to existing file so setting bom=FALSE and yaml=FALSE\n")
bom = FALSE
yaml = FALSE
}
if (identical(quote,"auto")) quote=NA # logical NA
if (file=="") {
# console output which it seems isn't thread safe on Windows even when one-batch-at-a-time
Expand All @@ -74,41 +80,33 @@ fwrite <- function(x, file="", append=FALSE, quote="auto",
return(invisible())
}
}

# process YAML after potentially short-circuiting due to irregularities
if (yaml) {
if (!requireNamespace('yaml', quietly = TRUE))
yaml = if (!yaml) "" else {
if (!requireNamespace('yaml', quietly=TRUE))
stop("'data.table' relies on the package 'yaml' to write the file header; please add this to your library with install.packages('yaml') and try again.") # nocov
if (append || is_gzip) {
if (append) warning("Skipping yaml writing because append = TRUE; YAML will only be written to the top of a file.")
if (is_gzip) warning("Skipping yaml writing because is_gzip = TRUE; compression of YAML metadata is not supported.")
} else {
schema_vec = sapply(x, class)
# multi-class objects reduced to first class
if (is.list(schema_vec)) schema_vec = sapply(schema_vec, `[`, 1L)
# as.vector strips names
schema_vec = list(name = names(schema_vec), type = as.vector(schema_vec))
yaml_header = list(
source = sprintf('R[v%s.%s]::data.table[v%s]::fwrite',
R.version$major, R.version$minor, format(tryCatch(utils::packageVersion('data.table'), error=function(e)'DEV'))),
creation_time_utc = format(Sys.time(), tz = 'UTC'),
schema = list(
fields = lapply(
seq_along(x),
function(i) list(name = schema_vec$name[i], type = schema_vec$type[i])
)
),
header = col.names, sep = sep, sep2 = sep2, eol = eol, na.strings = na,
dec = dec, qmethod = qmethod, logical01 = logical01
)
# NB: as.yaml adds trailing newline
cat('---', yaml::as.yaml(yaml_header, line.sep = eol), '---', sep = eol, file = file)
append = TRUE
}
schema_vec = sapply(x, class)
# multi-class objects reduced to first class
if (is.list(schema_vec)) schema_vec = sapply(schema_vec, `[`, 1L)
# as.vector strips names
schema_vec = list(name=names(schema_vec), type=as.vector(schema_vec))
yaml_header = list(
source = sprintf('R[v%s.%s]::data.table[v%s]::fwrite',
R.version$major, R.version$minor, format(tryCatch(utils::packageVersion('data.table'), error=function(e) 'DEV'))),
creation_time_utc = format(Sys.time(), tz='UTC'),
schema = list(
fields = lapply(
seq_along(x),
function(i) list(name=schema_vec$name[i], type=schema_vec$type[i])
)
),
header=col.names, sep=sep, sep2=sep2, eol=eol, na.strings=na,
dec=dec, qmethod=qmethod, logical01=logical01
)
paste0('---', eol, yaml::as.yaml(yaml_header, line.sep=eol), '---', eol) # NB: as.yaml adds trailing newline
}
file <- enc2native(file) # CfwriteR cannot handle UTF-8 if that is not the native encoding, see #3078.
file = enc2native(file) # CfwriteR cannot handle UTF-8 if that is not the native encoding, see #3078.
.Call(CfwriteR, x, file, sep, sep2, eol, na, dec, quote, qmethod=="escape", append,
row.names, col.names, logical01, dateTimeAs, buffMB, nThread,
showProgress, is_gzip, verbose)
showProgress, is_gzip, bom, yaml, verbose)
invisible()
}

67 changes: 52 additions & 15 deletions inst/tests/tests.Rraw
Original file line number Diff line number Diff line change
Expand Up @@ -6514,13 +6514,13 @@ if (test_xts) {
setcolorder(dt, c(2, 3, 1))
dt[ , char_col := 'a']
test(1465.17, as.xts(dt), xt, warning = 'columns are not numeric')

# 890 -- key argument for as.data.table.xts
x = xts(1:10, as.Date(1:10, origin = "1970-01-01"))
test(1465.18, capture.output(as.data.table(x, key="index")),
c(" index V1", " 1: 1970-01-02 1", " 2: 1970-01-03 2",
" 3: 1970-01-04 3", " 4: 1970-01-05 4", " 5: 1970-01-06 5",
" 6: 1970-01-07 6", " 7: 1970-01-08 7", " 8: 1970-01-09 8",
c(" index V1", " 1: 1970-01-02 1", " 2: 1970-01-03 2",
" 3: 1970-01-04 3", " 4: 1970-01-05 4", " 5: 1970-01-06 5",
" 6: 1970-01-07 6", " 7: 1970-01-08 7", " 8: 1970-01-09 8",
" 9: 1970-01-10 9", "10: 1970-01-11 10"))

Sys.setenv("_R_CHECK_LENGTH_1_LOGIC2_" = TRUE)
Expand Down Expand Up @@ -9466,7 +9466,7 @@ test(1658.25, fwrite(ok_dt, quote=TRUE), output='"foo"\n"bar"')
# integer NA
DT = data.table(A=c(2L,NA,3L), B=c(NA,4:5))
test(1658.26, fwrite(DT), output='A,B\n2,\n,4\n3,5')
test(1658.27, fwrite(DT, na="NA", verbose=TRUE), output='Writing column names.*"A","B".*2,NA\nNA,4\n3,5')
test(1658.27, fwrite(DT, na="NA", verbose=TRUE), output='Writing bom .false., yaml .0 characters. and column names .true.*"A","B".*2,NA\nNA,4\n3,5')

# wrong argument types
test(1658.28, fwrite(ok_dt, 1), error="is.character\\(file\\).*not TRUE")
Expand Down Expand Up @@ -9511,6 +9511,24 @@ test(1658.46, fwrite(DT), error="Row 3 of list column is type 'complex'")
DT[3,b:=factor(letters[1:3])]
test(1658.47, fwrite(DT), error="Row 3 of list column is type 'factor'")

# fwrite bom
DT = data.table(l=letters, n=1:26)
fwrite(DT, f1<-tempfile(), bom=TRUE)
f1con = file(f1, encoding="UTF-8") # Windows readLines needs to be told otherwise it thinks n_lines==1
test(1658.48, length(readLines(f1con)), 27L)
test(1658.49, readBin(f1, raw(), 6L), as.raw(c(0xef, 0xbb, 0xbf, 0x6c, 0x2c, 0x6e)))
close(f1con)
fwrite(DT, f2<-tempfile(), bom=FALSE)
test(1658.50, readBin(f2, raw(), 3L), as.raw(c(0x6c, 0x2c, 0x6e)))
# re-write to the same file should overwrite.
# Windows seems to cache the connection to f2 and fails on a subsequent read, hence using file(,encoding="UTF-8")
fwrite(DT, f2, bom=TRUE)
f2con = file(f2, encoding="UTF-8")
test(1658.51, length(readLines(f2con)), 27L)
close(f2con)
test(1658.52, file.info(f1)$size, file.info(f2)$size)
unlink(c(f1, f2))

## End fwrite tests

# tests for #679, inrange(), FR #707
Expand Down Expand Up @@ -14432,18 +14450,18 @@ if (test_yaml) { # csvy; #1701
# force eol for platform independence
fwrite(DT, tmp, yaml = TRUE, eol = '\n')
as_read = readLines(tmp)
test(2033.01, as_read[c(1L, 25L)], c('---', '---'))
test(2033.01, as_read[c(1L, 24L)], c('---', '---'))
test(2033.02, grepl('source: R.*data.table.*fwrite', as_read[2L]))
test(2033.03, grepl('creation_time_utc', as_read[3L]))
test(2033.04, as_read[4:24],
test(2033.04, as_read[4:23],
c("schema:", " fields:", " - name: a", " type: integer",
" - name: b", " type: numeric", " - name: c", " type: character",
"header: yes", "sep: ','", "sep2:", "- ''", "- '|'", "- ''",
# NB: apparently \n is encoded like this in YAML
"eol: |2+", "", "na.strings: ''", "dec: '.'", "qmethod: double",
"logical01: no", ""))
"logical01: no"))
tbl_body = c("a,b,c", "1,3.14159265358979,a", "2,1,b", "3,2,c", "4,3,d", "5,4,e")
test(2033.05, as_read[26:31], tbl_body)
test(2033.05, as_read[25:30], tbl_body)

# windows eol
fwrite(DT, tmp, yaml = TRUE, eol = '\r\n')
Expand All @@ -14464,11 +14482,30 @@ if (test_yaml) { # csvy; #1701
attr(DT2, 'yaml_metadata') = NULL
test(2033.08, all.equal(DT, DT2))

# unsupported operations
test(2033.09, capture.output(fwrite(DT, append = TRUE, yaml = TRUE)), tbl_body[-1L],
warning = 'Skipping yaml writing because append = TRUE')
test(2033.10, capture.output(fwrite(DT, compress = 'gzip', yaml = TRUE)), tbl_body,
warning = 'Skipping yaml writing because is_gzip = TRUE')
test(2033.09, fwrite(DT, append=TRUE, yaml=TRUE, verbose=TRUE),
output = paste0(c('Appending to existing file so setting bom=FALSE and yaml=FALSE', tbl_body[-1L]), collapse=".*"))

# TODO: test gzip'd yaml which is now supported

# yaml + bom arguments
DT = data.table(l=letters, n=1:26)
fwrite(DT, f<-tempfile(), bom=TRUE, yaml=TRUE)
fcon = file(f, encoding="UTF-8") # Windows readLines needs to be told; see also test 1658.50
lines = readLines(fcon)
lines = lines[lines!=""] # an extra "" after "eol: |2+" (line 16) on Linux but not Windows
# remove the blank here so we don't need to change this test if/when that changes in yaml package
test(2033.11, length(lines), 48L)
close(fcon)
test(2033.12, readBin(f, raw(), 6L), as.raw(c(0xef, 0xbb, 0xbf, 0x2d, 0x2d, 0x2d)))
# re-write should have same output (not appended)
fwrite(DT, f<-tempfile(), bom=TRUE, yaml=TRUE)
fcon = file(f, encoding="UTF-8")
lines = readLines(fcon)
lines = lines[lines!=""]
test(2033.13, length(lines), 48L)
close(fcon)
test(2033.14, fread(f), DT)
unlink(f)
}

# fcast coverage
Expand Down Expand Up @@ -14782,7 +14819,7 @@ test(2045.15, d1[d2, verbose = TRUE], cbind(d1, X1 = d2$X1), output="natural joi
options(datatable.naturaljoin=FALSE)

#tests for adding key to as.data.table, #890
## as.data.table.numeric (should cover as.data.table.factor,
## as.data.table.numeric (should cover as.data.table.factor,
## *.ordered, *.integer, *.logical, *.character, and *.Date since
## all are the same function in as.data.table.R)
nn = c(a=0.1, c=0.2, b=0.3, d=0.4)
Expand Down
4 changes: 3 additions & 1 deletion man/fwrite.Rd
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ fwrite(x, file = "", append = FALSE, quote = "auto",
showProgress = getOption("datatable.showProgress", interactive()),
compress = c("auto", "none", "gzip"),
yaml = FALSE,
bom = FALSE,
verbose = getOption("datatable.verbose", FALSE))
}
\arguments{
Expand Down Expand Up @@ -55,7 +56,8 @@ fwrite(x, file = "", append = FALSE, quote = "auto",
\item{nThread}{The number of threads to use. Experiment to see what works best for your data on your hardware.}
\item{showProgress}{ Display a progress meter on the console? Ignored when \code{file==""}. }
\item{compress}{If \code{compress = "auto"} and if \code{file} ends in \code{.gz} then output format is gzipped csv else csv. If \code{compress = "none"}, output format is always csv. If \code{compress = "gzip"} then format is gzipped csv. Output to the console is never gzipped even if \code{compress = "gzip"}. By default, \code{compress = "auto"}.}
\item{yaml}{ If \code{TRUE}, \code{fwrite} will output a CSVY file, that is, a CSV file with metadata stored as a YAML header, using \code{\link[yaml]{as.yaml}}. Incompatible with \code{append = TRUE} or \code{gzip} compression. See \code{Details}. }
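\item{yaml}{If \code{TRUE}, \code{fwrite} will output a CSVY file, that is, a CSV file with metadata stored as a YAML header, using \code{\link[yaml]{as.yaml}}. The header is not written when \code{append=TRUE} and \code{file} already exists (or output is to the console). See \code{Details}. }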
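\item{bom}{If \code{TRUE}, a BOM (Byte Order Mark) sequence (EF BB BF) is written at the beginning of the file, i.e. the output format is 'UTF-8 with BOM'. Treated as \code{FALSE} when \code{append=TRUE} and \code{file} already exists (or output is to the console).}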
\item{verbose}{Be chatty and report timings?}
}
\details{
Expand Down
35 changes: 23 additions & 12 deletions src/fwrite.c
Original file line number Diff line number Diff line change
Expand Up @@ -672,28 +672,39 @@ void fwriteMain(fwriteMainArgs args)
}
}

int yamlLen = strlen(args.yaml);
if (args.verbose) {
DTPRINT("Writing column names ... ");
DTPRINT("Writing bom (%s), yaml (%d characters) and column names (%s) ... ",
args.bom?"true":"false", yamlLen, args.colNames?"true":"false");
if (f==-1) DTPRINT("\n");
}
size_t headerLen = 0;
if (args.bom) headerLen += 3;
headerLen += yamlLen;
if (args.colNames) {
size_t headerLen = 0;
for (int j=0; j<args.ncol; j++) headerLen += getStringLen(args.colNames, j)*2; // *2 in case quotes are escaped or doubled
headerLen += args.ncol*(1/*sep*/+(doQuote!=0)*2) + eolLen + 3; // 3 in case doRowNames and doQuote (the first blank <<"",>> column name)
}
if (headerLen) {
char *buff = malloc(headerLen);
if (!buff) STOP("Unable to allocate %d MiB for header: %s", headerLen / 1024 / 1024, strerror(errno));
char *ch = buff;
if (args.doRowNames) {
// Unusual: the extra blank column name when row_names are added as the first column
if (doQuote!=0/*'auto'(NA) or true*/) { *ch++='"'; *ch++='"'; } // to match write.csv
*ch++ = sep;
}
for (int j=0; j<args.ncol; j++) {
writeString(args.colNames, j, &ch);
*ch++ = sep;
if (args.bom) {*ch++=0xEF; *ch++=0xBB; *ch++=0xBF; } // 3 appears above (search for "bom")
memcpy(ch, args.yaml, yamlLen);
ch += yamlLen;
if (args.colNames) {
if (args.doRowNames) {
// Unusual: the extra blank column name when row_names are added as the first column
if (doQuote!=0/*'auto'(NA) or true*/) { *ch++='"'; *ch++='"'; } // to match write.csv
*ch++ = sep;
}
for (int j=0; j<args.ncol; j++) {
writeString(args.colNames, j, &ch);
*ch++ = sep;
}
ch--; // backup over the last sep
write_chars(args.eol, &ch);
}
ch--; // backup over the last sep
write_chars(args.eol, &ch);
if (f==-1) {
*ch = '\0';
DTPRINT(buff);
Expand Down
2 changes: 2 additions & 0 deletions src/fwrite.h
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,8 @@ typedef struct fwriteMainArgs
int nth;
bool showProgress;
bool is_gzip;
bool bom;
const char *yaml;
bool verbose;
} fwriteMainArgs;

Expand Down
4 changes: 4 additions & 0 deletions src/fwriteR.c
Original file line number Diff line number Diff line change
Expand Up @@ -158,12 +158,16 @@ SEXP fwriteR(
SEXP nThread_Arg,
SEXP showProgress_Arg,
SEXP is_gzip_Arg,
SEXP bom_Arg,
SEXP yaml_Arg,
SEXP verbose_Arg
)
{
if (!isNewList(DF)) error("fwrite must be passed an object of type list; e.g. data.frame, data.table");
fwriteMainArgs args;
args.is_gzip = LOGICAL(is_gzip_Arg)[0];
args.bom = LOGICAL(bom_Arg)[0];
args.yaml = CHAR(STRING_ELT(yaml_Arg, 0));
args.verbose = LOGICAL(verbose_Arg)[0];
args.filename = CHAR(STRING_ELT(filename_Arg, 0));
args.ncol = length(DF);
Expand Down