diff --git a/R/fwrite.R b/R/fwrite.R index 9f918c46da..c775510935 100644 --- a/R/fwrite.R +++ b/R/fwrite.R @@ -7,10 +7,13 @@ fwrite <- function(x, file="", append=FALSE, quote="auto", dateTimeAs = c("ISO","squash","epoch","write.csv"), buffMB=8, nThread=getDTthreads(verbose), showProgress=getOption("datatable.showProgress", interactive()), - verbose=getOption("datatable.verbose", FALSE)) { + compress = c("auto", "none", "gzip"), + verbose=getOption("datatable.verbose", FALSE) + ) { isLOGICAL = function(x) isTRUE(x) || identical(FALSE, x) # it seems there is no isFALSE in R? na = as.character(na[1L]) # fix for #1725 if (missing(qmethod)) qmethod = qmethod[1L] + if (missing(compress)) compress = compress[1L] if (missing(dateTimeAs)) { dateTimeAs = dateTimeAs[1L] } else if (length(dateTimeAs)>1L) stop("dateTimeAs must be a single string") dateTimeAs = chmatch(dateTimeAs, c("ISO","squash","epoch","write.csv"))-1L @@ -38,6 +41,7 @@ fwrite <- function(x, file="", append=FALSE, quote="auto", dec != sep, # sep2!=dec and sep2!=sep checked at C level when we know if list columns are present is.character(eol) && length(eol)==1L, length(qmethod) == 1L && qmethod %chin% c("double", "escape"), + length(compress) == 1L && compress %chin% c("auto", "none", "gzip"), isLOGICAL(col.names), isLOGICAL(append), isLOGICAL(row.names), isLOGICAL(verbose), isLOGICAL(showProgress), isLOGICAL(logical01), length(na) == 1L, #1725, handles NULL or character(0) input @@ -45,6 +49,9 @@ fwrite <- function(x, file="", append=FALSE, quote="auto", length(buffMB)==1L && !is.na(buffMB) && 1L<=buffMB && buffMB<=1024, length(nThread)==1L && !is.na(nThread) && nThread>=1L ) + + is_gzip <- compress == "gzip" || (compress == "auto" && grepl("\\.gz$", file)) + file <- path.expand(file) # "~/foo/bar" if (append && missing(col.names) && (file=="" || file.exists(file))) col.names = FALSE # test 1658.16 checks this @@ -71,7 +78,6 @@ fwrite <- function(x, file="", append=FALSE, quote="auto", file <- 
enc2native(file) # CfwriteR cannot handle UTF-8 if that is not the native encoding, see #3078. .Call(CfwriteR, x, file, sep, sep2, eol, na, dec, quote, qmethod=="escape", append, row.names, col.names, logical01, dateTimeAs, buffMB, nThread, - showProgress, verbose) + showProgress, is_gzip, verbose) invisible() } - diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 0e7a318d11..5b9ac5f34d 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -9349,6 +9349,27 @@ test(1658.34, fwrite(matrix(1:4, nrow=2, ncol=2), quote = TRUE), output = '"V1", test(1658.35, fwrite(matrix(1:3, nrow=3, ncol=1), quote = TRUE), output = '"V1"\n.*1\n2\n3', message = "x being coerced from class: matrix to data.table") test(1658.36, fwrite(matrix(1:4, nrow=2, ncol=2, dimnames = list(c("ra","rb"),c("ca","cb"))), quote = TRUE), output = '"ca","cb"\n.*1,3\n2,4', message = "x being coerced from class: matrix to data.table") +# fwrite output to console ignore compress +test(1658.37, fwrite(data.table(a=c(1:3), b=c(1:3)), compress="gzip"), + output='a,b\n1,1\n2,2\n3,3') + +# fwrite force gzipped output +if (.Platform$OS.type=="unix") { + f <- tempfile() + fwrite(data.table(a=c(1:3), b=c(1:3)), file=f, compress="gzip") + test(1658.38, system(paste("zcat", f), intern=T), output='[1] "a,b" "1,1" "2,2" "3,3"') + unlink(f) +} + + +# fwrite force csv output +if (.Platform$OS.type=="unix") { + f <- tempfile() + fwrite(data.table(a=c(1:3), b=c(1:3)), file=f, compress="none") + test(1658.39, system(paste("cat", f), intern=T), output='[1] "a,b" "1,1" "2,2" "3,3"') + unlink(f) +} + ## End fwrite tests # tests for #679, inrange(), FR #707 diff --git a/man/fwrite.Rd b/man/fwrite.Rd index 8baf0d2c78..59519281a7 100644 --- a/man/fwrite.Rd +++ b/man/fwrite.Rd @@ -17,6 +17,7 @@ fwrite(x, file = "", append = FALSE, quote = "auto", dateTimeAs = c("ISO","squash","epoch","write.csv"), buffMB = 8L, nThread = getDTthreads(verbose), showProgress = getOption("datatable.showProgress", 
interactive()), + compress = c("auto", "none", "gzip"), verbose = getOption("datatable.verbose", FALSE)) } \arguments{ @@ -52,6 +53,7 @@ fwrite(x, file = "", append = FALSE, quote = "auto", \item{buffMB}{The buffer size (MB) per thread in the range 1 to 1024, default 8MB. Experiment to see what works best for your data on your hardware.} \item{nThread}{The number of threads to use. Experiment to see what works best for your data on your hardware.} \item{showProgress}{ Display a progress meter on the console? Ignored when \code{file==""}. } + \item{compress}{If \code{compress = "auto"} and if \code{file} ends in \code{.gz} then output format is gzipped csv else csv. If \code{compress = "none"}, output format is always csv. If \code{compress = "gzip"} then format is gzipped csv. Output to the console is never gzipped even if \code{compress = "gzip"}. By default, \code{compress = "auto"}.} \item{verbose}{Be chatty and report timings?} } \details{ diff --git a/src/fwrite.c b/src/fwrite.c index d6a01c1c30..cd3e45a450 100644 --- a/src/fwrite.c +++ b/src/fwrite.c @@ -7,6 +7,7 @@ #include // isfinite, isnan #include // abs #include // strlen, strerror + #ifdef WIN32 #include #include @@ -17,6 +18,8 @@ #define WRITE write #define CLOSE close #endif + +#include "zlib.h" // for writing gzip file #include "myomp.h" #include "fwrite.h" @@ -643,11 +646,14 @@ void fwriteMain(fwriteMainArgs args) maxLineLen += eolLen; if (args.verbose) DTPRINT("maxLineLen=%d from sample. Found in %.3fs\n", maxLineLen, 1.0*(wallclock()-t0)); - int f; + int f=0; + gzFile zf=NULL; + int err; if (*args.filename=='\0') { f=-1; // file="" means write to standard output + args.is_gzip = false; // gzip is only for file // eol = "\n"; // We'll use DTPRINT which converts \n to \r\n inside it on Windows - } else { + } else if (!args.is_gzip) { #ifdef WIN32 f = _open(args.filename, _O_WRONLY | _O_BINARY | _O_CREAT | (args.append ? 
_O_APPEND : _O_TRUNC), _S_IWRITE); // O_BINARY rather than O_TEXT for explicit control and speed since it seems that write() has a branch inside it @@ -663,7 +669,22 @@ void fwriteMain(fwriteMainArgs args) "%s: '%s'. Unable to create new file for writing (it does not exist already). Do you have permission to write here, is there space on the disk and does the path exist?", strerror(erropen), args.filename); } + } else { + zf = gzopen(args.filename, "wb"); + if (zf == NULL) { + int erropen = errno; + STOP(access( args.filename, F_OK ) != -1 ? + "%s: '%s'. Failed to open existing file for writing. Do you have write permission to it? Is this Windows and does another process such as Excel have it open?" : + "%s: '%s'. Unable to create new file for writing (it does not exist already). Do you have permission to write here, is there space on the disk and does the path exist?", + strerror(erropen), args.filename); + } + // alloc gzip buffer : buff + 10% + 16 + size_t buffzSize = (size_t)(1024*1024*buffMB + 1024*1024*buffMB / 10 + 16); + if (gzbuffer(zf, buffzSize)) { + STOP("Error allocate buffer for gzip file"); + } } + t0=wallclock(); if (args.verbose) { @@ -683,32 +704,50 @@ void fwriteMain(fwriteMainArgs args) } for (int j=0; j 1 million bytes long *ch++ = args.sep; // this sep after the last column name won't be written to the file } if (f==-1) { DTPRINT(args.eol); - } else if (WRITE(f, args.eol, eolLen)==-1) { + } else if (!args.is_gzip && WRITE(f, args.eol, eolLen)==-1) { int errwrite=errno; - close(f); + CLOSE(f); free(buff); STOP("%s: '%s'", strerror(errwrite), args.filename); + } else if (args.is_gzip && (!gzwrite(zf, args.eol, eolLen))) { + int errwrite=gzclose(zf); + free(buff); + STOP("Error gzwrite %d: %s", errwrite, args.filename); } + } free(buff); // TODO: also to be free'd in cleanup when there's an error opening file above if (args.verbose) DTPRINT("done in %.3fs\n", 1.0*(wallclock()-t0)); if (args.nrow == 0) { if (args.verbose) DTPRINT("No data rows 
present (nrow==0)\n"); - if (f!=-1 && CLOSE(f)) STOP("%s: '%s'", strerror(errno), args.filename); + if (args.is_gzip) { + if ( (err = gzclose(zf)) ) STOP("gzclose error %d: '%s'", err, args.filename); + } else { + if (f!=-1 && CLOSE(f)) STOP("%s: '%s'", strerror(errno), args.filename); + } return; } @@ -815,8 +854,10 @@ void fwriteMain(fwriteMainArgs args) // by slave threads, even when one-at-a-time. Anyway, made this single-threaded when output to console // to be safe (setDTthreads(1) in fwrite.R) since output to console doesn't need to be fast. } else { - if (WRITE(f, myBuff, (int)(ch-myBuff)) == -1) { + if (!args.is_gzip && WRITE(f, myBuff, (int)(ch-myBuff)) == -1) { failed=errno; + } else if (args.is_gzip && (!gzwrite(zf, myBuff, (int)(ch-myBuff)))) { + gzerror(zf, &failed); } if (myAlloc > buffSize) anyBufferGrown = true; int used = 100*((double)(ch-myBuff))/buffSize; // percentage of original buffMB @@ -873,8 +914,15 @@ void fwriteMain(fwriteMainArgs args) DTPRINT("\n"); } } - if (f!=-1 && CLOSE(f) && !failed) - STOP("%s: '%s'", strerror(errno), args.filename); + + if (!args.is_gzip) { + if (f!=-1 && CLOSE(f) && !failed) + STOP("%s: '%s'", strerror(errno), args.filename); + } else { + if ( (err=gzclose(zf)) ) { + STOP("gzclose error %d: '%s'", err, args.filename); + } + } // quoted '%s' in case of trailing spaces in the filename // If a write failed, the line above tries close() to clean up, but that might fail as well. So the // '&& !failed' is to not report the error as just 'closing file' but the next line for more detail diff --git a/src/fwrite.h b/src/fwrite.h index 2a6933b785..3bc2942c10 100644 --- a/src/fwrite.h +++ b/src/fwrite.h @@ -32,14 +32,10 @@ typedef struct fwriteMainArgs // contains non-ASCII characters, it should be UTF-8 encoded (however fread // will not validate the encoding). 
const char *filename; - int ncol; - int64_t nrow; - // a vector of pointers to all-same-length column vectors void **columns; - writer_fun_t *funs; // a vector of writer_fun_t function pointers // length ncol vector containing which fun[] to use for each column @@ -48,19 +44,12 @@ typedef struct fwriteMainArgs uint8_t *whichFun; void *colNames; // NULL means no header, otherwise ncol strings - bool doRowNames; // optional, likely false - void *rowNames; // if doRowNames is true and rowNames is not NULL then they're used, otherwise row numbers are output. - char sep; - char sep2; - char dec; - const char *eol; - const char *na; // The quote character is always " (ascii 34) and cannot be changed since nobody on Earth uses a different quoting character, surely @@ -69,19 +58,13 @@ typedef struct fwriteMainArgs int8_t doQuote; bool qmethodEscape; // true means escape quotes using backslash, else double-up double quotes. - bool squashDateTime; - bool append; - int buffMB; // [1-1024] default 8MB - int nth; - bool showProgress; - bool verbose; - + bool is_gzip; } fwriteMainArgs; void fwriteMain(fwriteMainArgs args); diff --git a/src/fwriteR.c b/src/fwriteR.c index e3affcc3dc..dcea9fffdc 100644 --- a/src/fwriteR.c +++ b/src/fwriteR.c @@ -1,4 +1,3 @@ - #include #include "data.table.h" #include "fwrite.h" @@ -128,10 +127,13 @@ SEXP fwriteR( SEXP buffMB_Arg, // [1-1024] default 8MB SEXP nThread_Arg, SEXP showProgress_Arg, - SEXP verbose_Arg) + SEXP is_gzip_Arg, + SEXP verbose_Arg + ) { if (!isNewList(DF)) error("fwrite must be passed an object of type list; e.g. data.frame, data.table"); fwriteMainArgs args; + args.is_gzip = LOGICAL(is_gzip_Arg)[0]; args.verbose = LOGICAL(verbose_Arg)[0]; args.filename = CHAR(STRING_ELT(filename_Arg, 0)); args.ncol = length(DF);