From aa848a7f1240a00292bf502535f2456f63422650 Mon Sep 17 00:00:00 2001 From: Philippe Chataignon Date: Sat, 12 Jan 2019 15:04:02 +0100 Subject: [PATCH 01/62] Add gzip support to fwrite Use zlib and gzopen/gzwrite/gzclose function to write buffer directly in a gzipped csv file. zlib is thread-safe and the gzip compression uses the fwrite threads. Option compress="gzip" is added to fwrite and is automatically set when file ends with ".gz" --- R/fwrite.R | 11 +++++-- src/fwrite.c | 82 ++++++++++++++++++++++++++++++++++++++++----------- src/fwrite.h | 19 +----------- src/fwriteR.c | 6 ++-- 4 files changed, 79 insertions(+), 39 deletions(-) diff --git a/R/fwrite.R b/R/fwrite.R index 9f918c46da..a2d0776ef4 100644 --- a/R/fwrite.R +++ b/R/fwrite.R @@ -7,10 +7,13 @@ fwrite <- function(x, file="", append=FALSE, quote="auto", dateTimeAs = c("ISO","squash","epoch","write.csv"), buffMB=8, nThread=getDTthreads(verbose), showProgress=getOption("datatable.showProgress", interactive()), - verbose=getOption("datatable.verbose", FALSE)) { + compress = c("none", "gzip"), + verbose=getOption("datatable.verbose", FALSE) + ) { isLOGICAL = function(x) isTRUE(x) || identical(FALSE, x) # it seems there is no isFALSE in R? 
na = as.character(na[1L]) # fix for #1725 if (missing(qmethod)) qmethod = qmethod[1L] + if (missing(compress)) compress = compress[1L] if (missing(dateTimeAs)) { dateTimeAs = dateTimeAs[1L] } else if (length(dateTimeAs)>1L) stop("dateTimeAs must be a single string") dateTimeAs = chmatch(dateTimeAs, c("ISO","squash","epoch","write.csv"))-1L @@ -38,6 +41,7 @@ fwrite <- function(x, file="", append=FALSE, quote="auto", dec != sep, # sep2!=dec and sep2!=sep checked at C level when we know if list columns are present is.character(eol) && length(eol)==1L, length(qmethod) == 1L && qmethod %chin% c("double", "escape"), + length(compress) == 1L && compress %chin% c("none", "gzip"), isLOGICAL(col.names), isLOGICAL(append), isLOGICAL(row.names), isLOGICAL(verbose), isLOGICAL(showProgress), isLOGICAL(logical01), length(na) == 1L, #1725, handles NULL or character(0) input @@ -45,6 +49,9 @@ fwrite <- function(x, file="", append=FALSE, quote="auto", length(buffMB)==1L && !is.na(buffMB) && 1L<=buffMB && buffMB<=1024, length(nThread)==1L && !is.na(nThread) && nThread>=1L ) + + is_gzip <- compress == "gzip" || grepl("\\.gz$", file) + file <- path.expand(file) # "~/foo/bar" if (append && missing(col.names) && (file=="" || file.exists(file))) col.names = FALSE # test 1658.16 checks this @@ -71,7 +78,7 @@ fwrite <- function(x, file="", append=FALSE, quote="auto", file <- enc2native(file) # CfwriteR cannot handle UTF-8 if that is not the native encoding, see #3078. 
.Call(CfwriteR, x, file, sep, sep2, eol, na, dec, quote, qmethod=="escape", append, row.names, col.names, logical01, dateTimeAs, buffMB, nThread, - showProgress, verbose) + showProgress, is_gzip, verbose) invisible() } diff --git a/src/fwrite.c b/src/fwrite.c index d6a01c1c30..c42297e512 100644 --- a/src/fwrite.c +++ b/src/fwrite.c @@ -7,6 +7,7 @@ #include // isfinite, isnan #include // abs #include // strlen, strerror + #ifdef WIN32 #include #include @@ -17,6 +18,8 @@ #define WRITE write #define CLOSE close #endif + +#include "zlib.h" // for writing gzip file #include "myomp.h" #include "fwrite.h" @@ -643,11 +646,14 @@ void fwriteMain(fwriteMainArgs args) maxLineLen += eolLen; if (args.verbose) DTPRINT("maxLineLen=%d from sample. Found in %.3fs\n", maxLineLen, 1.0*(wallclock()-t0)); - int f; + int f=0; + gzFile zf=NULL; + int err; if (*args.filename=='\0') { f=-1; // file="" means write to standard output + args.is_gzip = false; // gzip is only for file // eol = "\n"; // We'll use DTPRINT which converts \n to \r\n inside it on Windows - } else { + } else if (!args.is_gzip) { #ifdef WIN32 f = _open(args.filename, _O_WRONLY | _O_BINARY | _O_CREAT | (args.append ? _O_APPEND : _O_TRUNC), _S_IWRITE); // O_BINARY rather than O_TEXT for explicit control and speed since it seems that write() has a branch inside it @@ -655,7 +661,6 @@ void fwriteMain(fwriteMainArgs args) #else f = open(args.filename, O_WRONLY | O_CREAT | (args.append ? O_APPEND : O_TRUNC), 0666); // There is no binary/text mode distinction on Linux and Mac -#endif if (f == -1) { int erropen = errno; STOP(access( args.filename, F_OK ) != -1 ? @@ -663,7 +668,23 @@ void fwriteMain(fwriteMainArgs args) "%s: '%s'. Unable to create new file for writing (it does not exist already). 
Do you have permission to write here, is there space on the disk and does the path exist?", strerror(erropen), args.filename); } + } else { +#endif + zf = gzopen(args.filename, "wb"); + if (zf == NULL) { + int erropen = errno; + STOP(access( args.filename, F_OK ) != -1 ? + "%s: '%s'. Failed to open existing file for writing. Do you have write permission to it? Is this Windows and does another process such as Excel have it open?" : + "%s: '%s'. Unable to create new file for writing (it does not exist already). Do you have permission to write here, is there space on the disk and does the path exist?", + strerror(erropen), args.filename); + } + // alloc gzip buffer : buff + 10% + 16 + size_t buffzSize = (size_t)(1024*1024*buffMB + 1024*1024*buffMB / 10 + 16); + if (gzbuffer(zf, buffzSize)) { + STOP("Error allocate buffer for gzip file"); + } } + t0=wallclock(); if (args.verbose) { @@ -683,32 +704,50 @@ void fwriteMain(fwriteMainArgs args) } for (int j=0; j 1 million bytes long *ch++ = args.sep; // this sep after the last column name won't be written to the file } if (f==-1) { DTPRINT(args.eol); - } else if (WRITE(f, args.eol, eolLen)==-1) { + } else if (!args.is_gzip && WRITE(f, args.eol, eolLen)==-1) { int errwrite=errno; - close(f); + CLOSE(f); free(buff); STOP("%s: '%s'", strerror(errwrite), args.filename); + } else if (args.is_gzip && (!gzwrite(zf, args.eol, eolLen))) { + int errwrite=gzclose(zf); + free(buff); + STOP("Error gzwrite %d: %s", errwrite, args.filename); } + } free(buff); // TODO: also to be free'd in cleanup when there's an error opening file above if (args.verbose) DTPRINT("done in %.3fs\n", 1.0*(wallclock()-t0)); if (args.nrow == 0) { if (args.verbose) DTPRINT("No data rows present (nrow==0)\n"); - if (f!=-1 && CLOSE(f)) STOP("%s: '%s'", strerror(errno), args.filename); + if (args.is_gzip) { + if ( (err = gzclose(zf)) ) STOP("gzclose error %d: '%s'", err, args.filename); + } else { + if (f!=-1 && CLOSE(f)) STOP("%s: '%s'", strerror(errno), 
args.filename); + } return; } @@ -815,8 +854,10 @@ void fwriteMain(fwriteMainArgs args) // by slave threads, even when one-at-a-time. Anyway, made this single-threaded when output to console // to be safe (setDTthreads(1) in fwrite.R) since output to console doesn't need to be fast. } else { - if (WRITE(f, myBuff, (int)(ch-myBuff)) == -1) { + if (!args.is_gzip && WRITE(f, myBuff, (int)(ch-myBuff)) == -1) { failed=errno; + } else if (args.is_gzip && (!gzwrite(zf, myBuff, (int)(ch-myBuff)))) { + gzerror(zf, &failed); } if (myAlloc > buffSize) anyBufferGrown = true; int used = 100*((double)(ch-myBuff))/buffSize; // percentage of original buffMB @@ -873,8 +914,15 @@ void fwriteMain(fwriteMainArgs args) DTPRINT("\n"); } } - if (f!=-1 && CLOSE(f) && !failed) - STOP("%s: '%s'", strerror(errno), args.filename); + + if (!args.is_gzip) { + if (f!=-1 && CLOSE(f) && !failed) + STOP("%s: '%s'", strerror(errno), args.filename); + } else { + if ( (err=gzclose(zf)) ) { + STOP("gzclose error %d: '%s'", err, args.filename); + } + } // quoted '%s' in case of trailing spaces in the filename // If a write failed, the line above tries close() to clean up, but that might fail as well. So the // '&& !failed' is to not report the error as just 'closing file' but the next line for more detail diff --git a/src/fwrite.h b/src/fwrite.h index 2a6933b785..3bc2942c10 100644 --- a/src/fwrite.h +++ b/src/fwrite.h @@ -32,14 +32,10 @@ typedef struct fwriteMainArgs // contains non-ASCII characters, it should be UTF-8 encoded (however fread // will not validate the encoding). 
const char *filename; - int ncol; - int64_t nrow; - // a vector of pointers to all-same-length column vectors void **columns; - writer_fun_t *funs; // a vector of writer_fun_t function pointers // length ncol vector containing which fun[] to use for each column @@ -48,19 +44,12 @@ typedef struct fwriteMainArgs uint8_t *whichFun; void *colNames; // NULL means no header, otherwise ncol strings - bool doRowNames; // optional, likely false - void *rowNames; // if doRowNames is true and rowNames is not NULL then they're used, otherwise row numbers are output. - char sep; - char sep2; - char dec; - const char *eol; - const char *na; // The quote character is always " (ascii 34) and cannot be changed since nobody on Earth uses a different quoting character, surely @@ -69,19 +58,13 @@ typedef struct fwriteMainArgs int8_t doQuote; bool qmethodEscape; // true means escape quotes using backslash, else double-up double quotes. - bool squashDateTime; - bool append; - int buffMB; // [1-1024] default 8MB - int nth; - bool showProgress; - bool verbose; - + bool is_gzip; } fwriteMainArgs; void fwriteMain(fwriteMainArgs args); diff --git a/src/fwriteR.c b/src/fwriteR.c index e3affcc3dc..dcea9fffdc 100644 --- a/src/fwriteR.c +++ b/src/fwriteR.c @@ -1,4 +1,3 @@ - #include #include "data.table.h" #include "fwrite.h" @@ -128,10 +127,13 @@ SEXP fwriteR( SEXP buffMB_Arg, // [1-1024] default 8MB SEXP nThread_Arg, SEXP showProgress_Arg, - SEXP verbose_Arg) + SEXP is_gzip_Arg, + SEXP verbose_Arg + ) { if (!isNewList(DF)) error("fwrite must be passed an object of type list; e.g. 
data.frame, data.table"); fwriteMainArgs args; + args.is_gzip = LOGICAL(is_gzip_Arg)[0]; args.verbose = LOGICAL(verbose_Arg)[0]; args.filename = CHAR(STRING_ELT(filename_Arg, 0)); args.ncol = length(DF); From 92b9ca47f00b9770fd4e7515264dc44f36fc0241 Mon Sep 17 00:00:00 2001 From: Philippe Chataignon Date: Sat, 12 Jan 2019 19:01:23 +0100 Subject: [PATCH 02/62] Add compress= option in fwrite documentation --- man/fwrite.Rd | 2 ++ 1 file changed, 2 insertions(+) diff --git a/man/fwrite.Rd b/man/fwrite.Rd index 8baf0d2c78..ec2322490e 100644 --- a/man/fwrite.Rd +++ b/man/fwrite.Rd @@ -17,6 +17,7 @@ fwrite(x, file = "", append = FALSE, quote = "auto", dateTimeAs = c("ISO","squash","epoch","write.csv"), buffMB = 8L, nThread = getDTthreads(verbose), showProgress = getOption("datatable.showProgress", interactive()), + compress = c("none", "gzip"), verbose = getOption("datatable.verbose", FALSE)) } \arguments{ @@ -52,6 +53,7 @@ fwrite(x, file = "", append = FALSE, quote = "auto", \item{buffMB}{The buffer size (MB) per thread in the range 1 to 1024, default 8MB. Experiment to see what works best for your data on your hardware.} \item{nThread}{The number of threads to use. Experiment to see what works best for your data on your hardware.} \item{showProgress}{ Display a progress meter on the console? Ignored when \code{file==""}. } + \item{compress}{If compress = \code{"gzip"} or if \code{file} ends in \code{.gz}, even if compress = \code{"none"}, then the output format is gzipped csv. Output to console is never gzipped even if compress = \code{"gzip"}. 
By default, compress = \code{"none"}.} \item{verbose}{Be chatty and report timings?} } \details{ From a2d6969cb871e3cf136acfc6201b24f509054b81 Mon Sep 17 00:00:00 2001 From: Philippe Chataignon Date: Sat, 12 Jan 2019 22:31:46 +0100 Subject: [PATCH 03/62] Add tests for fwrite with compress="gzip" option --- inst/tests/tests.Rraw | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 0e7a318d11..3d2164d8b4 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -9266,13 +9266,14 @@ test(1658.8, fwrite(data.table(a=c(1:5), b=c(1:5)), quote=TRUE), test(1658.9, fwrite(data.table(a=c(1:3), b=c(1:3)), quote=TRUE), output='"a","b"\n1,1\n2,2\n3,3') -# block size one bigger than number of rows -test(1658.11, fwrite(data.table(a=c(1:3), b=c(1:3)), quote=TRUE), - output='"a","b"\n1,1\n2,2\n3,3') +# fwrite output to console ignore compress +test(1658.11, fwrite(data.table(a=c(1:3), b=c(1:3)), compress="gzip"), + output='a,b\n1,1\n2,2\n3,3') -# block size one less than number of rows -test(1658.12, fwrite(data.table(a=c(1:3), b=c(1:3)), quote=TRUE), - output='"a","b"\n1,1\n2,2\n3,3') +# fwrite gzipped output +f <- tempfile() +test(1658.12, fwrite(data.table(a=c(1:3), b=c(1:3)), compress="gzip"), + output='a,b\n1,1\n2,2\n3,3') # writing a data.frame test(1658.13, fwrite(data.frame(a="foo", b="bar"), quote=TRUE), From 6010131d8a2957f9397665d991d01f0e1f61d630 Mon Sep 17 00:00:00 2001 From: Philippe Chataignon Date: Sat, 12 Jan 2019 23:38:41 +0100 Subject: [PATCH 04/62] Rewrite test 1658.12 --- inst/tests/tests.Rraw | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 3d2164d8b4..10ded2d750 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -9271,9 +9271,12 @@ test(1658.11, fwrite(data.table(a=c(1:3), b=c(1:3)), compress="gzip"), output='a,b\n1,1\n2,2\n3,3') # fwrite gzipped output -f <- tempfile() 
-test(1658.12, fwrite(data.table(a=c(1:3), b=c(1:3)), compress="gzip"), - output='a,b\n1,1\n2,2\n3,3') +if (.Platform$OS.type=="unix") { + f <- tempfile() + fwrite(data.table(a=c(1:3), b=c(1:3)), file=f, compress="gzip") + test(1658.12, system(paste("zcat", f), intern=T), output='[1] "a,b" "1,1" "2,2" "3,3"') + unlink(f) +} # writing a data.frame test(1658.13, fwrite(data.frame(a="foo", b="bar"), quote=TRUE), From 9f16e31ad5c64fe4fed729c393808a7cfbdef041 Mon Sep 17 00:00:00 2001 From: Philippe Chataignon Date: Sun, 13 Jan 2019 10:19:41 +0100 Subject: [PATCH 05/62] Add default option in compress In fwrite, compress has now 3 options : * default : gzip if file ends with .gz, else csv * none : force csv * gzip : force gzip --- R/fwrite.R | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/R/fwrite.R b/R/fwrite.R index a2d0776ef4..59ecb8f5f7 100644 --- a/R/fwrite.R +++ b/R/fwrite.R @@ -7,7 +7,7 @@ fwrite <- function(x, file="", append=FALSE, quote="auto", dateTimeAs = c("ISO","squash","epoch","write.csv"), buffMB=8, nThread=getDTthreads(verbose), showProgress=getOption("datatable.showProgress", interactive()), - compress = c("none", "gzip"), + compress = c("default", "none", "gzip"), verbose=getOption("datatable.verbose", FALSE) ) { isLOGICAL = function(x) isTRUE(x) || identical(FALSE, x) # it seems there is no isFALSE in R? 
@@ -41,7 +41,7 @@ fwrite <- function(x, file="", append=FALSE, quote="auto", dec != sep, # sep2!=dec and sep2!=sep checked at C level when we know if list columns are present is.character(eol) && length(eol)==1L, length(qmethod) == 1L && qmethod %chin% c("double", "escape"), - length(compress) == 1L && compress %chin% c("none", "gzip"), + length(compress) == 1L && compress %chin% c("default", "none", "gzip"), isLOGICAL(col.names), isLOGICAL(append), isLOGICAL(row.names), isLOGICAL(verbose), isLOGICAL(showProgress), isLOGICAL(logical01), length(na) == 1L, #1725, handles NULL or character(0) input @@ -50,7 +50,7 @@ fwrite <- function(x, file="", append=FALSE, quote="auto", length(nThread)==1L && !is.na(nThread) && nThread>=1L ) - is_gzip <- compress == "gzip" || grepl("\\.gz$", file) + is_gzip <- compress == "gzip" || (compress == "default" && grepl("\\.gz$", file)) file <- path.expand(file) # "~/foo/bar" if (append && missing(col.names) && (file=="" || file.exists(file))) @@ -81,4 +81,3 @@ fwrite <- function(x, file="", append=FALSE, quote="auto", showProgress, is_gzip, verbose) invisible() } - From 1ca34548076d4f5f5fd0876aba0ca2971f0cbdca Mon Sep 17 00:00:00 2001 From: Philippe Chataignon Date: Sun, 13 Jan 2019 10:39:44 +0100 Subject: [PATCH 06/62] Adapt fwrite compress option documentation --- man/fwrite.Rd | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/man/fwrite.Rd b/man/fwrite.Rd index ec2322490e..853e490656 100644 --- a/man/fwrite.Rd +++ b/man/fwrite.Rd @@ -17,7 +17,7 @@ fwrite(x, file = "", append = FALSE, quote = "auto", dateTimeAs = c("ISO","squash","epoch","write.csv"), buffMB = 8L, nThread = getDTthreads(verbose), showProgress = getOption("datatable.showProgress", interactive()), - compress = c("none", "gzip"), + compress = c("default", "none", "gzip"), verbose = getOption("datatable.verbose", FALSE)) } \arguments{ @@ -53,7 +53,7 @@ fwrite(x, file = "", append = FALSE, quote = "auto", \item{buffMB}{The buffer size (MB) per thread in 
the range 1 to 1024, default 8MB. Experiment to see what works best for your data on your hardware.} \item{nThread}{The number of threads to use. Experiment to see what works best for your data on your hardware.} \item{showProgress}{ Display a progress meter on the console? Ignored when \code{file==""}. } - \item{compress}{If compress = \code{"gzip"} or if \code{file} ends in \code{.gz}, even if compress = \code{"none"}, then the output format is gzipped csv. Output to console is never gzipped even if compress = \code{"gzip"}. By default, compress = \code{"none"}.} + \item{compress}{If \code{compress = "default"} and if \code{file} ends in \code{.gz} then output format is gzipped csv else csv. If \code{compress = "none"}, output format is always csv. If \code{compress = "gzip"} then format is gzipped csv. Output to the console is never gzipped even if \code{compress = "gzip"}. By default, \code{compress = "default"}.} \item{verbose}{Be chatty and report timings?} } \details{ From 70fdf8bd46b035075cb42b8a13c0123efa96fc7d Mon Sep 17 00:00:00 2001 From: Philippe Chataignon Date: Sun, 13 Jan 2019 11:37:13 +0100 Subject: [PATCH 07/62] Replace 'default' by 'auto' in fwrite compress option --- R/fwrite.R | 6 +++--- man/fwrite.Rd | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/R/fwrite.R b/R/fwrite.R index 59ecb8f5f7..c775510935 100644 --- a/R/fwrite.R +++ b/R/fwrite.R @@ -7,7 +7,7 @@ fwrite <- function(x, file="", append=FALSE, quote="auto", dateTimeAs = c("ISO","squash","epoch","write.csv"), buffMB=8, nThread=getDTthreads(verbose), showProgress=getOption("datatable.showProgress", interactive()), - compress = c("default", "none", "gzip"), + compress = c("auto", "none", "gzip"), verbose=getOption("datatable.verbose", FALSE) ) { isLOGICAL = function(x) isTRUE(x) || identical(FALSE, x) # it seems there is no isFALSE in R? 
@@ -41,7 +41,7 @@ fwrite <- function(x, file="", append=FALSE, quote="auto", dec != sep, # sep2!=dec and sep2!=sep checked at C level when we know if list columns are present is.character(eol) && length(eol)==1L, length(qmethod) == 1L && qmethod %chin% c("double", "escape"), - length(compress) == 1L && compress %chin% c("default", "none", "gzip"), + length(compress) == 1L && compress %chin% c("auto", "none", "gzip"), isLOGICAL(col.names), isLOGICAL(append), isLOGICAL(row.names), isLOGICAL(verbose), isLOGICAL(showProgress), isLOGICAL(logical01), length(na) == 1L, #1725, handles NULL or character(0) input @@ -50,7 +50,7 @@ fwrite <- function(x, file="", append=FALSE, quote="auto", length(nThread)==1L && !is.na(nThread) && nThread>=1L ) - is_gzip <- compress == "gzip" || (compress == "default" && grepl("\\.gz$", file)) + is_gzip <- compress == "gzip" || (compress == "auto" && grepl("\\.gz$", file)) file <- path.expand(file) # "~/foo/bar" if (append && missing(col.names) && (file=="" || file.exists(file))) diff --git a/man/fwrite.Rd b/man/fwrite.Rd index 853e490656..59519281a7 100644 --- a/man/fwrite.Rd +++ b/man/fwrite.Rd @@ -53,7 +53,7 @@ fwrite(x, file = "", append = FALSE, quote = "auto", \item{buffMB}{The buffer size (MB) per thread in the range 1 to 1024, default 8MB. Experiment to see what works best for your data on your hardware.} \item{nThread}{The number of threads to use. Experiment to see what works best for your data on your hardware.} \item{showProgress}{ Display a progress meter on the console? Ignored when \code{file==""}. } - \item{compress}{If \code{compress = "default"} and if \code{file} ends in \code{.gz} then output format is gzipped csv else csv. If \code{compress = "none"}, output format is always csv. If \code{compress = "gzip"} then format is gzipped csv. Output to the console is never gzipped even if \code{compress = "gzip"}. 
By default, \code{compress = "default"}.} + \item{compress}{If \code{compress = "auto"} and if \code{file} ends in \code{.gz} then output format is gzipped csv else csv. If \code{compress = "none"}, output format is always csv. If \code{compress = "gzip"} then format is gzipped csv. Output to the console is never gzipped even if \code{compress = "gzip"}. By default, \code{compress = "auto"}.} \item{verbose}{Be chatty and report timings?} } \details{ From a279daef44835ae0057bfe5aa6a4ff3526871b53 Mon Sep 17 00:00:00 2001 From: Philippe Chataignon Date: Sun, 13 Jan 2019 11:47:08 +0100 Subject: [PATCH 08/62] Tests for gzip compression in fwrite and restore tests 1658.11,12 --- inst/tests/tests.Rraw | 37 +++++++++++++++++++++++++++---------- 1 file changed, 27 insertions(+), 10 deletions(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 10ded2d750..5b9ac5f34d 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -9266,17 +9266,13 @@ test(1658.8, fwrite(data.table(a=c(1:5), b=c(1:5)), quote=TRUE), test(1658.9, fwrite(data.table(a=c(1:3), b=c(1:3)), quote=TRUE), output='"a","b"\n1,1\n2,2\n3,3') -# fwrite output to console ignore compress -test(1658.11, fwrite(data.table(a=c(1:3), b=c(1:3)), compress="gzip"), - output='a,b\n1,1\n2,2\n3,3') +# block size one bigger than number of rows +test(1658.11, fwrite(data.table(a=c(1:3), b=c(1:3)), quote=TRUE), + output='"a","b"\n1,1\n2,2\n3,3') -# fwrite gzipped output -if (.Platform$OS.type=="unix") { - f <- tempfile() - fwrite(data.table(a=c(1:3), b=c(1:3)), file=f, compress="gzip") - test(1658.12, system(paste("zcat", f), intern=T), output='[1] "a,b" "1,1" "2,2" "3,3"') - unlink(f) -} +# block size one less than number of rows +test(1658.12, fwrite(data.table(a=c(1:3), b=c(1:3)), quote=TRUE), + output='"a","b"\n1,1\n2,2\n3,3') # writing a data.frame test(1658.13, fwrite(data.frame(a="foo", b="bar"), quote=TRUE), @@ -9353,6 +9349,27 @@ test(1658.34, fwrite(matrix(1:4, nrow=2, ncol=2), quote = TRUE), 
output = '"V1", test(1658.35, fwrite(matrix(1:3, nrow=3, ncol=1), quote = TRUE), output = '"V1"\n.*1\n2\n3', message = "x being coerced from class: matrix to data.table") test(1658.36, fwrite(matrix(1:4, nrow=2, ncol=2, dimnames = list(c("ra","rb"),c("ca","cb"))), quote = TRUE), output = '"ca","cb"\n.*1,3\n2,4', message = "x being coerced from class: matrix to data.table") +# fwrite output to console ignore compress +test(1658.37, fwrite(data.table(a=c(1:3), b=c(1:3)), compress="gzip"), + output='a,b\n1,1\n2,2\n3,3') + +# fwrite force gzipped output +if (.Platform$OS.type=="unix") { + f <- tempfile() + fwrite(data.table(a=c(1:3), b=c(1:3)), file=f, compress="gzip") + test(1658.38, system(paste("zcat", f), intern=T), output='[1] "a,b" "1,1" "2,2" "3,3"') + unlink(f) +} + + +# fwrite force csv output +if (.Platform$OS.type=="unix") { + f <- tempfile() + fwrite(data.table(a=c(1:3), b=c(1:3)), file=f, compress="none") + test(1658.39, system(paste("cat", f), intern=T), output='[1] "a,b" "1,1" "2,2" "3,3"') + unlink(f) +} + ## End fwrite tests # tests for #679, inrange(), FR #707 From c8858e42c087824c1f9eac4bfb795cf481eaa589 Mon Sep 17 00:00:00 2001 From: Philippe Chataignon Date: Sun, 13 Jan 2019 13:09:41 +0100 Subject: [PATCH 09/62] \#endif was in wrong place --- src/fwrite.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/fwrite.c b/src/fwrite.c index c42297e512..cd3e45a450 100644 --- a/src/fwrite.c +++ b/src/fwrite.c @@ -661,6 +661,7 @@ void fwriteMain(fwriteMainArgs args) #else f = open(args.filename, O_WRONLY | O_CREAT | (args.append ? O_APPEND : O_TRUNC), 0666); // There is no binary/text mode distinction on Linux and Mac +#endif if (f == -1) { int erropen = errno; STOP(access( args.filename, F_OK ) != -1 ? 
@@ -669,7 +670,6 @@ void fwriteMain(fwriteMainArgs args) strerror(erropen), args.filename); } } else { -#endif zf = gzopen(args.filename, "wb"); if (zf == NULL) { int erropen = errno; From 42b4964bbb39e4dec5595c1a27f575743e06fb60 Mon Sep 17 00:00:00 2001 From: Philippe Chataignon Date: Sun, 13 Jan 2019 11:38:44 +0100 Subject: [PATCH 10/62] Remove realloc sections --- src/fwrite.c | 26 ++++++++++---------------- 1 file changed, 10 insertions(+), 16 deletions(-) diff --git a/src/fwrite.c b/src/fwrite.c index cd3e45a450..193d049264 100644 --- a/src/fwrite.c +++ b/src/fwrite.c @@ -547,17 +547,10 @@ static inline void checkBuffer( // This checkBuffer() is called after every line. ) { if (failed) return; // another thread already failed. Fall through and error(). - size_t thresh = 0.75*(*myAlloc); - if ((*ch > (*buffer)+thresh) || - (rowsPerBatch*myMaxLineLen > thresh )) { - size_t off = *ch-*buffer; - *myAlloc = 1.5*(*myAlloc); - *buffer = realloc(*buffer, *myAlloc); - if (*buffer==NULL) { - failed = -errno; // - for malloc/realloc errno, + for write errno - } else { - *ch = *buffer+off; // in case realloc moved the allocation - } + + // buffer is too small. ask to increase buffMB + if (rowsPerBatch * myMaxLineLen >= (*myAlloc)) { + failed = 1; } } @@ -759,10 +752,11 @@ void fwriteMain(fwriteMainArgs args) // If we don't use all the buffer for any reasons that's ok as OS will only getch the cache lines touched. // So, generally the larger the better up to max filesize/nth to use all the threads. A few times // smaller than that though, to achieve some load balancing across threads since schedule(dynamic). - if (maxLineLen > buffSize) buffSize=2*maxLineLen; // A very long line; at least 1,048,576 characters (since min(buffMB)==1) - rowsPerBatch = - (10*maxLineLen > buffSize) ? 1 : // very very long lines (100,000 characters+) each thread will just do one row at a time. - 0.5 * buffSize/maxLineLen; // Aim for 50% buffer usage. See checkBuffer for comments. 
+ + if (2 * maxLineLen > buffSize) { + STOP("Error : line length is greater than half buffer size. Increase buffMB"); + } + rowsPerBatch = buffSize / maxLineLen / 2; if (rowsPerBatch > args.nrow) rowsPerBatch = args.nrow; int numBatches = (args.nrow-1)/rowsPerBatch + 1; int nth = args.nth; @@ -838,7 +832,7 @@ void fwriteMain(fwriteMainArgs args) // file output would be out-of-order. Can't change rowsPerBatch after the 'parallel for' started. size_t thisLineLen = ch-lineStart; if (thisLineLen > myMaxLineLen) myMaxLineLen=thisLineLen; - checkBuffer(&myBuff, &myAlloc, &ch, myMaxLineLen); + // checkBuffer(&myBuff, &myAlloc, &ch, myMaxLineLen); if (failed) break; // this thread stop writing rows; fall through to clear up and STOP() below } #pragma omp ordered From 81cc49b7dee5359cfd88e28a7f28faf16d61bd4d Mon Sep 17 00:00:00 2001 From: Philippe Chataignon Date: Sun, 13 Jan 2019 12:37:47 +0100 Subject: [PATCH 11/62] Header line is written in buff Before this commit, every column is output one by one. --- src/fwrite.c | 62 +++++++++++++++++++++++----------------------------- 1 file changed, 27 insertions(+), 35 deletions(-) diff --git a/src/fwrite.c b/src/fwrite.c index 193d049264..b35758ad21 100644 --- a/src/fwrite.c +++ b/src/fwrite.c @@ -686,51 +686,43 @@ void fwriteMain(fwriteMainArgs args) } if (args.colNames) { // We don't know how long this line will be. - // It could be (much) longer than the data row line lengths - // To keep things simple we'll reuse the same buffer used above for each field, and write each column name separately to the file. - // If multiple calls to write() is ever an issue, we'll come back to this. But very unlikely. 
+ // This line must be lesser than buffer size buffSize char *ch = buff; if (args.doRowNames) { // Unusual: the extra blank column name when row_names are added as the first column - if (doQuote!=0/*'auto'(NA) or true*/) { *ch++='"'; *ch++='"'; } // to match write.csv + if (doQuote) { + *ch++='"'; *ch++='"'; + } *ch++ = sep; } for (int j=0; j0) { + *ch++ = sep; + } writeString(args.colNames, j, &ch); - if(!args.is_gzip) { - if (f==-1) { - *ch = '\0'; - DTPRINT(buff); - } else if (WRITE(f, buff, (int)(ch-buff)) == -1) { // TODO: move error check inside WRITE - int errwrite=errno; // capture write errno now incase close fails with a different errno - CLOSE(f); - free(buff); - STOP("%s: '%s'", strerror(errwrite), args.filename); - } - } else { - if ((!gzwrite(zf, buff, (int)(ch-buff)))) { - int errwrite=gzclose(zf); - free(buff); - STOP("Error gzwrite %d: %s", errwrite, args.filename); - } + if ((size_t) (ch - buff) >= buffSize) { + STOP("Header line too long : increase buffMB parameter"); } - - ch = buff; // overwrite column names at the start in case they are > 1 million bytes long - *ch++ = args.sep; // this sep after the last column name won't be written to the file } - if (f==-1) { - DTPRINT(args.eol); - } else if (!args.is_gzip && WRITE(f, args.eol, eolLen)==-1) { - int errwrite=errno; - CLOSE(f); - free(buff); - STOP("%s: '%s'", strerror(errwrite), args.filename); - } else if (args.is_gzip && (!gzwrite(zf, args.eol, eolLen))) { - int errwrite=gzclose(zf); - free(buff); - STOP("Error gzwrite %d: %s", errwrite, args.filename); + write_chars(args.eol, &ch); + + if(!args.is_gzip) { + if (f==-1) { + *ch = '\0'; + DTPRINT(buff); + } else if (WRITE(f, buff, (int)(ch-buff)) == -1) { // TODO: move error check inside WRITE + int errwrite=errno; // capture write errno now incase close fails with a different errno + CLOSE(f); + free(buff); + STOP("%s: '%s'", strerror(errwrite), args.filename); + } + } else { + if ((!gzwrite(zf, buff, (int)(ch-buff)))) { + int 
errwrite=gzclose(zf); + free(buff); + STOP("Error gzwrite %d: %s", errwrite, args.filename); + } } - } free(buff); // TODO: also to be free'd in cleanup when there's an error opening file above if (args.verbose) DTPRINT("done in %.3fs\n", 1.0*(wallclock()-t0)); From 9b575db758ed963412d0a9a778835af24d7d3990 Mon Sep 17 00:00:00 2001 From: Philippe Chataignon Date: Sun, 13 Jan 2019 15:02:39 +0100 Subject: [PATCH 12/62] Header compression OK --- src/fwrite.c | 128 +++++++++++++++++++++++++++++++-------------------- 1 file changed, 78 insertions(+), 50 deletions(-) diff --git a/src/fwrite.c b/src/fwrite.c index b35758ad21..a9674e9d5f 100644 --- a/src/fwrite.c +++ b/src/fwrite.c @@ -547,13 +547,55 @@ static inline void checkBuffer( // This checkBuffer() is called after every line. ) { if (failed) return; // another thread already failed. Fall through and error(). - + // buffer is too small. ask to increase buffMB if (rowsPerBatch * myMaxLineLen >= (*myAlloc)) { failed = 1; } } +int compressbuff(Bytef* dest, uLongf* destLen, const Bytef* source, uLong sourceLen) +{ + int level = Z_DEFAULT_COMPRESSION; + z_stream stream; + int err; + const uInt max = (uInt)-1; + uLong left; + + left = *destLen; + *destLen = 0; + + stream.zalloc = (alloc_func)0; + stream.zfree = (free_func)0; + stream.opaque = (voidpf)0; + + //err = deflateInit(&stream, level); + err = deflateInit2 (&stream, level, Z_DEFLATED, 31, 8, Z_DEFAULT_STRATEGY); + if (err != Z_OK) + return err; + + stream.next_out = dest; + stream.avail_out = 0; + stream.next_in = (z_const Bytef *)source; + stream.avail_in = 0; + + do { + if (stream.avail_out == 0) { + stream.avail_out = left > (uLong)max ? max : (uInt)left; + left -= stream.avail_out; + } + if (stream.avail_in == 0) { + stream.avail_in = sourceLen > (uLong)max ? max : (uInt)sourceLen; + sourceLen -= stream.avail_in; + } + err = deflate(&stream, sourceLen ? 
Z_NO_FLUSH : Z_FINISH); + } while (err == Z_OK); + + *destLen = stream.total_out; + deflateEnd(&stream); + return err == Z_STREAM_END ? Z_OK : err; +} + void fwriteMain(fwriteMainArgs args) { double startTime = wallclock(); @@ -598,9 +640,16 @@ void fwriteMain(fwriteMainArgs args) if (buffMB<1 || buffMB>1024) STOP("buffMB=%d outside [1,1024]", buffMB); size_t buffSize = (size_t)1024*1024*buffMB; char *buff = malloc(buffSize); - if (!buff) STOP("Unable to allocate %dMB for line length estimation: %s", buffMB, strerror(errno)); - - if (args.verbose) { + if (!buff) + STOP("Unable to allocate %dMiB for buffer: %s", buffSize / 1024 / 1024, strerror(errno)); + size_t zbuffSize = buffSize + buffSize/10 + 16; + Bytef *zbuff = malloc(zbuffSize); + if (!zbuff) + STOP("Unable to allocate %dMiB for zbuffer: %s", zbuffSize / 1024 / 1024, strerror(errno)); + uLongf zbuffUsed = 0; + + if(args.verbose) { + DTPRINT("\nBuff:%p size: %d zbuff:%p size: %d\n", buff, buffSize, zbuff, zbuffSize); DTPRINT("Column writers: "); if (args.ncol<=50) { for (int j=0; j buffSize) { STOP("Error : line length is greater than half buffer size. Increase buffMB"); } @@ -842,8 +876,6 @@ void fwriteMain(fwriteMainArgs args) } else { if (!args.is_gzip && WRITE(f, myBuff, (int)(ch-myBuff)) == -1) { failed=errno; - } else if (args.is_gzip && (!gzwrite(zf, myBuff, (int)(ch-myBuff)))) { - gzerror(zf, &failed); } if (myAlloc > buffSize) anyBufferGrown = true; int used = 100*((double)(ch-myBuff))/buffSize; // percentage of original buffMB @@ -900,14 +932,10 @@ void fwriteMain(fwriteMainArgs args) DTPRINT("\n"); } } - + if (!args.is_gzip) { if (f!=-1 && CLOSE(f) && !failed) STOP("%s: '%s'", strerror(errno), args.filename); - } else { - if ( (err=gzclose(zf)) ) { - STOP("gzclose error %d: '%s'", err, args.filename); - } } // quoted '%s' in case of trailing spaces in the filename // If a write failed, the line above tries close() to clean up, but that might fail as well. 
So the From 1375a341ce6fe89074a228436bd9d88b8e3b6e13 Mon Sep 17 00:00:00 2001 From: Philippe Chataignon Date: Sun, 13 Jan 2019 17:22:59 +0100 Subject: [PATCH 13/62] Tmp : fwrite csv doesn't end --- src/fwrite.c | 158 ++++++++++++++------------------------------------- 1 file changed, 44 insertions(+), 114 deletions(-) diff --git a/src/fwrite.c b/src/fwrite.c index a9674e9d5f..7051063701 100644 --- a/src/fwrite.c +++ b/src/fwrite.c @@ -529,31 +529,9 @@ void writeCategString(void *col, int64_t row, char **pch) write_string(getCategString(col, row), pch); } - static int failed = 0; static int rowsPerBatch; -static inline void checkBuffer( - char **buffer, // this thread's buffer - size_t *myAlloc, // the size of this buffer - char **ch, // the end of the last line written to the buffer by this thread - size_t myMaxLineLen // the longest line seen so far by this thread - // Initial size for the thread's buffer is twice as big as needed for rowsPerBatch based on - // maxLineLen from the sample; i.e. only 50% of the buffer should be used. - // If we get to 75% used, we'll realloc. - // i.e. very cautious and grateful to the OS for not fetching untouched pages of buffer. - // Plus, more caution ... myMaxLineLine is tracked and if that grows we'll realloc too. - // Very long lines are caught up front and rowsPerBatch is set to 1 in that case. - // This checkBuffer() is called after every line. -) { - if (failed) return; // another thread already failed. Fall through and error(). - - // buffer is too small. 
ask to increase buffMB - if (rowsPerBatch * myMaxLineLen >= (*myAlloc)) { - failed = 1; - } -} - int compressbuff(Bytef* dest, uLongf* destLen, const Bytef* source, uLong sourceLen) { int level = Z_DEFAULT_COMPRESSION; @@ -747,37 +725,41 @@ void fwriteMain(fwriteMainArgs args) if(ret) { STOP("Compress error: %d", ret); } - buff = zbuff; - ch = zbuff + zbuffUsed; } + int errwrite = 0; if (f==-1) { *ch = '\0'; DTPRINT(buff); - } else if (WRITE(f, buff, (int)(ch-buff)) == -1) { - int errwrite=errno; // capture write errno now incase close fails with a different errno + } else if (!args.is_gzip && WRITE(f, buff, (int)(ch-buff)) == -1) { + errwrite=errno; // capture write errno now incase close fails with a different errno + } else if (args.is_gzip && WRITE(f, zbuff, (int)zbuffUsed) == -1) { + errwrite=errno; + } + + if (errwrite) { CLOSE(f); free(buff); free(zbuff); STOP("%s: '%s'", strerror(errwrite), args.filename); } } + free(buff); // TODO: also to be free'd in cleanup when there's an error opening file above - if (args.verbose) DTPRINT("done in %.3fs\n", 1.0*(wallclock()-t0)); + free(zbuff); // TODO: also to be free'd in cleanup when there's an error opening file above + + if (args.verbose) + DTPRINT("done in %.3fs\n", 1.0*(wallclock()-t0)); if (args.nrow == 0) { - if (args.verbose) DTPRINT("No data rows present (nrow==0)\n"); - if (f!=-1 && CLOSE(f)) STOP("%s: '%s'", strerror(errno), args.filename); + if (args.verbose) + DTPRINT("No data rows present (nrow==0)\n"); + if (f!=-1 && CLOSE(f)) + STOP("%s: '%s'", strerror(errno), args.filename); return; } // Decide buffer size and rowsPerBatch for each thread - // Once rowsPerBatch is decided it can't be changed, but we can increase buffer size if the lines - // turn out to be longer than estimated from the sample. - // buffSize large enough to fit many lines to i) reduce calls to write() and ii) reduce thread sync points - // It doesn't need to be small in cache because it's written contiguously. 
- // If we don't use all the buffer for any reasons that's ok as OS will only getch the cache lines touched. - // So, generally the larger the better up to max filesize/nth to use all the threads. A few times - // smaller than that though, to achieve some load balancing across threads since schedule(dynamic). + // Once rowsPerBatch is decided it can't be changed if (2 * maxLineLen > buffSize) { STOP("Error : line length is greater than half buffer size. Increase buffMB"); @@ -803,19 +785,22 @@ void fwriteMain(fwriteMainArgs args) { char *ch, *myBuff; // local to each thread ch = myBuff = malloc(buffSize); // each thread has its own buffer. malloc and errno are thread-safe. - if (myBuff==NULL) {failed=-errno;} + + if (myBuff==NULL) { + failed=-errno; + } + size_t myzbuffSize = buffSize + buffSize/10 + 16; + Bytef *myzBuff = malloc(myzbuffSize); + uLongf myzbuffUsed = 0; + if (myzBuff==NULL) { + failed=-errno; + } // Do not rely on availability of '#omp cancel' new in OpenMP v4.0 (July 2013). // OpenMP v4.0 is in gcc 4.9+ (https://gcc.gnu.org/wiki/openmp) but // not yet in clang as of v3.8 (http://openmp.llvm.org/) // If not-me failed, I'll see shared 'failed', fall through loop, free my buffer // and after parallel section, single thread will call STOP() safely. - size_t myAlloc = buffSize; - size_t myMaxLineLen = maxLineLen; - // so we can realloc(). Should only be needed if there are very long lines that are - // much longer than occurred in the sample for maxLineLen; e.g. unusally long string values - // that didn't occur in the sample, or list columns with some very long vectors in some cells. - #pragma omp single { nth = omp_get_num_threads(); // update nth with the actual nth (might be different than requested) @@ -824,10 +809,10 @@ void fwriteMain(fwriteMainArgs args) #pragma omp for ordered schedule(dynamic) for(int64_t start=0; start=1 because 0-columns was caught earlier. 
write_chars(args.eol, &ch); // overwrite last sep with eol instead - // Track longest line seen so far. If we start to see longer lines than we saw in the - // sample, we'll realloc the buffer. The rowsPerBatch chosen based on the (very good) sample, - // must fit in the buffer. Can't early write and reset buffer because the - // file output would be out-of-order. Can't change rowsPerBatch after the 'parallel for' started. - size_t thisLineLen = ch-lineStart; - if (thisLineLen > myMaxLineLen) myMaxLineLen=thisLineLen; - // checkBuffer(&myBuff, &myAlloc, &ch, myMaxLineLen); - if (failed) break; // this thread stop writing rows; fall through to clear up and STOP() below + // compress buffer if gzip + if(args.is_gzip){ + myzbuffUsed = myzbuffSize; + failed = compressbuff(myzBuff, &myzbuffUsed, myBuff, (int)(ch - myBuff)); + } + if (failed) + break; // this thread stop writing rows; fall through to clear up and STOP() below } #pragma omp ordered { @@ -867,76 +851,23 @@ void fwriteMain(fwriteMainArgs args) if (f==-1) { *ch='\0'; // standard C string end marker so DTPRINT knows where to stop DTPRINT(myBuff); - // nth==1 at this point since when file=="" (f==-1 here) fwrite.R calls setDTthreads(1) - // Although this ordered section is one-at-a-time it seems that calling Rprintf() here, even with a - // R_FlushConsole() too, causes corruptions on Windows but not on Linux. At least, as observed so - // far using capture.output(). Perhaps Rprintf() updates some state or allocation that cannot be done - // by slave threads, even when one-at-a-time. Anyway, made this single-threaded when output to console - // to be safe (setDTthreads(1) in fwrite.R) since output to console doesn't need to be fast. 
- } else { - if (!args.is_gzip && WRITE(f, myBuff, (int)(ch-myBuff)) == -1) { + } else if (!args.is_gzip && WRITE(f, myBuff, (int)(ch-myBuff)) == -1) { + failed=errno; + } else if (args.is_gzip && WRITE(f, myzBuff, (int)(myzbuffUsed)) == -1) { failed=errno; - } - if (myAlloc > buffSize) anyBufferGrown = true; - int used = 100*((double)(ch-myBuff))/buffSize; // percentage of original buffMB - if (used > maxBuffUsedPC) maxBuffUsedPC = used; - double now; - if (me==0 && args.showProgress && (now=wallclock())>=nextTime && !failed) { - // See comments above inside the f==-1 clause. - // Not only is this ordered section one-at-a-time but we'll also Rprintf() here only from the - // master thread (me==0) and hopefully this will work on Windows. If not, user should set - // showProgress=FALSE until this can be fixed or removed. - int ETA = (int)((args.nrow-end)*((now-startTime)/end)); - if (hasPrinted || ETA >= 2) { - if (args.verbose && !hasPrinted) DTPRINT("\n"); - DTPRINT("\rWritten %.1f%% of %d rows in %d secs using %d thread%s. " - "anyBufferGrown=%s; maxBuffUsed=%d%%. ETA %d secs. ", - (100.0*end)/args.nrow, args.nrow, (int)(now-startTime), nth, nth==1?"":"s", - anyBufferGrown?"yes":"no", maxBuffUsedPC, ETA); - // TODO: use progress() as in fread - nextTime = now+1; - hasPrinted = true; - } - } - // May be possible for master thread (me==0) to call R_CheckUserInterrupt() here. - // Something like: - // if (me==0) { - // failed = TRUE; // inside ordered here; the slaves are before ordered and not looking at 'failed' - // R_CheckUserInterrupt(); - // failed = FALSE; // no user interrupt so return state - // } - // But I fear the slaves will hang waiting for the master (me==0) to complete the ordered - // section which may not happen if the master thread has been interrupted. Rather than - // seeing failed=TRUE and falling through to free() and close() as intended. 
- // Could register a finalizer to free() and close() perhaps : - // [r-devel] http://r.789695.n4.nabble.com/checking-user-interrupts-in-C-code-tp2717528p2717722.html - // Conclusion for now: do not provide ability to interrupt. - // write() errors and malloc() fails will be caught and cleaned up properly, however. } ch = myBuff; // back to the start of my buffer ready to fill it up again } } } - free(myBuff); // all threads will call this free on their buffer, even if one or more threads had malloc // or realloc fail. If the initial malloc failed, free(NULL) is ok and does nothing. - } - // Finished parallel region and can call R API safely now. - if (hasPrinted) { - if (!failed) { - // clear the progress meter - DTPRINT("\r " - " \r"); - } else { - // unless failed as we'd like to see anyBufferGrown and maxBuffUsedPC - DTPRINT("\n"); - } + free(myBuff); + free(myzBuff); } - if (!args.is_gzip) { - if (f!=-1 && CLOSE(f) && !failed) - STOP("%s: '%s'", strerror(errno), args.filename); - } + if (f!=-1 && CLOSE(f) && !failed) + STOP("%s: '%s'", strerror(errno), args.filename); // quoted '%s' in case of trailing spaces in the filename // If a write failed, the line above tries close() to clean up, but that might fail as well. 
So the // '&& !failed' is to not report the error as just 'closing file' but the next line for more detail @@ -951,4 +882,3 @@ void fwriteMain(fwriteMainArgs args) nth, anyBufferGrown?"yes":"no", maxBuffUsedPC); return; } - From 38764d9361d7e502a2abcb66c16e34143ee13b83 Mon Sep 17 00:00:00 2001 From: Philippe Chataignon Date: Sun, 13 Jan 2019 18:07:09 +0100 Subject: [PATCH 14/62] self-or for failed --- src/fwrite.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/fwrite.c b/src/fwrite.c index 7051063701..9164e71e7e 100644 --- a/src/fwrite.c +++ b/src/fwrite.c @@ -840,7 +840,7 @@ void fwriteMain(fwriteMainArgs args) // compress buffer if gzip if(args.is_gzip){ myzbuffUsed = myzbuffSize; - failed = compressbuff(myzBuff, &myzbuffUsed, myBuff, (int)(ch - myBuff)); + failed |= compressbuff(myzBuff, &myzbuffUsed, myBuff, (int)(ch - myBuff)); } if (failed) break; // this thread stop writing rows; fall through to clear up and STOP() below From 7586e2f3cf71bb524fc74ae02af67c21fe7aec85 Mon Sep 17 00:00:00 2001 From: Philippe Chataignon Date: Sun, 13 Jan 2019 18:19:29 +0100 Subject: [PATCH 15/62] Add if fail --- src/fwrite.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/fwrite.c b/src/fwrite.c index 9164e71e7e..fef04b10e9 100644 --- a/src/fwrite.c +++ b/src/fwrite.c @@ -836,7 +836,8 @@ void fwriteMain(fwriteMainArgs args) // Tepid again (once at the end of each line) ch--; // backup onto the last sep after the last column. ncol>=1 because 0-columns was caught earlier. 
write_chars(args.eol, &ch); // overwrite last sep with eol instead - + if (failed) + break; // compress buffer if gzip if(args.is_gzip){ myzbuffUsed = myzbuffSize; From 72c70ae3c4e20852c4101a3294b7b78c31427a70 Mon Sep 17 00:00:00 2001 From: Philippe Chataignon Date: Sun, 13 Jan 2019 18:22:33 +0100 Subject: [PATCH 16/62] Remove #endif --- src/fwrite.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/fwrite.c b/src/fwrite.c index fef04b10e9..0c41234d06 100644 --- a/src/fwrite.c +++ b/src/fwrite.c @@ -681,7 +681,7 @@ void fwriteMain(fwriteMainArgs args) f = open(args.filename, O_WRONLY | O_CREAT | (args.append ? O_APPEND : O_TRUNC), 0666); #endif // There is no binary/text mode distinction on Linux and Mac -#endif + if (f == -1) { int erropen = errno; STOP(access( args.filename, F_OK ) != -1 ? @@ -690,6 +690,7 @@ void fwriteMain(fwriteMainArgs args) strerror(erropen), args.filename); } } + t0=wallclock(); if (args.verbose) { From 6dd28c88a2149dcc1922378c5aa62fb6aa67389f Mon Sep 17 00:00:00 2001 From: Philippe Chataignon Date: Sun, 13 Jan 2019 19:28:04 +0100 Subject: [PATCH 17/62] Move compress buffer at the right place --- src/fwrite.c | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/src/fwrite.c b/src/fwrite.c index 0c41234d06..92c26b4801 100644 --- a/src/fwrite.c +++ b/src/fwrite.c @@ -837,13 +837,6 @@ void fwriteMain(fwriteMainArgs args) // Tepid again (once at the end of each line) ch--; // backup onto the last sep after the last column. ncol>=1 because 0-columns was caught earlier. 
write_chars(args.eol, &ch); // overwrite last sep with eol instead - if (failed) - break; - // compress buffer if gzip - if(args.is_gzip){ - myzbuffUsed = myzbuffSize; - failed |= compressbuff(myzBuff, &myzbuffUsed, myBuff, (int)(ch - myBuff)); - } if (failed) break; // this thread stop writing rows; fall through to clear up and STOP() below } @@ -853,10 +846,15 @@ void fwriteMain(fwriteMainArgs args) if (f==-1) { *ch='\0'; // standard C string end marker so DTPRINT knows where to stop DTPRINT(myBuff); - } else if (!args.is_gzip && WRITE(f, myBuff, (int)(ch-myBuff)) == -1) { - failed=errno; - } else if (args.is_gzip && WRITE(f, myzBuff, (int)(myzbuffUsed)) == -1) { + } else if (!args.is_gzip && WRITE(f, myBuff, (int)(ch - myBuff)) == -1) { failed=errno; + } else if (args.is_gzip) { + // compress buffer if gzip + myzbuffUsed = myzbuffSize; + if (failed = compressbuff(myzBuff, &myzbuffUsed, myBuff, (int)(ch - myBuff))) {} + else if (WRITE(f, myzBuff, (int)(myzbuffUsed)) == -1) { + failed=errno; + } } ch = myBuff; // back to the start of my buffer ready to fill it up again } From 734687557b2efd9754a0caa07f886fa20fe34bc3 Mon Sep 17 00:00:00 2001 From: Philippe Chataignon Date: Sun, 13 Jan 2019 19:55:46 +0100 Subject: [PATCH 18/62] Compress in thread, only write is ordered --- src/fwrite.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/src/fwrite.c b/src/fwrite.c index 92c26b4801..67f9430294 100644 --- a/src/fwrite.c +++ b/src/fwrite.c @@ -840,6 +840,12 @@ void fwriteMain(fwriteMainArgs args) if (failed) break; // this thread stop writing rows; fall through to clear up and STOP() below } + // compress buffer if gzip + if (args.is_gzip) { + // compress buffer if gzip + myzbuffUsed = myzbuffSize; + failed = compressbuff(myzBuff, &myzbuffUsed, myBuff, (int)(ch - myBuff)); + } #pragma omp ordered { if (!failed) { // a thread ahead of me could have failed below while I was working or waiting above @@ -848,13 +854,8 @@ void 
fwriteMain(fwriteMainArgs args) DTPRINT(myBuff); } else if (!args.is_gzip && WRITE(f, myBuff, (int)(ch - myBuff)) == -1) { failed=errno; - } else if (args.is_gzip) { - // compress buffer if gzip - myzbuffUsed = myzbuffSize; - if (failed = compressbuff(myzBuff, &myzbuffUsed, myBuff, (int)(ch - myBuff))) {} - else if (WRITE(f, myzBuff, (int)(myzbuffUsed)) == -1) { - failed=errno; - } + } else if (args.is_gzip && WRITE(f, myzBuff, (int)(myzbuffUsed)) == -1) { + failed=errno; } ch = myBuff; // back to the start of my buffer ready to fill it up again } From 98d90dc2b3850c6554246c3eac7a4149b6f402d5 Mon Sep 17 00:00:00 2001 From: Philippe Chataignon Date: Sun, 13 Jan 2019 21:08:25 +0100 Subject: [PATCH 19/62] Remove comment --- src/fwrite.c | 1 - 1 file changed, 1 deletion(-) diff --git a/src/fwrite.c b/src/fwrite.c index 67f9430294..6e1cb4d5b0 100644 --- a/src/fwrite.c +++ b/src/fwrite.c @@ -842,7 +842,6 @@ void fwriteMain(fwriteMainArgs args) } // compress buffer if gzip if (args.is_gzip) { - // compress buffer if gzip myzbuffUsed = myzbuffSize; failed = compressbuff(myzBuff, &myzbuffUsed, myBuff, (int)(ch - myBuff)); } From 9202717246d749500c72540e2598ac41f9c12f23 Mon Sep 17 00:00:00 2001 From: Philippe Chataignon Date: Mon, 14 Jan 2019 09:22:46 +0100 Subject: [PATCH 20/62] Move bool is_gzip in fwrite.h --- src/fwrite.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/fwrite.h b/src/fwrite.h index 3bc2942c10..4a68e2dea1 100644 --- a/src/fwrite.h +++ b/src/fwrite.h @@ -63,8 +63,8 @@ typedef struct fwriteMainArgs int buffMB; // [1-1024] default 8MB int nth; bool showProgress; - bool verbose; bool is_gzip; + bool verbose; } fwriteMainArgs; void fwriteMain(fwriteMainArgs args); From 2564197135e363d45847e52852fe8f1e822a49a8 Mon Sep 17 00:00:00 2001 From: Philippe Chataignon Date: Mon, 14 Jan 2019 11:14:16 +0100 Subject: [PATCH 21/62] Add cast and remove unused variables --- src/fwrite.c | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) 
diff --git a/src/fwrite.c b/src/fwrite.c index 6e1cb4d5b0..49ad8e6fbc 100644 --- a/src/fwrite.c +++ b/src/fwrite.c @@ -577,7 +577,6 @@ int compressbuff(Bytef* dest, uLongf* destLen, const Bytef* source, uLong source void fwriteMain(fwriteMainArgs args) { double startTime = wallclock(); - double nextTime = startTime+2; // start printing progress meter in 2 sec if not completed by then double t0 = startTime; na = args.na; @@ -667,7 +666,6 @@ void fwriteMain(fwriteMainArgs args) if (args.verbose) DTPRINT("maxLineLen=%d from sample. Found in %.3fs\n", maxLineLen, 1.0*(wallclock()-t0)); int f=0; - int err; if (*args.filename=='\0') { f=-1; // file="" means write to standard output args.is_gzip = false; // gzip is only for file @@ -722,7 +720,7 @@ void fwriteMain(fwriteMainArgs args) // compress buff into zbuff if(args.is_gzip){ zbuffUsed = zbuffSize; - int ret = compressbuff(zbuff, &zbuffUsed, buff, (int)(ch - buff)); + int ret = compressbuff(zbuff, &zbuffUsed, (Bytef*)buff, (int)(ch - buff)); if(ret) { STOP("Compress error: %d", ret); } @@ -778,7 +776,6 @@ void fwriteMain(fwriteMainArgs args) t0 = wallclock(); failed=0; // static global so checkBuffer can set it. -errno for malloc or realloc fails, +errno for write fail - bool hasPrinted=false; bool anyBufferGrown=false; int maxBuffUsedPC=0; @@ -802,12 +799,6 @@ void fwriteMain(fwriteMainArgs args) // If not-me failed, I'll see shared 'failed', fall through loop, free my buffer // and after parallel section, single thread will call STOP() safely. 
- #pragma omp single - { - nth = omp_get_num_threads(); // update nth with the actual nth (might be different than requested) - } - int me = omp_get_thread_num(); - #pragma omp for ordered schedule(dynamic) for(int64_t start=0; start Date: Mon, 14 Jan 2019 14:30:18 +0100 Subject: [PATCH 22/62] Test buffer size et adapt error messages --- src/fwrite.c | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/src/fwrite.c b/src/fwrite.c index 49ad8e6fbc..6b995737a6 100644 --- a/src/fwrite.c +++ b/src/fwrite.c @@ -613,20 +613,19 @@ void fwriteMain(fwriteMainArgs args) int eolLen = strlen(args.eol); if (eolLen<=0) STOP("eol must be 1 or more bytes (usually either \\n or \\r\\n) but is length %d", eolLen); - int buffMB = args.buffMB; - if (buffMB<1 || buffMB>1024) STOP("buffMB=%d outside [1,1024]", buffMB); - size_t buffSize = (size_t)1024*1024*buffMB; + if (args.buffMB<1 || args.buffMB>1024) STOP("buffMB=%d outside [1,1024]", args.buffMB); + size_t buffSize = (size_t)1024*1024*args.buffMB; char *buff = malloc(buffSize); if (!buff) - STOP("Unable to allocate %dMiB for buffer: %s", buffSize / 1024 / 1024, strerror(errno)); + STOP("Unable to allocate %d MiB for buffer: %s", buffSize / 1024 / 1024, strerror(errno)); size_t zbuffSize = buffSize + buffSize/10 + 16; Bytef *zbuff = malloc(zbuffSize); if (!zbuff) - STOP("Unable to allocate %dMiB for zbuffer: %s", zbuffSize / 1024 / 1024, strerror(errno)); + STOP("Unable to allocate %d MiB for zbuffer: %s", zbuffSize / 1024 / 1024, strerror(errno)); uLongf zbuffUsed = 0; if(args.verbose) { - DTPRINT("\nBuff:%p size: %d zbuff:%p size: %d\n", buff, buffSize, zbuff, zbuffSize); + DTPRINT("\nBuff size: %d zbuff size: %d\n", buffSize, zbuffSize); DTPRINT("Column writers: "); if (args.ncol<=50) { for (int j=0; j buffSize) { + STOP("Error : max line length is greater than half buffer size. Try to increase buffMB option. 
Example 'buffMB = %d'\n", 2 * args.buffMB); + } int f=0; if (*args.filename=='\0') { @@ -712,7 +715,7 @@ void fwriteMain(fwriteMainArgs args) } writeString(args.colNames, j, &ch); if ((size_t) (ch - buff) >= buffSize) { - STOP("Header line too long : increase buffMB parameter"); + STOP("Error : header line is greater than buffer size. Try to increase buffMB option. Example 'buffMB = %d'\n", 2 * args.buffMB); } } write_chars(args.eol, &ch); @@ -760,11 +763,9 @@ void fwriteMain(fwriteMainArgs args) // Decide buffer size and rowsPerBatch for each thread // Once rowsPerBatch is decided it can't be changed - if (2 * maxLineLen > buffSize) { - STOP("Error : line length is greater than half buffer size. Increase buffMB"); - } rowsPerBatch = buffSize / maxLineLen / 2; if (rowsPerBatch > args.nrow) rowsPerBatch = args.nrow; + if (rowsPerBatch < 1) rowsPerBatch = 1; int numBatches = (args.nrow-1)/rowsPerBatch + 1; int nth = args.nth; if (numBatches < nth) nth = numBatches; @@ -776,8 +777,6 @@ void fwriteMain(fwriteMainArgs args) t0 = wallclock(); failed=0; // static global so checkBuffer can set it. -errno for malloc or realloc fails, +errno for write fail - bool anyBufferGrown=false; - int maxBuffUsedPC=0; #pragma omp parallel num_threads(nth) { @@ -825,6 +824,10 @@ void fwriteMain(fwriteMainArgs args) *ch++ = sep; //printf(" j=%d args.ncol=%d myBuff='%.*s' ch=%p\n", j, args.ncol, 20, myBuff, ch); } + // Test if buffer to low + if ( (int)(ch - myBuff) >= buffSize ) { + failed = -1; + } // Tepid again (once at the end of each line) ch--; // backup onto the last sep after the last column. ncol>=1 because 0-columns was caught earlier. write_chars(args.eol, &ch); // overwrite last sep with eol instead @@ -864,12 +867,9 @@ void fwriteMain(fwriteMainArgs args) // '&& !failed' is to not report the error as just 'closing file' but the next line for more detail // from the original error. if (failed<0) { - STOP("%s. 
One or more threads failed to malloc or realloc their private buffer. nThread=%d and initial buffMB per thread was %d.\n", - strerror(-failed), nth, args.buffMB); + STOP("Error : one or more threads failed to malloc or buffer was too small. Try to increase buffMB option. Example 'buffMB = %d'\n", 2 * args.buffMB); } else if (failed>0) { STOP("%s: '%s'", strerror(failed), args.filename); } - if (args.verbose) DTPRINT("done (actual nth=%d, anyBufferGrown=%s, maxBuffUsed=%d%%)\n", - nth, anyBufferGrown?"yes":"no", maxBuffUsedPC); return; } From ffa26cb2400750d17156d4391266372944af4794 Mon Sep 17 00:00:00 2001 From: Philippe Chataignon Date: Mon, 14 Jan 2019 22:31:28 +0100 Subject: [PATCH 23/62] Test buffer size after each line --- src/fwrite.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/src/fwrite.c b/src/fwrite.c index 6b995737a6..51e3f076a1 100644 --- a/src/fwrite.c +++ b/src/fwrite.c @@ -818,15 +818,13 @@ void fwriteMain(fwriteMainArgs args) } // Hot loop for (int j=0; j= buffSize ) { - failed = -1; + // Test if buffer to low + if ( (int)(ch - myBuff) >= buffSize ) { + failed = -1; + break; // stop writing + } } // Tepid again (once at the end of each line) ch--; // backup onto the last sep after the last column. ncol>=1 because 0-columns was caught earlier. 
From 842399668f45b9780a0c8665278a00951f81b2d8 Mon Sep 17 00:00:00 2001 From: Philippe Chataignon Date: Mon, 14 Jan 2019 22:48:39 +0100 Subject: [PATCH 24/62] Remove old comment --- src/fwrite.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/fwrite.c b/src/fwrite.c index 51e3f076a1..a307d259bb 100644 --- a/src/fwrite.c +++ b/src/fwrite.c @@ -547,8 +547,7 @@ int compressbuff(Bytef* dest, uLongf* destLen, const Bytef* source, uLong source stream.zfree = (free_func)0; stream.opaque = (voidpf)0; - //err = deflateInit(&stream, level); - err = deflateInit2 (&stream, level, Z_DEFLATED, 31, 8, Z_DEFAULT_STRATEGY); + err = deflateInit2(&stream, level, Z_DEFLATED, 31, 8, Z_DEFAULT_STRATEGY); if (err != Z_OK) return err; From c161828696983bd270ac3c5c45165b5d418668d0 Mon Sep 17 00:00:00 2001 From: Philippe Chataignon Date: Tue, 15 Jan 2019 10:09:37 +0100 Subject: [PATCH 25/62] Use strlen to compute maxLineLen in fwrite --- src/fwrite.c | 33 ++++++++++++++++++++------------- 1 file changed, 20 insertions(+), 13 deletions(-) diff --git a/src/fwrite.c b/src/fwrite.c index a307d259bb..686e6f8b38 100644 --- a/src/fwrite.c +++ b/src/fwrite.c @@ -614,6 +614,8 @@ void fwriteMain(fwriteMainArgs args) if (args.buffMB<1 || args.buffMB>1024) STOP("buffMB=%d outside [1,1024]", args.buffMB); size_t buffSize = (size_t)1024*1024*args.buffMB; + size_t buffLimit = (size_t) 9 * buffSize / 10; // buffer error if more than 90% use + char *buff = malloc(buffSize); if (!buff) STOP("Unable to allocate %d MiB for buffer: %s", buffSize / 1024 / 1024, strerror(errno)); @@ -637,34 +639,39 @@ void fwriteMain(fwriteMainArgs args) } int maxLineLen = 0; - int step = args.nrow<1000 ? 
100 : args.nrow/10; - for (int64_t start=0; start 1 million bytes - args.funs[args.whichFun[j]]( args.columns[j], i, &ch ); - thisLineLen += (int)(ch-buff) + 1/*sep*/; // see comments above about restrictions/guarantees/contracts + if (args.whichFun[j] == 11) { // if String + thisLineLen += strlen(getString(args.columns[j], i))+ + 2*(doQuote!=0/*NA('auto') or true*/) + 1/*sep*/; + } else if (args.whichFun[j] == 12) { // if Factr + thisLineLen += strlen(getCategString(args.columns[j], i))+ + 2*(doQuote!=0/*NA('auto') or true*/) + 1/*sep*/; + } else { + thisLineLen += 16; /* for numeric and dates */ + } } - if (thisLineLen > maxLineLen) maxLineLen = thisLineLen; - } + if (thisLineLen > maxLineLen) + maxLineLen = thisLineLen; } maxLineLen += eolLen; + if (args.verbose) DTPRINT("maxLineLen=%d from sample. Found in %.3fs\n", maxLineLen, 1.0*(wallclock()-t0)); if (2 * maxLineLen > buffSize) { - STOP("Error : max line length is greater than half buffer size. Try to increase buffMB option. Example 'buffMB = %d'\n", 2 * args.buffMB); + STOP("Error : max line length is greater than half buffer size. Try to increase buffMB option. Example 'buffMB = %d'\n", + 2 * (args.buffMB < maxLineLen ? 
maxLineLen : args.buffMB) / 1024 / 1024 + 1); } int f=0; From 646535b85e190b69bde1e71d0cd5708bc86bfea1 Mon Sep 17 00:00:00 2001 From: Philippe Chataignon Date: Tue, 15 Jan 2019 10:52:21 +0100 Subject: [PATCH 26/62] Compute mexHeaderLen and maxLineLen --- src/fwrite.c | 55 +++++++++++++++++++++++++++++++++++++++++++++++----- src/fwrite.h | 1 + 2 files changed, 51 insertions(+), 5 deletions(-) diff --git a/src/fwrite.c b/src/fwrite.c index 686e6f8b38..0a4793e3da 100644 --- a/src/fwrite.c +++ b/src/fwrite.c @@ -529,6 +529,23 @@ void writeCategString(void *col, int64_t row, char **pch) write_string(getCategString(col, row), pch); } +int writer_len[] = { + 5, //&writeBool8, + 5, //&writeBool32, + 5, //&writeBool32AsString, + 9, //&writeInt32, + 19, //&writeInt64, + 24, //&writeFloat64, + 32, //&writeITime, + 16, //&writeDateInt32, + 16, //&writeDateFloat64, + 32, //&writePOSIXct, + 48, //&writeNanotime, + 0, //&writeString, + 0, //&writeCategString, + 1024, //&writeList +}; + static int failed = 0; static int rowsPerBatch; @@ -638,6 +655,32 @@ void fwriteMain(fwriteMainArgs args) DTPRINT("\n"); } + int maxHeaderLen = 0; + if (args.colNames) { + // We don't know how long this line will be. + // This line must be lesser than buffer size buffSize + if (args.doRowNames) { + // Unusual: the extra blank column name when row_names are added as the first column + if (doQuote) { + maxHeaderLen += 2; + } + maxHeaderLen += 1; /* for sep */ + } + for (int j=0; j0) { + maxHeaderLen += 1; /* for sep */ + } + maxHeaderLen += strlen(getString(args.colNames, j)); + } + maxHeaderLen += eolLen; /*eol*/ + } + if (args.verbose) + DTPRINT("maxHeaderLen=%d\n", maxHeaderLen); + + if (maxHeaderLen >= buffSize) { + STOP("Error : header line is greater than buffer size. Try to increase buffMB option. Example 'buffMB = %d'\n", + (args.buffMB < maxHeaderLen ? 
maxHeaderLen : args.buffMB) / 1024 / 1024 + 1); + } int maxLineLen = 0; for (int64_t i = 0; i < args.nrow; i += args.nrow / 1000 + 1) { //write_string(getString(col, row), pch); @@ -652,16 +695,18 @@ void fwriteMain(fwriteMainArgs args) } for (int j=0; j maxLineLen) maxLineLen = thisLineLen; } diff --git a/src/fwrite.h b/src/fwrite.h index 4a68e2dea1..e98cd994a9 100644 --- a/src/fwrite.h +++ b/src/fwrite.h @@ -37,6 +37,7 @@ typedef struct fwriteMainArgs // a vector of pointers to all-same-length column vectors void **columns; writer_fun_t *funs; // a vector of writer_fun_t function pointers + int *writer_len; // length ncol vector containing which fun[] to use for each column // one byte to use 8 times less cache lines than a vector of function pointers would do From ea68fd0e3e3ca653588d6c74d71992333c037001 Mon Sep 17 00:00:00 2001 From: Philippe Chataignon Date: Tue, 15 Jan 2019 10:57:51 +0100 Subject: [PATCH 27/62] Compute output length for writeListe with buffer --- src/fwrite.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/fwrite.c b/src/fwrite.c index 0a4793e3da..88eb281748 100644 --- a/src/fwrite.c +++ b/src/fwrite.c @@ -543,7 +543,7 @@ int writer_len[] = { 48, //&writeNanotime, 0, //&writeString, 0, //&writeCategString, - 1024, //&writeList + 0, //&writeList }; static int failed = 0; @@ -704,6 +704,10 @@ void fwriteMain(fwriteMainArgs args) } else if (args.whichFun[j] == 12) { // if Factor thisLineLen += strlen(getCategString(args.columns[j], i))+ 2*(doQuote!=0/*NA('auto') or true*/) + 1/*sep*/; + } else if (args.whichFun[j] == 13) { // if List + char *ch = buff; // overwrite at the beginning of buff to be more robust > 1 million bytes + writeList(args.columns[j], i, &ch); + thisLineLen += (int)(ch-buff) + 1/*sep*/; } } thisLineLen += eolLen; From 7464a9a67f555fa73720ee9c7c95761afa86b5c0 Mon Sep 17 00:00:00 2001 From: Philippe Chataignon Date: Tue, 15 Jan 2019 14:32:17 +0100 Subject: [PATCH 28/62] Reinsert fwrite progress bar --- 
src/fwrite.c | 57 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 57 insertions(+) diff --git a/src/fwrite.c b/src/fwrite.c index 88eb281748..3bf46bd654 100644 --- a/src/fwrite.c +++ b/src/fwrite.c @@ -593,6 +593,7 @@ int compressbuff(Bytef* dest, uLongf* destLen, const Bytef* source, uLong source void fwriteMain(fwriteMainArgs args) { double startTime = wallclock(); + double nextTime = startTime+2; // start printing progress meter in 2 sec if not completed by then double t0 = startTime; na = args.na; @@ -833,6 +834,9 @@ void fwriteMain(fwriteMainArgs args) failed=0; // static global so checkBuffer can set it. -errno for malloc or realloc fails, +errno for write fail + bool hasPrinted=false; + int maxBuffUsedPC=0; + #pragma omp parallel num_threads(nth) { char *ch, *myBuff; // local to each thread @@ -853,6 +857,12 @@ void fwriteMain(fwriteMainArgs args) // If not-me failed, I'll see shared 'failed', fall through loop, free my buffer // and after parallel section, single thread will call STOP() safely. + #pragma omp single + { + nth = omp_get_num_threads(); // update nth with the actual nth (might be different than requested) + } + int me = omp_get_thread_num(); + #pragma omp for ordered schedule(dynamic) for(int64_t start=0; start maxBuffUsedPC) maxBuffUsedPC = used; + double now; + if (me==0 && args.showProgress && (now=wallclock())>=nextTime && !failed) { + // See comments above inside the f==-1 clause. + // Not only is this ordered section one-at-a-time but we'll also Rprintf() here only from the + // master thread (me==0) and hopefully this will work on Windows. If not, user should set + // showProgress=FALSE until this can be fixed or removed. + int ETA = (int)((args.nrow-end)*((now-startTime)/end)); + if (hasPrinted || ETA >= 2) { + if (args.verbose && !hasPrinted) DTPRINT("\n"); + DTPRINT("\rWritten %.1f%% of %d rows in %d secs using %d thread%s. " + "maxBuffUsed=%d%%. ETA %d secs. 
", + (100.0*end)/args.nrow, args.nrow, (int)(now-startTime), nth, nth==1?"":"s", + maxBuffUsedPC, ETA); + // TODO: use progress() as in fread + nextTime = now+1; + hasPrinted = true; + } + } + // May be possible for master thread (me==0) to call R_CheckUserInterrupt() here. + // Something like: + // if (me==0) { + // failed = TRUE; // inside ordered here; the slaves are before ordered and not looking at 'failed' + // R_CheckUserInterrupt(); + // failed = FALSE; // no user interrupt so return state + // } + // But I fear the slaves will hang waiting for the master (me==0) to complete the ordered + // section which may not happen if the master thread has been interrupted. Rather than + // seeing failed=TRUE and falling through to free() and close() as intended. + // Could register a finalizer to free() and close() perhaps : + // [r-devel] http://r.789695.n4.nabble.com/checking-user-interrupts-in-C-code-tp2717528p2717722.html + // Conclusion for now: do not provide ability to interrupt. + // write() errors and malloc() fails will be caught and cleaned up properly, however. ch = myBuff; // back to the start of my buffer ready to fill it up again } } @@ -913,6 +958,18 @@ void fwriteMain(fwriteMainArgs args) free(myzBuff); } + // Finished parallel region and can call R API safely now. 
+ if (hasPrinted) { + if (!failed) { + // clear the progress meter + DTPRINT("\r " + " \r"); + } else { + // unless failed as we'd like to see anyBufferGrown and maxBuffUsedPC + DTPRINT("\n"); + } + } + if (f!=-1 && CLOSE(f) && !failed) STOP("%s: '%s'", strerror(errno), args.filename); // quoted '%s' in case of trailing spaces in the filename From e62b77ef080414e536e0bdcf2cecbb8858b55ddf Mon Sep 17 00:00:00 2001 From: Philippe Chataignon Date: Tue, 15 Jan 2019 15:20:55 +0100 Subject: [PATCH 29/62] Introduce buffLimit and buffSecure in fwrite --- src/fwrite.c | 31 +++++++++++++++---------------- 1 file changed, 15 insertions(+), 16 deletions(-) diff --git a/src/fwrite.c b/src/fwrite.c index 3bf46bd654..eb08560ca1 100644 --- a/src/fwrite.c +++ b/src/fwrite.c @@ -632,7 +632,8 @@ void fwriteMain(fwriteMainArgs args) if (args.buffMB<1 || args.buffMB>1024) STOP("buffMB=%d outside [1,1024]", args.buffMB); size_t buffSize = (size_t)1024*1024*args.buffMB; - size_t buffLimit = (size_t) 9 * buffSize / 10; // buffer error if more than 90% use + size_t buffLimit = (size_t) 9 * buffSize / 10; // set buffer limit for thread = 90% + size_t buffSecure = (size_t) 5 * buffSize / 10; // maxLineLen in initial sample must be under this value char *buff = malloc(buffSize); if (!buff) @@ -678,10 +679,11 @@ void fwriteMain(fwriteMainArgs args) if (args.verbose) DTPRINT("maxHeaderLen=%d\n", maxHeaderLen); - if (maxHeaderLen >= buffSize) { - STOP("Error : header line is greater than buffer size. Try to increase buffMB option. Example 'buffMB = %d'\n", - (args.buffMB < maxHeaderLen ? maxHeaderLen : args.buffMB) / 1024 / 1024 + 1); + if (maxHeaderLen >= buffLimit) { + STOP("Error : header line is greater than buffer limit. Try to increase buffMB option. 
Example 'buffMB = %d'\n", + 2 * maxHeaderLen / 1024 / 1024 + 1; } + int maxLineLen = 0; for (int64_t i = 0; i < args.nrow; i += args.nrow / 1000 + 1) { //write_string(getString(col, row), pch); @@ -714,15 +716,15 @@ void fwriteMain(fwriteMainArgs args) thisLineLen += eolLen; if (thisLineLen > maxLineLen) maxLineLen = thisLineLen; + // stop if buffer is too low + if (maxLineLen >= buffSecure) { + STOP("Error : max line length is greater than buffer secure limit. Try to increase buffMB option. Example 'buffMB = %d'\n", + 2 * maxLineLen / 1024 / 1024 + 1); + } } - maxLineLen += eolLen; if (args.verbose) DTPRINT("maxLineLen=%d from sample. Found in %.3fs\n", maxLineLen, 1.0*(wallclock()-t0)); - if (2 * maxLineLen > buffSize) { - STOP("Error : max line length is greater than half buffer size. Try to increase buffMB option. Example 'buffMB = %d'\n", - 2 * (args.buffMB < maxLineLen ? maxLineLen : args.buffMB) / 1024 / 1024 + 1); - } int f=0; if (*args.filename=='\0') { @@ -755,8 +757,8 @@ void fwriteMain(fwriteMainArgs args) if (f==-1) DTPRINT("\n"); } if (args.colNames) { - // We don't know how long this line will be. - // This line must be lesser than buffer size buffSize + // We have tested this line length and verify that buffer was big enough + // So no more buffer tests char *ch = buff; if (args.doRowNames) { // Unusual: the extra blank column name when row_names are added as the first column @@ -770,9 +772,6 @@ void fwriteMain(fwriteMainArgs args) *ch++ = sep; } writeString(args.colNames, j, &ch); - if ((size_t) (ch - buff) >= buffSize) { - STOP("Error : header line is greater than buffer size. Try to increase buffMB option. 
Example 'buffMB = %d'\n", 2 * args.buffMB); - } } write_chars(args.eol, &ch); @@ -804,7 +803,7 @@ void fwriteMain(fwriteMainArgs args) } free(buff); // TODO: also to be free'd in cleanup when there's an error opening file above - free(zbuff); // TODO: also to be free'd in cleanup when there's an error opening file above + free(zbuff); if (args.verbose) DTPRINT("done in %.3fs\n", 1.0*(wallclock()-t0)); @@ -886,7 +885,7 @@ void fwriteMain(fwriteMainArgs args) (args.funs[args.whichFun[j]])(args.columns[j], i, &ch); *ch++ = sep; // Test if buffer to low - if ( (int)(ch - myBuff) >= buffSize ) { + if ( (int)(ch - myBuff) >= buffLimit ) { failed = -1; break; // stop writing } From af44e9a7f2dec875af055591518a067dc2ac1e18 Mon Sep 17 00:00:00 2001 From: Philippe Chataignon Date: Tue, 15 Jan 2019 15:24:12 +0100 Subject: [PATCH 30/62] TRUE instead of T in tests.Rraw --- inst/tests/tests.Rraw | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 5b9ac5f34d..b62c2ed439 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -9366,7 +9366,7 @@ if (.Platform$OS.type=="unix") { if (.Platform$OS.type=="unix") { f <- tempfile() fwrite(data.table(a=c(1:3), b=c(1:3)), file=f, compress="none") - test(1658.39, system(paste("cat", f), intern=T), output='[1] "a,b" "1,1" "2,2" "3,3"') + test(1658.39, system(paste("cat", f), intern=TRUE), output='[1] "a,b" "1,1" "2,2" "3,3"') unlink(f) } From dc2b825aff39af6910aa6551b9188ab9110539d9 Mon Sep 17 00:00:00 2001 From: Philippe Chataignon Date: Tue, 15 Jan 2019 15:49:09 +0100 Subject: [PATCH 31/62] Typo correction --- src/fwrite.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/fwrite.c b/src/fwrite.c index eb08560ca1..d940b88aec 100644 --- a/src/fwrite.c +++ b/src/fwrite.c @@ -681,7 +681,7 @@ void fwriteMain(fwriteMainArgs args) if (maxHeaderLen >= buffLimit) { STOP("Error : header line is greater than buffer limit. Try to increase buffMB option. 
Example 'buffMB = %d'\n", - 2 * maxHeaderLen / 1024 / 1024 + 1; + 2 * maxHeaderLen / 1024 / 1024 + 1); } int maxLineLen = 0; From 49ee2232acd48d796eee4fda1eb25ecb1880262e Mon Sep 17 00:00:00 2001 From: Philippe Chataignon Date: Tue, 15 Jan 2019 16:58:40 +0100 Subject: [PATCH 32/62] Alloc zbuff only if args.gzip --- src/fwrite.c | 55 +++++++++++++++++++++++++++++++++------------------- 1 file changed, 35 insertions(+), 20 deletions(-) diff --git a/src/fwrite.c b/src/fwrite.c index d940b88aec..6f8edee105 100644 --- a/src/fwrite.c +++ b/src/fwrite.c @@ -609,12 +609,11 @@ void fwriteMain(fwriteMainArgs args) qmethodEscape = args.qmethodEscape; squashDateTime = args.squashDateTime; - // Estimate max line length of a 1000 row sample (100 rows in 10 places). - // 'Estimate' even of this sample because quote='auto' may add quotes and escape embedded quotes. - // Buffers will be resized later if there are too many line lengths outside the sample, anyway. - // maxLineLen is required to determine a reasonable rowsPerBatch. + int eolLen = strlen(args.eol); + if (eolLen<=0) STOP("eol must be 1 or more bytes (usually either \\n or \\r\\n) but is length %d", eolLen); + // alloc one buffMB here. Keep rewriting each field to it, to sum up the size. Restriction: one field can't be // greater that minimumum buffMB (1MB = 1 million characters). Otherwise unbounded overwrite. Possible with very // very long single strings, or very long list column values. @@ -625,11 +624,6 @@ void fwriteMain(fwriteMainArgs args) // The default buffMB is 8MB, so it's really 8 million character limit by default. 1MB is because user might set // buffMB to 1, say if they have 512 CPUs or more, perhaps. - // Cold section as only 1,000 rows. Speed not an issue issue here. - // Overestimating line length is ok. 
- int eolLen = strlen(args.eol); - if (eolLen<=0) STOP("eol must be 1 or more bytes (usually either \\n or \\r\\n) but is length %d", eolLen); - if (args.buffMB<1 || args.buffMB>1024) STOP("buffMB=%d outside [1,1024]", args.buffMB); size_t buffSize = (size_t)1024*1024*args.buffMB; size_t buffLimit = (size_t) 9 * buffSize / 10; // set buffer limit for thread = 90% @@ -638,14 +632,19 @@ void fwriteMain(fwriteMainArgs args) char *buff = malloc(buffSize); if (!buff) STOP("Unable to allocate %d MiB for buffer: %s", buffSize / 1024 / 1024, strerror(errno)); - size_t zbuffSize = buffSize + buffSize/10 + 16; - Bytef *zbuff = malloc(zbuffSize); - if (!zbuff) - STOP("Unable to allocate %d MiB for zbuffer: %s", zbuffSize / 1024 / 1024, strerror(errno)); + + size_t zbuffSize = 0; uLongf zbuffUsed = 0; + Bytef *zbuff; + + if (args.is_gzip) { + size_t zbuffSize = buffSize + buffSize/10 + 16; + Bytef *zbuff = malloc(zbuffSize); + if (!zbuff) + STOP("Unable to allocate %d MiB for zbuffer: %s", zbuffSize / 1024 / 1024, strerror(errno)); + } if(args.verbose) { - DTPRINT("\nBuff size: %d zbuff size: %d\n", buffSize, zbuffSize); DTPRINT("Column writers: "); if (args.ncol<=50) { for (int j=0; j Date: Tue, 15 Jan 2019 22:04:32 +0100 Subject: [PATCH 33/62] Test if string/factor is NA is line header length detection --- src/fwrite.c | 37 +++++++++++++++++++++++++------------ 1 file changed, 25 insertions(+), 12 deletions(-) diff --git a/src/fwrite.c b/src/fwrite.c index 6f8edee105..dc8f50a037 100644 --- a/src/fwrite.c +++ b/src/fwrite.c @@ -691,8 +691,11 @@ void fwriteMain(fwriteMainArgs args) int maxLineLen = 0; // Cold section as only 1,000 rows. Speed not an issue issue here. // Overestimating line length is ok. 
+ if (args.verbose) { + DTPRINT("\nargs.doRowNames=%d args.rowNames=%d doQuote=%d args.nrow=%d args.ncol=%d\n", + args.doRowNames, args.rowNames, doQuote, args.nrow, args.ncol); + } for (int64_t i = 0; i < args.nrow; i += args.nrow / 1000 + 1) { - //write_string(getString(col, row), pch); int thisLineLen=0; if (args.doRowNames) { if (args.rowNames) { @@ -704,19 +707,29 @@ void fwriteMain(fwriteMainArgs args) } for (int j=0; j 1 million bytes writeList(args.columns[j], i, &ch); - thisLineLen += (int)(ch-buff) + 1/*sep*/; + thisLineLen += (int)(ch-buff) + 1; } } thisLineLen += eolLen; From 5b93d91b5780a133785c9a1ae5aa6f602b0e7dbc Mon Sep 17 00:00:00 2001 From: Philippe Chataignon Date: Tue, 15 Jan 2019 22:22:35 +0100 Subject: [PATCH 34/62] Correct zbuff allocation --- src/fwrite.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/fwrite.c b/src/fwrite.c index dc8f50a037..e207026781 100644 --- a/src/fwrite.c +++ b/src/fwrite.c @@ -638,8 +638,8 @@ void fwriteMain(fwriteMainArgs args) Bytef *zbuff; if (args.is_gzip) { - size_t zbuffSize = buffSize + buffSize/10 + 16; - Bytef *zbuff = malloc(zbuffSize); + zbuffSize = buffSize + buffSize/10 + 16; + zbuff = malloc(zbuffSize); if (!zbuff) STOP("Unable to allocate %d MiB for zbuffer: %s", zbuffSize / 1024 / 1024, strerror(errno)); } @@ -872,8 +872,8 @@ void fwriteMain(fwriteMainArgs args) size_t myzbuffSize = 0; Bytef *myzBuff; if(args.is_gzip){ - size_t myzbuffSize = buffSize + buffSize/10 + 16; - Bytef *myzBuff = malloc(myzbuffSize); + myzbuffSize = buffSize + buffSize/10 + 16; + myzBuff = malloc(myzbuffSize); if (myzBuff==NULL) { failed=-errno; } From 7ccc3e693a8b7dbd94a00960f10fbb6610dc951b Mon Sep 17 00:00:00 2001 From: Philippe Chataignon Date: Tue, 15 Jan 2019 22:43:25 +0100 Subject: [PATCH 35/62] Add missing is_gzip test --- src/fwrite.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/src/fwrite.c b/src/fwrite.c index e207026781..ee7bb7f677 100644 --- 
a/src/fwrite.c +++ b/src/fwrite.c @@ -609,8 +609,6 @@ void fwriteMain(fwriteMainArgs args) qmethodEscape = args.qmethodEscape; squashDateTime = args.squashDateTime; - - int eolLen = strlen(args.eol); if (eolLen<=0) STOP("eol must be 1 or more bytes (usually either \\n or \\r\\n) but is length %d", eolLen); @@ -692,8 +690,8 @@ void fwriteMain(fwriteMainArgs args) // Cold section as only 1,000 rows. Speed not an issue issue here. // Overestimating line length is ok. if (args.verbose) { - DTPRINT("\nargs.doRowNames=%d args.rowNames=%d doQuote=%d args.nrow=%d args.ncol=%d\n", - args.doRowNames, args.rowNames, doQuote, args.nrow, args.ncol); + DTPRINT("\nargs.doRowNames=%d args.rowNames=%d doQuote=%d args.nrow=%d args.ncol=%di eolLen=%d\n", + args.doRowNames, args.rowNames, doQuote, args.nrow, args.ncol, eolLen); } for (int64_t i = 0; i < args.nrow; i += args.nrow / 1000 + 1) { int thisLineLen=0; @@ -871,6 +869,7 @@ void fwriteMain(fwriteMainArgs args) uLongf myzbuffUsed = 0; size_t myzbuffSize = 0; Bytef *myzBuff; + if(args.is_gzip){ myzbuffSize = buffSize + buffSize/10 + 16; myzBuff = malloc(myzbuffSize); @@ -982,7 +981,9 @@ void fwriteMain(fwriteMainArgs args) // all threads will call this free on their buffer, even if one or more threads had malloc // or realloc fail. If the initial malloc failed, free(NULL) is ok and does nothing. free(myBuff); - free(myzBuff); + if (args.is_gzip) { + free(myzBuff); + } } // Finished parallel region and can call R API safely now. From 47d2210feb07d38b3887c92bf169cf9a7db514ce Mon Sep 17 00:00:00 2001 From: Philippe Chataignon Date: Tue, 15 Jan 2019 22:57:45 +0100 Subject: [PATCH 36/62] Use directly maxLineLen ; don't divide by 2 Estimation is very optimistic. 
With this option, ~ 50% buffer use --- src/fwrite.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/fwrite.c b/src/fwrite.c index ee7bb7f677..8fbb518c08 100644 --- a/src/fwrite.c +++ b/src/fwrite.c @@ -690,7 +690,7 @@ void fwriteMain(fwriteMainArgs args) // Cold section as only 1,000 rows. Speed not an issue issue here. // Overestimating line length is ok. if (args.verbose) { - DTPRINT("\nargs.doRowNames=%d args.rowNames=%d doQuote=%d args.nrow=%d args.ncol=%di eolLen=%d\n", + DTPRINT("\nargs.doRowNames=%d args.rowNames=%d doQuote=%d args.nrow=%d args.ncol=%d eolLen=%d\n", args.doRowNames, args.rowNames, doQuote, args.nrow, args.ncol, eolLen); } for (int64_t i = 0; i < args.nrow; i += args.nrow / 1000 + 1) { @@ -839,7 +839,7 @@ void fwriteMain(fwriteMainArgs args) // Decide buffer size and rowsPerBatch for each thread // Once rowsPerBatch is decided it can't be changed - rowsPerBatch = buffSize / maxLineLen / 2; + rowsPerBatch = buffLimit / maxLineLen; if (rowsPerBatch > args.nrow) rowsPerBatch = args.nrow; if (rowsPerBatch < 1) rowsPerBatch = 1; int numBatches = (args.nrow-1)/rowsPerBatch + 1; From 01f3db8506ded399420fb829b6fd7d3527aaa54d Mon Sep 17 00:00:00 2001 From: Philippe Chataignon Date: Wed, 16 Jan 2019 08:54:03 +0100 Subject: [PATCH 37/62] Initialize buffer pointer to NULL MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Avoid warning "‘zbuff’ may be used uninitialized in this function" --- src/fwrite.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/fwrite.c b/src/fwrite.c index 8fbb518c08..9e5a5abb89 100644 --- a/src/fwrite.c +++ b/src/fwrite.c @@ -633,7 +633,7 @@ void fwriteMain(fwriteMainArgs args) size_t zbuffSize = 0; uLongf zbuffUsed = 0; - Bytef *zbuff; + Bytef *zbuff = NULL; if (args.is_gzip) { zbuffSize = buffSize + buffSize/10 + 16; @@ -868,7 +868,7 @@ void fwriteMain(fwriteMainArgs args) uLongf myzbuffUsed = 0; size_t myzbuffSize = 0; - Bytef *myzBuff; 
+ Bytef *myzBuff = NULL; if(args.is_gzip){ myzbuffSize = buffSize + buffSize/10 + 16; From e2a97ffb1c43f006fbd157a33e4181e3d9f7be14 Mon Sep 17 00:00:00 2001 From: Philippe Chataignon Date: Wed, 16 Jan 2019 16:54:49 +0100 Subject: [PATCH 38/62] Add news entry for gzip support in fwrite --- NEWS.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/NEWS.md b/NEWS.md index e87c7ff113..dc7a42b7f7 100644 --- a/NEWS.md +++ b/NEWS.md @@ -4,6 +4,8 @@ #### NEW FEATURES +1. `fwrite` can now write directly gzipped csv file if `compress` option is set to `gzip` or if file ends with `".gz"` and `compress="auto". Compression, like `fwrite`, is multithreaded + #### BUG FIXES #### NOTES From c24c154e51028431c7110528ce6f879ef3e88562 Mon Sep 17 00:00:00 2001 From: Philippe Chataignon Date: Wed, 16 Jan 2019 21:44:49 +0100 Subject: [PATCH 39/62] Move enum WF in fwrite.h --- src/fwrite.h | 17 +++++++++++++++++ src/fwriteR.c | 17 ----------------- 2 files changed, 17 insertions(+), 17 deletions(-) diff --git a/src/fwrite.h b/src/fwrite.h index e98cd994a9..412c1bad20 100644 --- a/src/fwrite.h +++ b/src/fwrite.h @@ -26,6 +26,23 @@ void writeList(); void write_chars(const char *source, char **dest); +typedef enum { // same order as fun[] above + WF_Bool8, + WF_Bool32, + WF_Bool32AsString, + WF_Int32, + WF_Int64, + WF_Float64, + WF_ITime, + WF_DateInt32, + WF_DateFloat64, + WF_POSIXct, + WF_Nanotime, + WF_String, + WF_CategString, + WF_List +} WFs; + typedef struct fwriteMainArgs { // Name of the file to open (a \0-terminated C string). 
If the file name diff --git a/src/fwriteR.c b/src/fwriteR.c index dcea9fffdc..9e8e691d54 100644 --- a/src/fwriteR.c +++ b/src/fwriteR.c @@ -42,23 +42,6 @@ writer_fun_t funs[] = { &writeList }; -typedef enum { // same order as fun[] above - WF_Bool8, - WF_Bool32, - WF_Bool32AsString, - WF_Int32, - WF_Int64, - WF_Float64, - WF_ITime, - WF_DateInt32, - WF_DateFloat64, - WF_POSIXct, - WF_Nanotime, - WF_String, - WF_CategString, - WF_List -} WFs; - static int32_t whichWriter(SEXP); void writeList(SEXP *col, int64_t row, char **pch) { From cd8768b36edf12fba003e979bff8ee4702c312e1 Mon Sep 17 00:00:00 2001 From: Philippe Chataignon Date: Wed, 16 Jan 2019 21:46:11 +0100 Subject: [PATCH 40/62] Replace integer by WF enum --- src/fwrite.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/fwrite.c b/src/fwrite.c index 9e5a5abb89..f549e66f17 100644 --- a/src/fwrite.c +++ b/src/fwrite.c @@ -708,7 +708,7 @@ void fwriteMain(fwriteMainArgs args) int num_fun = args.whichFun[j]; if (writer_len[num_fun]) { thisLineLen += writer_len[num_fun] + 2 * (doQuote != 0) + 1; /* 1 for sep */ - } else if (num_fun == 11) { // if String + } else if (num_fun == WF_String) { // if String const char* ch = getString(args.columns[j], i); if (ch == NULL) { thisLineLen += strlen(na); @@ -716,7 +716,7 @@ void fwriteMain(fwriteMainArgs args) thisLineLen += strlen(ch); } thisLineLen += 2 * (doQuote!=0) + 1; - } else if (num_fun == 12) { // if Factor + } else if (num_fun == WF_CategString) { // if Factor const char* ch = getCategString(args.columns[j], i); if (ch == NULL) { thisLineLen += strlen(na); @@ -724,7 +724,7 @@ void fwriteMain(fwriteMainArgs args) thisLineLen += strlen(ch); } thisLineLen += 2 * (doQuote!=0) + 1; - } else if (num_fun == 13) { // if List + } else if (num_fun == WF_List) { // if List char *ch = buff; // overwrite at the beginning of buff to be more robust > 1 million bytes writeList(args.columns[j], i, &ch); thisLineLen += (int)(ch-buff) + 1; From 
33ad79416ecf0a97555318bb2cf48519ae284e1d Mon Sep 17 00:00:00 2001 From: Philippe Chataignon Date: Thu, 17 Jan 2019 09:40:06 +0100 Subject: [PATCH 41/62] Test size before writing in thread --- src/fwrite.c | 46 +++++++++++++++++++++++++++------------------- 1 file changed, 27 insertions(+), 19 deletions(-) diff --git a/src/fwrite.c b/src/fwrite.c index f549e66f17..73579fa07b 100644 --- a/src/fwrite.c +++ b/src/fwrite.c @@ -546,9 +546,6 @@ int writer_len[] = { 0, //&writeList }; -static int failed = 0; -static int rowsPerBatch; - int compressbuff(Bytef* dest, uLongf* destLen, const Bytef* source, uLong sourceLen) { int level = Z_DEFAULT_COMPRESSION; @@ -590,6 +587,9 @@ int compressbuff(Bytef* dest, uLongf* destLen, const Bytef* source, uLong source return err == Z_STREAM_END ? Z_OK : err; } +static int failed = 0; +static int rowsPerBatch; + void fwriteMain(fwriteMainArgs args) { double startTime = wallclock(); @@ -710,19 +710,11 @@ void fwriteMain(fwriteMainArgs args) thisLineLen += writer_len[num_fun] + 2 * (doQuote != 0) + 1; /* 1 for sep */ } else if (num_fun == WF_String) { // if String const char* ch = getString(args.columns[j], i); - if (ch == NULL) { - thisLineLen += strlen(na); - } else { - thisLineLen += strlen(ch); - } + thisLineLen += (ch == NULL) ? strlen(na) : strlen(ch); thisLineLen += 2 * (doQuote!=0) + 1; } else if (num_fun == WF_CategString) { // if Factor const char* ch = getCategString(args.columns[j], i); - if (ch == NULL) { - thisLineLen += strlen(na); - } else { - thisLineLen += strlen(ch); - } + thisLineLen += (ch == NULL) ? 
strlen(na) : strlen(ch); thisLineLen += 2 * (doQuote!=0) + 1; } else if (num_fun == WF_List) { // if List char *ch = buff; // overwrite at the beginning of buff to be more robust > 1 million bytes @@ -909,12 +901,28 @@ void fwriteMain(fwriteMainArgs args) } // Hot loop for (int j=0; j= buffLimit ) { - failed = -1; - break; // stop writing + int size = 0; + int num_fun = args.whichFun[j]; + if (writer_len[num_fun]) { + size = writer_len[num_fun]; + } else if (num_fun == WF_String) { // if String + const char* ch = getString(args.columns[j], i); + size = (ch == NULL) ? strlen(na) : strlen(ch); + } else if (num_fun == WF_CategString) { // if Factor + const char* ch = getCategString(args.columns[j], i); + size = (ch == NULL) ? strlen(na) : strlen(ch); + } + if (size >= buffLimit) { + failed = -1; + break; + } else { + (args.funs[args.whichFun[j]])(args.columns[j], i, &ch); + *ch++ = sep; + // Test if buffer to low + if ( (int)(ch - myBuff) >= buffLimit ) { + failed = -1; + break; // stop writing + } } } // Tepid again (once at the end of each line) From 739ac7e06c98fb1570cecd78eb7c75212e23b56f Mon Sep 17 00:00:00 2001 From: Philippe Chataignon Date: Sun, 20 Jan 2019 23:07:49 +0100 Subject: [PATCH 42/62] Replace strlen by strnlen Try to pass test quality code on github --- src/fwrite.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/src/fwrite.c b/src/fwrite.c index 73579fa07b..9ed7732ba1 100644 --- a/src/fwrite.c +++ b/src/fwrite.c @@ -6,7 +6,7 @@ #include // INT32_MIN #include // isfinite, isnan #include // abs -#include // strlen, strerror +#include // strnlen, strerror #ifdef WIN32 #include @@ -609,8 +609,6 @@ void fwriteMain(fwriteMainArgs args) qmethodEscape = args.qmethodEscape; squashDateTime = args.squashDateTime; - int eolLen = strlen(args.eol); - if (eolLen<=0) STOP("eol must be 1 or more bytes (usually either \\n or \\r\\n) but is length %d", eolLen); // alloc one buffMB here. 
Keep rewriting each field to it, to sum up the size. Restriction: one field can't be // greater that minimumum buffMB (1MB = 1 million characters). Otherwise unbounded overwrite. Possible with very @@ -654,6 +652,9 @@ void fwriteMain(fwriteMainArgs args) DTPRINT("\n"); } + int eolLen = strnlen(args.eol, buffLimit); + if (eolLen<=0) STOP("eol must be 1 or more bytes (usually either \\n or \\r\\n) but is length %d", eolLen); + int maxHeaderLen = 0; if (args.colNames) { // We don't know how long this line will be. @@ -669,7 +670,7 @@ void fwriteMain(fwriteMainArgs args) if (j>0) { maxHeaderLen += 1; /* for sep */ } - maxHeaderLen += strlen(getString(args.colNames, j)); + maxHeaderLen += strnlen(getString(args.colNames, j), buffLimit); } maxHeaderLen += eolLen; /*eol*/ } @@ -697,7 +698,7 @@ void fwriteMain(fwriteMainArgs args) int thisLineLen=0; if (args.doRowNames) { if (args.rowNames) { - thisLineLen += (int)(strlen(getString(args.rowNames, i))); + thisLineLen += (int)strnlen(getString(args.rowNames, i), buffLimit); } else { thisLineLen += 1+(int)log10(args.nrow); // the width of the row number } @@ -710,11 +711,11 @@ void fwriteMain(fwriteMainArgs args) thisLineLen += writer_len[num_fun] + 2 * (doQuote != 0) + 1; /* 1 for sep */ } else if (num_fun == WF_String) { // if String const char* ch = getString(args.columns[j], i); - thisLineLen += (ch == NULL) ? strlen(na) : strlen(ch); + thisLineLen += (ch == NULL) ? strnlen(na, buffLimit) : strnlen(ch, buffLimit); thisLineLen += 2 * (doQuote!=0) + 1; } else if (num_fun == WF_CategString) { // if Factor const char* ch = getCategString(args.columns[j], i); - thisLineLen += (ch == NULL) ? strlen(na) : strlen(ch); + thisLineLen += (ch == NULL) ? 
strnlen(na, buffLimit) : strnlen(ch, buffLimit); thisLineLen += 2 * (doQuote!=0) + 1; } else if (num_fun == WF_List) { // if List char *ch = buff; // overwrite at the beginning of buff to be more robust > 1 million bytes @@ -907,10 +908,10 @@ void fwriteMain(fwriteMainArgs args) size = writer_len[num_fun]; } else if (num_fun == WF_String) { // if String const char* ch = getString(args.columns[j], i); - size = (ch == NULL) ? strlen(na) : strlen(ch); + size = (ch == NULL) ? strnlen(na, buffLimit) : strnlen(ch, buffLimit); } else if (num_fun == WF_CategString) { // if Factor const char* ch = getCategString(args.columns[j], i); - size = (ch == NULL) ? strlen(na) : strlen(ch); + size = (ch == NULL) ? strnlen(na, buffLimit) : strnlen(ch, buffLimit); } if (size >= buffLimit) { failed = -1; From f7f37e4b5f91584c924b524f216a7e40d3bc6f09 Mon Sep 17 00:00:00 2001 From: Philippe Chataignon Date: Thu, 24 Jan 2019 22:35:23 +0100 Subject: [PATCH 43/62] Use void* and size_t instead of Bytef* and uLongf --- src/fwrite.c | 30 +++++++++++++++++------------- 1 file changed, 17 insertions(+), 13 deletions(-) diff --git a/src/fwrite.c b/src/fwrite.c index 9ed7732ba1..2f5398ab4f 100644 --- a/src/fwrite.c +++ b/src/fwrite.c @@ -546,7 +546,7 @@ int writer_len[] = { 0, //&writeList }; -int compressbuff(Bytef* dest, uLongf* destLen, const Bytef* source, uLong sourceLen) +int compressbuff(void* dest, size_t *destLen, const void* source, size_t sourceLen) { int level = Z_DEFAULT_COMPRESSION; z_stream stream; @@ -630,8 +630,8 @@ void fwriteMain(fwriteMainArgs args) STOP("Unable to allocate %d MiB for buffer: %s", buffSize / 1024 / 1024, strerror(errno)); size_t zbuffSize = 0; - uLongf zbuffUsed = 0; - Bytef *zbuff = NULL; + size_t zbuffUsed = 0; + void *zbuff = NULL; if (args.is_gzip) { zbuffSize = buffSize + buffSize/10 + 16; @@ -788,9 +788,9 @@ void fwriteMain(fwriteMainArgs args) // compress buff into zbuff if(args.is_gzip){ zbuffUsed = zbuffSize; - int ret = compressbuff(zbuff, &zbuffUsed, 
(Bytef*)buff, (int)(ch - buff)); + int ret = compressbuff(zbuff, &zbuffUsed, buff, (int)(ch - buff)); if(ret) { - STOP("Compress error: %d", ret); + STOP("Compress gzip error: %d", ret); } } @@ -798,9 +798,11 @@ void fwriteMain(fwriteMainArgs args) if (f==-1) { *ch = '\0'; DTPRINT(buff); - } else if (!args.is_gzip && WRITE(f, buff, (int)(ch-buff)) == -1) { - errwrite=errno; // capture write errno now incase close fails with a different errno - } else if (args.is_gzip && WRITE(f, zbuff, (int)zbuffUsed) == -1) { + } else if ((args.is_gzip)) { + if (WRITE(f, zbuff, (int)zbuffUsed) == -1) { + errwrite=errno; // capture write errno now incase close fails with a different errno + } + } else if (WRITE(f, buff, (int)(ch-buff)) == -1) { errwrite=errno; } @@ -859,9 +861,9 @@ void fwriteMain(fwriteMainArgs args) failed=-errno; } - uLongf myzbuffUsed = 0; + size_t myzbuffUsed = 0; size_t myzbuffSize = 0; - Bytef *myzBuff = NULL; + void *myzBuff = NULL; if(args.is_gzip){ myzbuffSize = buffSize + buffSize/10 + 16; @@ -935,7 +937,7 @@ void fwriteMain(fwriteMainArgs args) // compress buffer if gzip if (args.is_gzip) { myzbuffUsed = myzbuffSize; - failed = compressbuff(myzBuff, &myzbuffUsed, (Bytef*)myBuff, (int)(ch - myBuff)); + failed = compressbuff(myzBuff, &myzbuffUsed, myBuff, (int)(ch - myBuff)); } #pragma omp ordered { @@ -943,9 +945,11 @@ void fwriteMain(fwriteMainArgs args) if (f==-1) { *ch='\0'; // standard C string end marker so DTPRINT knows where to stop DTPRINT(myBuff); - } else if (!args.is_gzip && WRITE(f, myBuff, (int)(ch - myBuff)) == -1) { + } else if ((args.is_gzip)) { + if (WRITE(f, myzBuff, (int)(myzbuffUsed)) == -1) { failed=errno; - } else if (args.is_gzip && WRITE(f, myzBuff, (int)(myzbuffUsed)) == -1) { + } + } else if (WRITE(f, myBuff, (int)(ch - myBuff)) == -1) { failed=errno; } From 46fd237542de204cb30b01b743cf7a19876ed920 Mon Sep 17 00:00:00 2001 From: mattdowle Date: Tue, 16 Apr 2019 16:21:30 -0700 Subject: [PATCH 44/62] attempt to pass on 
travis/appveyor: added -lz to PKG_LIBS --- src/Makevars | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Makevars b/src/Makevars index 5162e62756..e8e586004c 100644 --- a/src/Makevars +++ b/src/Makevars @@ -1,6 +1,6 @@ PKG_CFLAGS = $(SHLIB_OPENMP_CFLAGS) -PKG_LIBS = $(SHLIB_OPENMP_CFLAGS) +PKG_LIBS = $(SHLIB_OPENMP_CFLAGS) -lz all: $(SHLIB) mv $(SHLIB) datatable$(SHLIB_EXT) From 0e10df9904215569253936c1f78253d3a49f2722 Mon Sep 17 00:00:00 2001 From: mattdowle Date: Tue, 16 Apr 2019 16:35:55 -0700 Subject: [PATCH 45/62] fixed fwrite.Rd warning (treated as fail by CI) --- man/fwrite.Rd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/man/fwrite.Rd b/man/fwrite.Rd index 59519281a7..f98314e798 100644 --- a/man/fwrite.Rd +++ b/man/fwrite.Rd @@ -17,7 +17,7 @@ fwrite(x, file = "", append = FALSE, quote = "auto", dateTimeAs = c("ISO","squash","epoch","write.csv"), buffMB = 8L, nThread = getDTthreads(verbose), showProgress = getOption("datatable.showProgress", interactive()), - compress = c("default", "none", "gzip"), + compress = c("auto", "none", "gzip"), verbose = getOption("datatable.verbose", FALSE)) } \arguments{ From 031ebd0a6e8d836e43f1af1e14c11cc30c192758 Mon Sep 17 00:00:00 2001 From: mattdowle Date: Tue, 16 Apr 2019 17:04:52 -0700 Subject: [PATCH 46/62] news tidy --- DESCRIPTION | 1 + NEWS.md | 20 ++++++++++---------- 2 files changed, 11 insertions(+), 10 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 41be910570..d05f4ec762 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -43,6 +43,7 @@ Authors@R: c( Depends: R (>= 3.1.0) Imports: methods Suggests: bit64, curl, R.utils, knitr, xts, nanotime, zoo +SystemRequirements: zlib Description: Fast aggregation of large data (e.g. 100GB in RAM), fast ordered joins, fast add/modify/delete of columns by group using no copies at all, list columns, friendly and fast character-separated-value read/write. Offers a natural and flexible syntax, for faster development. 
License: MPL-2.0 | file LICENSE URL: http://r-datatable.com diff --git a/NEWS.md b/NEWS.md index 525d018f8d..cf96ca691b 100644 --- a/NEWS.md +++ b/NEWS.md @@ -4,17 +4,15 @@ #### NEW FEATURES -1. New option `options(datatable.quiet = TRUE)` turns off the package startup message, [#3489](https://github.com/Rdatatable/data.table/issues/3489). `suppressPackageStartupMessages()` continues to work too. Thanks to @leobarlach for the suggestion inspired by `options(tidyverse.quiet = TRUE)`. We don't know of a way to make a package respect the `quietly=` option of `library()` and `require()` because the `quietly=` isn't passed through for use by the package's own `.onAttach`. If you can see how to do that, please submit a patch to R. +1. `rleid()` functions now support long vectors (length > 2 billion). -2. `rleid()` functions now support long vectors (length > 2 billion). - -3. `fread()`: +2. `fread()`: * now skips embedded `NUL` (`\0`), [#3400](https://github.com/Rdatatable/data.table/issues/3400). Thanks to Marcus Davy for reporting with examples, and Roy Storey for the initial PR. - -4. `fwrite()`: - * can now write directly gzipped csv file if `compress` option is set to `gzip` or if file ends with `".gz"`. Compression, like `fwrite`, is multithreaded. Thanks to Philippe Chataignon for the PR. -5. Assigning to one item of a list column no longer requires the RHS to be wrapped with `list` or `.()`, [#950](https://github.com/Rdatatable/data.table/issues/950). +3. `fwrite()`: + * now writes compressed `.gz` files directly, [#2016](https://github.com/Rdatatable/data.table/issues/2016). Either set `compress="gzip"` or use a `.gz` filename extension. Compression, like `fwrite`, is multithreaded and compresses each chunk on-the-fly (a full size file is not created first). Many thanks to Philippe Chataignon for the PR. + +4. 
Assigning to one item of a list column no longer requires the RHS to be wrapped with `list` or `.()`, [#950](https://github.com/Rdatatable/data.table/issues/950). ```R > DT = data.table(A=1:3, B=list(1:2,"foo",3:5)) > DT @@ -36,9 +34,9 @@ 3: 3 3,4,5 ``` -6. `print.data.table()` gains an option to display the timezone of `POSIXct` columns when available, [#2842](https://github.com/Rdatatable/data.table/issues/2842). Thanks to Michael Chirico for reporting and Felipe Parages for the PR. +5. `print.data.table()` gains an option to display the timezone of `POSIXct` columns when available, [#2842](https://github.com/Rdatatable/data.table/issues/2842). Thanks to Michael Chirico for reporting and Felipe Parages for the PR. -7. New functions `nafill` and `setnafill`, [#854](https://github.com/Rdatatable/data.table/issues/854). Thanks to Matthieu Gomez for the request and Jan Gorecki for implementing. +6. New functions `nafill` and `setnafill`, [#854](https://github.com/Rdatatable/data.table/issues/854). Thanks to Matthieu Gomez for the request and Jan Gorecki for implementing. #### BUG FIXES @@ -61,6 +59,8 @@ 3. `setorder` on a superset of a keyed `data.table`'s key now retains its key, [#3456](https://github.com/Rdatatable/data.table/issues/3456). For example, if `a` is the key of `DT`, `setorder(DT, a, -v)` will leave `DT` keyed by `a`. +4. New option `options(datatable.quiet = TRUE)` turns off the package startup message, [#3489](https://github.com/Rdatatable/data.table/issues/3489). `suppressPackageStartupMessages()` continues to work too. Thanks to @leobarlach for the suggestion inspired by `options(tidyverse.quiet = TRUE)`. We don't know of a way to make a package respect the `quietly=` option of `library()` and `require()` because the `quietly=` isn't passed through for use by the package's own `.onAttach`. If you can see how to do that, please submit a patch to R. 
+ ### Changes in [v1.12.2](https://github.com/Rdatatable/data.table/milestone/14?closed=1) (07 Apr 2019) From c32249af7f3a998d6641fe38f310ece41747d2b5 Mon Sep 17 00:00:00 2001 From: mattdowle Date: Tue, 16 Apr 2019 17:36:55 -0700 Subject: [PATCH 47/62] nocov and 2-space indentation --- inst/tests/tests.Rraw | 3 +- src/fwrite.c | 215 +++++++++++++++++++++--------------------- 2 files changed, 112 insertions(+), 106 deletions(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 057e4462e7..462727fc60 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -10355,7 +10355,8 @@ DT = data.table( D = as.POSIXct(dt<-paste(d,t), tz="UTC"), E = as.POSIXct(paste0(dt,c(".999",".0",".5",".111112",".123456",".023",".0",".999999",".99",".0009")), tz="UTC")) -test(1740.1, fwrite(DT,dateTimeAs="iso"), error="dateTimeAs must be 'ISO','squash','epoch' or 'write.csv'") +test(1740.0, fwrite(DT,dateTimeAs="iso"), error="dateTimeAs must be 'ISO','squash','epoch' or 'write.csv'") +test(1740.1, fwrite(DT,dateTimeAs=c("ISO","squash")), error="dateTimeAs must be a single string") test(1740.2, capture.output(fwrite(DT,dateTimeAs="ISO")), c( "A,B,C,D,E", "1907-10-21,1907-10-21,23:59:59,1907-10-21T23:59:59Z,1907-10-21T23:59:59.999Z", diff --git a/src/fwrite.c b/src/fwrite.c index 2f5398ab4f..fe0582089e 100644 --- a/src/fwrite.c +++ b/src/fwrite.c @@ -248,19 +248,19 @@ void writeFloat64(double *col, int64_t row, char **pch) // 30460 => l=3046, sf=4, exp=4 dr=0; dl0=1; width=5 // 0.0072 => l=72, sf=2, exp=-3 dr=4; dl0=1; width=6 if (width <= sf + (sf>1) + 2 + (abs(exp)>99?3:2)) { - // ^^^^ to not include 1 char for dec in -7e-04 where sf==1 - // ^ 2 for 'e+'/'e-' - // decimal format ... 
- ch += width-1; - if (dr) { - while (dr && sf) { *ch--='0'+l%10; l/=10; dr--; sf--; } - while (dr) { *ch--='0'; dr--; } - *ch-- = dec; - } - while (dl0) { *ch--='0'; dl0--; } - while (sf) { *ch--='0'+l%10; l/=10; sf--; } - // ch is now 1 before the first char of the field so position it afterward again, and done - ch += width+1; + // ^^^^ to not include 1 char for dec in -7e-04 where sf==1 + // ^ 2 for 'e+'/'e-' + // decimal format ... + ch += width-1; + if (dr) { + while (dr && sf) { *ch--='0'+l%10; l/=10; dr--; sf--; } + while (dr) { *ch--='0'; dr--; } + *ch-- = dec; + } + while (dl0) { *ch--='0'; dl0--; } + while (sf) { *ch--='0'+l%10; l/=10; sf--; } + // ch is now 1 before the first char of the field so position it afterward again, and done + ch += width+1; } else { // scientific ... ch += sf; // sf-1 + 1 for dec @@ -548,43 +548,43 @@ int writer_len[] = { int compressbuff(void* dest, size_t *destLen, const void* source, size_t sourceLen) { - int level = Z_DEFAULT_COMPRESSION; - z_stream stream; - int err; - const uInt max = (uInt)-1; - uLong left; - - left = *destLen; - *destLen = 0; - - stream.zalloc = (alloc_func)0; - stream.zfree = (free_func)0; - stream.opaque = (voidpf)0; - - err = deflateInit2(&stream, level, Z_DEFLATED, 31, 8, Z_DEFAULT_STRATEGY); - if (err != Z_OK) - return err; - - stream.next_out = dest; - stream.avail_out = 0; - stream.next_in = (z_const Bytef *)source; - stream.avail_in = 0; - - do { - if (stream.avail_out == 0) { - stream.avail_out = left > (uLong)max ? max : (uInt)left; - left -= stream.avail_out; - } - if (stream.avail_in == 0) { - stream.avail_in = sourceLen > (uLong)max ? max : (uInt)sourceLen; - sourceLen -= stream.avail_in; - } - err = deflate(&stream, sourceLen ? 
Z_NO_FLUSH : Z_FINISH); - } while (err == Z_OK); + int level = Z_DEFAULT_COMPRESSION; + z_stream stream; + int err; + const uInt max = (uInt)-1; + uLong left; + + left = *destLen; + *destLen = 0; + + stream.zalloc = (alloc_func)0; + stream.zfree = (free_func)0; + stream.opaque = (voidpf)0; + + err = deflateInit2(&stream, level, Z_DEFLATED, 31, 8, Z_DEFAULT_STRATEGY); + if (err != Z_OK) + return err; // # nocov + + stream.next_out = dest; + stream.avail_out = 0; + stream.next_in = (z_const Bytef *)source; + stream.avail_in = 0; + + do { + if (stream.avail_out == 0) { + stream.avail_out = left > (uLong)max ? max : (uInt)left; + left -= stream.avail_out; + } + if (stream.avail_in == 0) { + stream.avail_in = sourceLen > (uLong)max ? max : (uInt)sourceLen; + sourceLen -= stream.avail_in; + } + err = deflate(&stream, sourceLen ? Z_NO_FLUSH : Z_FINISH); + } while (err == Z_OK); - *destLen = stream.total_out; - deflateEnd(&stream); - return err == Z_STREAM_END ? Z_OK : err; + *destLen = stream.total_out; + deflateEnd(&stream); + return err == Z_STREAM_END ? Z_OK : err; } static int failed = 0; @@ -627,7 +627,7 @@ void fwriteMain(fwriteMainArgs args) char *buff = malloc(buffSize); if (!buff) - STOP("Unable to allocate %d MiB for buffer: %s", buffSize / 1024 / 1024, strerror(errno)); + STOP("Unable to allocate %d MiB for buffer: %s", buffSize / 1024 / 1024, strerror(errno)); // # nocov size_t zbuffSize = 0; size_t zbuffUsed = 0; @@ -637,7 +637,7 @@ void fwriteMain(fwriteMainArgs args) zbuffSize = buffSize + buffSize/10 + 16; zbuff = malloc(zbuffSize); if (!zbuff) - STOP("Unable to allocate %d MiB for zbuffer: %s", zbuffSize / 1024 / 1024, strerror(errno)); + STOP("Unable to allocate %d MiB for zbuffer: %s", zbuffSize / 1024 / 1024, strerror(errno)); // # nocov } if(args.verbose) { @@ -678,8 +678,7 @@ void fwriteMain(fwriteMainArgs args) DTPRINT("maxHeaderLen=%d\n", maxHeaderLen); if (maxHeaderLen >= buffLimit) { - STOP("Error : header line is greater than buffer limit. 
Try to increase buffMB option. Example 'buffMB = %d'\n", - 2 * maxHeaderLen / 1024 / 1024 + 1); + STOP("Error : header line is greater than buffer limit. Try to increase buffMB option. Example 'buffMB = %d'\n", 2*maxHeaderLen/1024/1024 + 1); // # nocov } // Estimate max line length of a 1000 row sample (100 rows in 10 places). @@ -695,42 +694,40 @@ void fwriteMain(fwriteMainArgs args) args.doRowNames, args.rowNames, doQuote, args.nrow, args.ncol, eolLen); } for (int64_t i = 0; i < args.nrow; i += args.nrow / 1000 + 1) { - int thisLineLen=0; - if (args.doRowNames) { - if (args.rowNames) { - thisLineLen += (int)strnlen(getString(args.rowNames, i), buffLimit); - } else { - thisLineLen += 1+(int)log10(args.nrow); // the width of the row number - } - thisLineLen += 2*(doQuote!=0/*NA('auto') or true*/) + 1/*sep*/; + int thisLineLen=0; + if (args.doRowNames) { + if (args.rowNames) { + thisLineLen += (int)strnlen(getString(args.rowNames, i), buffLimit); + } else { + thisLineLen += 1+(int)log10(args.nrow); // the width of the row number } + thisLineLen += 2*(doQuote!=0/*NA('auto') or true*/) + 1/*sep*/; + } - for (int j=0; j 1 million bytes - writeList(args.columns[j], i, &ch); - thisLineLen += (int)(ch-buff) + 1; - } - } - thisLineLen += eolLen; - if (thisLineLen > maxLineLen) - maxLineLen = thisLineLen; - // stop if buffer is too low - if (maxLineLen >= buffSecure) { - STOP("Error : max line length is greater than buffer secure limit. Try to increase buffMB option. Example 'buffMB = %d'\n", - 2 * maxLineLen / 1024 / 1024 + 1); + for (int j=0; j 1 million bytes + writeList(args.columns[j], i, &ch); + thisLineLen += (int)(ch-buff) + 1; } + } + thisLineLen += eolLen; + if (thisLineLen > maxLineLen) maxLineLen = thisLineLen; + // stop if buffer is too low + if (maxLineLen >= buffSecure) { + STOP("Error : max line length is greater than buffer secure limit. Try to increase buffMB option. 
Example 'buffMB = %d'\n", 2*maxLineLen/1024/1024 + 1); // # nocov + } } if (args.verbose) @@ -752,11 +749,13 @@ void fwriteMain(fwriteMainArgs args) // There is no binary/text mode distinction on Linux and Mac if (f == -1) { + // # nocov start int erropen = errno; STOP(access( args.filename, F_OK ) != -1 ? "%s: '%s'. Failed to open existing file for writing. Do you have write permission to it? Is this Windows and does another process such as Excel have it open?" : "%s: '%s'. Unable to create new file for writing (it does not exist already). Do you have permission to write here, is there space on the disk and does the path exist?", strerror(erropen), args.filename); + // # nocov end } } @@ -790,7 +789,7 @@ void fwriteMain(fwriteMainArgs args) zbuffUsed = zbuffSize; int ret = compressbuff(zbuff, &zbuffUsed, buff, (int)(ch - buff)); if(ret) { - STOP("Compress gzip error: %d", ret); + STOP("Compress gzip error: %d", ret); // # nocov } } @@ -800,19 +799,21 @@ void fwriteMain(fwriteMainArgs args) DTPRINT(buff); } else if ((args.is_gzip)) { if (WRITE(f, zbuff, (int)zbuffUsed) == -1) { - errwrite=errno; // capture write errno now incase close fails with a different errno + errwrite=errno; // # nocov ; capture write errno now incase close fails with a different errno } } else if (WRITE(f, buff, (int)(ch-buff)) == -1) { - errwrite=errno; + errwrite=errno; // # nocov } if (errwrite) { + // # nocov start CLOSE(f); free(buff); if(args.is_gzip){ free(zbuff); } STOP("%s: '%s'", strerror(errwrite), args.filename); + // # nocov end } } @@ -825,9 +826,9 @@ void fwriteMain(fwriteMainArgs args) DTPRINT("done in %.3fs\n", 1.0*(wallclock()-t0)); if (args.nrow == 0) { if (args.verbose) - DTPRINT("No data rows present (nrow==0)\n"); + DTPRINT("No data rows present (nrow==0)\n"); // # nocov if (f!=-1 && CLOSE(f)) - STOP("%s: '%s'", strerror(errno), args.filename); + STOP("%s: '%s'", strerror(errno), args.filename); // # nocov return; } @@ -858,7 +859,7 @@ void fwriteMain(fwriteMainArgs 
args) ch = myBuff = malloc(buffSize); // each thread has its own buffer. malloc and errno are thread-safe. if (myBuff==NULL) { - failed=-errno; + failed=-errno; // # nocov } size_t myzbuffUsed = 0; @@ -869,7 +870,7 @@ void fwriteMain(fwriteMainArgs args) myzbuffSize = buffSize + buffSize/10 + 16; myzBuff = malloc(myzbuffSize); if (myzBuff==NULL) { - failed=-errno; + failed=-errno; // # nocov } } // Do not rely on availability of '#omp cancel' new in OpenMP v4.0 (July 2013). @@ -887,7 +888,7 @@ void fwriteMain(fwriteMainArgs args) #pragma omp for ordered schedule(dynamic) for(int64_t start=0; start= buffLimit) { - failed = -1; - break; + failed = -1; // # nocov + break; // # nocov } else { (args.funs[args.whichFun[j]])(args.columns[j], i, &ch); *ch++ = sep; // Test if buffer to low if ( (int)(ch - myBuff) >= buffLimit ) { - failed = -1; - break; // stop writing + failed = -1; // # nocov + break; // # nocov ; stop writing } } } @@ -932,7 +933,7 @@ void fwriteMain(fwriteMainArgs args) ch--; // backup onto the last sep after the last column. ncol>=1 because 0-columns was caught earlier. 
write_chars(args.eol, &ch); // overwrite last sep with eol instead if (failed) - break; // this thread stop writing rows; fall through to clear up and STOP() below + break; // # nocov ; this thread stop writing rows; fall through to clear up and STOP() below } // compress buffer if gzip if (args.is_gzip) { @@ -947,10 +948,10 @@ void fwriteMain(fwriteMainArgs args) DTPRINT(myBuff); } else if ((args.is_gzip)) { if (WRITE(f, myzBuff, (int)(myzbuffUsed)) == -1) { - failed=errno; + failed=errno; // # nocov } } else if (WRITE(f, myBuff, (int)(ch - myBuff)) == -1) { - failed=errno; + failed=errno; // # nocov } int used = 100*((double)(ch-myBuff))/buffSize; // percentage of original buffMB @@ -961,6 +962,7 @@ void fwriteMain(fwriteMainArgs args) // Not only is this ordered section one-at-a-time but we'll also Rprintf() here only from the // master thread (me==0) and hopefully this will work on Windows. If not, user should set // showProgress=FALSE until this can be fixed or removed. + // # nocov start int ETA = (int)((args.nrow-end)*((now-startTime)/end)); if (hasPrinted || ETA >= 2) { if (args.verbose && !hasPrinted) DTPRINT("\n"); @@ -972,6 +974,7 @@ void fwriteMain(fwriteMainArgs args) nextTime = now+1; hasPrinted = true; } + // # nocov end } // May be possible for master thread (me==0) to call R_CheckUserInterrupt() here. // Something like: @@ -995,12 +998,13 @@ void fwriteMain(fwriteMainArgs args) // or realloc fail. If the initial malloc failed, free(NULL) is ok and does nothing. free(myBuff); if (args.is_gzip) { - free(myzBuff); + free(myzBuff); } } // Finished parallel region and can call R API safely now. 
if (hasPrinted) { + // # nocov start if (!failed) { // clear the progress meter DTPRINT("\r " @@ -1009,18 +1013,19 @@ void fwriteMain(fwriteMainArgs args) // unless failed as we'd like to see anyBufferGrown and maxBuffUsedPC DTPRINT("\n"); } + // # nocov end } if (f!=-1 && CLOSE(f) && !failed) - STOP("%s: '%s'", strerror(errno), args.filename); + STOP("%s: '%s'", strerror(errno), args.filename); // # nocov // quoted '%s' in case of trailing spaces in the filename // If a write failed, the line above tries close() to clean up, but that might fail as well. So the // '&& !failed' is to not report the error as just 'closing file' but the next line for more detail // from the original error. if (failed<0) { - STOP("Error : one or more threads failed to malloc or buffer was too small. Try to increase buffMB option. Example 'buffMB = %d'\n", 2 * args.buffMB); + STOP("Error : one or more threads failed to malloc or buffer was too small. Try to increase buffMB option. Example 'buffMB = %d'\n", 2 * args.buffMB); // # nocov } else if (failed>0) { - STOP("%s: '%s'", strerror(failed), args.filename); + STOP("%s: '%s'", strerror(failed), args.filename); // # nocov } return; } From 1189568acca8398584ad385826f6eb73f6d408eb Mon Sep 17 00:00:00 2001 From: mattdowle Date: Wed, 17 Apr 2019 15:48:42 -0700 Subject: [PATCH 48/62] removed not-used writer_len item in fwriteMainArgs --- src/fwrite.h | 1 - 1 file changed, 1 deletion(-) diff --git a/src/fwrite.h b/src/fwrite.h index 412c1bad20..7cf92052a3 100644 --- a/src/fwrite.h +++ b/src/fwrite.h @@ -54,7 +54,6 @@ typedef struct fwriteMainArgs // a vector of pointers to all-same-length column vectors void **columns; writer_fun_t *funs; // a vector of writer_fun_t function pointers - int *writer_len; // length ncol vector containing which fun[] to use for each column // one byte to use 8 times less cache lines than a vector of function pointers would do From 25ffdb09e13b9b86c38110536482f2e611db862b Mon Sep 17 00:00:00 2001 From: mattdowle 
Date: Wed, 17 Apr 2019 16:22:46 -0700 Subject: [PATCH 49/62] extra 1 width for sign of negative ints --- src/fwrite.c | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/src/fwrite.c b/src/fwrite.c index fe0582089e..249a0d504b 100644 --- a/src/fwrite.c +++ b/src/fwrite.c @@ -529,21 +529,21 @@ void writeCategString(void *col, int64_t row, char **pch) write_string(getCategString(col, row), pch); } -int writer_len[] = { - 5, //&writeBool8, - 5, //&writeBool32, - 5, //&writeBool32AsString, - 9, //&writeInt32, - 19, //&writeInt64, - 24, //&writeFloat64, - 32, //&writeITime, - 16, //&writeDateInt32, - 16, //&writeDateFloat64, - 32, //&writePOSIXct, - 48, //&writeNanotime, - 0, //&writeString, - 0, //&writeCategString, - 0, //&writeList +int writer_len[] = { // max field length for calculating max line length + 5, //&writeBool8 "false" + 5, //&writeBool32 + 5, //&writeBool32AsString + 11, //&writeInt32 "-2147483647" + 20, //&writeInt64 "-9223372036854775807" + 24, //&writeFloat64 "-3.141592653589793115998" with max options(digits=22) + 32, //&writeITime + 16, //&writeDateInt32 + 16, //&writeDateFloat64 + 32, //&writePOSIXct + 48, //&writeNanotime + 0, //&writeString + 0, //&writeCategString + 0, //&writeList }; int compressbuff(void* dest, size_t *destLen, const void* source, size_t sourceLen) From 3c097aecb77b3630d889c4cc4ae8bdbba745eedf Mon Sep 17 00:00:00 2001 From: mattdowle Date: Wed, 17 Apr 2019 18:03:07 -0700 Subject: [PATCH 50/62] declare variables close to first usage, and added comment as to why uInt type used --- src/fwrite.c | 17 +++++------------ 1 file changed, 5 insertions(+), 12 deletions(-) diff --git a/src/fwrite.c b/src/fwrite.c index 249a0d504b..f5f9a0c822 100644 --- a/src/fwrite.c +++ b/src/fwrite.c @@ -548,20 +548,12 @@ int writer_len[] = { // max field length for calculating max line length int compressbuff(void* dest, size_t *destLen, const void* source, size_t sourceLen) { - int level = 
Z_DEFAULT_COMPRESSION; z_stream stream; - int err; - const uInt max = (uInt)-1; - uLong left; - - left = *destLen; - *destLen = 0; - stream.zalloc = (alloc_func)0; stream.zfree = (free_func)0; stream.opaque = (voidpf)0; - err = deflateInit2(&stream, level, Z_DEFLATED, 31, 8, Z_DEFAULT_STRATEGY); + int err = deflateInit2(&stream, Z_DEFAULT_COMPRESSION, Z_DEFLATED, 31, 8, Z_DEFAULT_STRATEGY); if (err != Z_OK) return err; // # nocov @@ -569,14 +561,15 @@ int compressbuff(void* dest, size_t *destLen, const void* source, size_t sourceL stream.avail_out = 0; stream.next_in = (z_const Bytef *)source; stream.avail_in = 0; - + size_t left = *destLen; + const uInt uInt_max = (uInt)-1; // stream.avail_out is type uInt do { if (stream.avail_out == 0) { - stream.avail_out = left > (uLong)max ? max : (uInt)left; + stream.avail_out = left>uInt_max ? uInt_max : left; left -= stream.avail_out; } if (stream.avail_in == 0) { - stream.avail_in = sourceLen > (uLong)max ? max : (uInt)sourceLen; + stream.avail_in = sourceLen>uInt_max ? uInt_max : sourceLen; sourceLen -= stream.avail_in; } err = deflate(&stream, sourceLen ? Z_NO_FLUSH : Z_FINISH); From c2efc8e456ee7437a7d8ae296d7e5f9d8a011781 Mon Sep 17 00:00:00 2001 From: mattdowle Date: Thu, 18 Apr 2019 16:53:50 -0700 Subject: [PATCH 51/62] reduce diff; free(NULL) is no-op ok (removed if) --- src/fwrite.c | 33 ++++++++++----------------------- 1 file changed, 10 insertions(+), 23 deletions(-) diff --git a/src/fwrite.c b/src/fwrite.c index f5f9a0c822..cbfe8b2ca2 100644 --- a/src/fwrite.c +++ b/src/fwrite.c @@ -738,8 +738,8 @@ void fwriteMain(fwriteMainArgs args) // to convert \n to \r\n on Windows when in text mode not not when in binary mode. #else f = open(args.filename, O_WRONLY | O_CREAT | (args.append ? 
O_APPEND : O_TRUNC), 0666); -#endif // There is no binary/text mode distinction on Linux and Mac +#endif if (f == -1) { // # nocov start @@ -764,15 +764,11 @@ void fwriteMain(fwriteMainArgs args) char *ch = buff; if (args.doRowNames) { // Unusual: the extra blank column name when row_names are added as the first column - if (doQuote) { - *ch++='"'; *ch++='"'; - } + if (doQuote!=0/*'auto'(NA) or true*/) { *ch++='"'; *ch++='"'; } // to match write.csv *ch++ = sep; } for (int j=0; j0) { - *ch++ = sep; - } + if (j>0) *ch++ = sep; writeString(args.colNames, j, &ch); } write_chars(args.eol, &ch); @@ -802,26 +798,19 @@ void fwriteMain(fwriteMainArgs args) // # nocov start CLOSE(f); free(buff); - if(args.is_gzip){ - free(zbuff); - } + free(zbuff); STOP("%s: '%s'", strerror(errwrite), args.filename); // # nocov end } } free(buff); // TODO: also to be free'd in cleanup when there's an error opening file above - if(args.is_gzip){ - free(zbuff); - } + free(zbuff); - if (args.verbose) - DTPRINT("done in %.3fs\n", 1.0*(wallclock()-t0)); + if (args.verbose) DTPRINT("done in %.3fs\n", 1.0*(wallclock()-t0)); if (args.nrow == 0) { - if (args.verbose) - DTPRINT("No data rows present (nrow==0)\n"); // # nocov - if (f!=-1 && CLOSE(f)) - STOP("%s: '%s'", strerror(errno), args.filename); // # nocov + if (args.verbose) DTPRINT("No data rows present (nrow==0)\n"); + if (f!=-1 && CLOSE(f)) STOP("%s: '%s'", strerror(errno), args.filename); return; } @@ -880,8 +869,7 @@ void fwriteMain(fwriteMainArgs args) #pragma omp for ordered schedule(dynamic) for(int64_t start=0; start=1 because 0-columns was caught earlier. 
write_chars(args.eol, &ch); // overwrite last sep with eol instead - if (failed) - break; // # nocov ; this thread stop writing rows; fall through to clear up and STOP() below + if (failed) break; // this thread stop writing rows; fall through to clear up and STOP() below } // compress buffer if gzip if (args.is_gzip) { From 01833e2cfd00657d832b4613554dbfff0a162eef Mon Sep 17 00:00:00 2001 From: mattdowle Date: Thu, 18 Apr 2019 17:13:57 -0700 Subject: [PATCH 52/62] more diff reduction --- src/fwrite.c | 26 ++++++++------------------ 1 file changed, 8 insertions(+), 18 deletions(-) diff --git a/src/fwrite.c b/src/fwrite.c index cbfe8b2ca2..373572435f 100644 --- a/src/fwrite.c +++ b/src/fwrite.c @@ -619,8 +619,7 @@ void fwriteMain(fwriteMainArgs args) size_t buffSecure = (size_t) 5 * buffSize / 10; // maxLineLen in initial sample must be under this value char *buff = malloc(buffSize); - if (!buff) - STOP("Unable to allocate %d MiB for buffer: %s", buffSize / 1024 / 1024, strerror(errno)); // # nocov + if (!buff) STOP("Unable to allocate %d MiB for buffer: %s", buffSize / 1024 / 1024, strerror(errno)); size_t zbuffSize = 0; size_t zbuffUsed = 0; @@ -629,11 +628,10 @@ void fwriteMain(fwriteMainArgs args) if (args.is_gzip) { zbuffSize = buffSize + buffSize/10 + 16; zbuff = malloc(zbuffSize); - if (!zbuff) - STOP("Unable to allocate %d MiB for zbuffer: %s", zbuffSize / 1024 / 1024, strerror(errno)); // # nocov + if (!zbuff) STOP("Unable to allocate %d MiB for zbuffer: %s", zbuffSize / 1024 / 1024, strerror(errno)); } - if(args.verbose) { + if (args.verbose) { DTPRINT("Column writers: "); if (args.ncol<=50) { for (int j=0; j Date: Thu, 18 Apr 2019 17:57:44 -0700 Subject: [PATCH 53/62] free's were missing on STOP('compress gzip error'). Moved zbuffUsed declaration next to usage. 
--- src/fwrite.c | 43 ++++++++++++++++++------------------------- 1 file changed, 18 insertions(+), 25 deletions(-) diff --git a/src/fwrite.c b/src/fwrite.c index 373572435f..c4aafafb74 100644 --- a/src/fwrite.c +++ b/src/fwrite.c @@ -622,7 +622,6 @@ void fwriteMain(fwriteMainArgs args) if (!buff) STOP("Unable to allocate %d MiB for buffer: %s", buffSize / 1024 / 1024, strerror(errno)); size_t zbuffSize = 0; - size_t zbuffUsed = 0; void *zbuff = NULL; if (args.is_gzip) { @@ -770,34 +769,28 @@ void fwriteMain(fwriteMainArgs args) } write_chars(args.eol, &ch); - // compress buff into zbuff - if(args.is_gzip){ - zbuffUsed = zbuffSize; - int ret = compressbuff(zbuff, &zbuffUsed, buff, (int)(ch - buff)); - if(ret) { - STOP("Compress gzip error: %d", ret); // # nocov - } - } - - int errwrite = 0; if (f==-1) { *ch = '\0'; DTPRINT(buff); - } else if ((args.is_gzip)) { - if (WRITE(f, zbuff, (int)zbuffUsed) == -1) { - errwrite=errno; // # nocov ; capture write errno now incase close fails with a different errno + } else { + int ret1=0, ret2=0; + if (args.is_gzip) { + size_t zbuffUsed = zbuffSize; + ret1 = compressbuff(zbuff, &zbuffUsed, buff, (int)(ch - buff)); + ret2 = WRITE(f, zbuff, (int)zbuffUsed); + } else { + ret2 = WRITE(f, buff, (int)(ch-buff)); + } + if (ret1 || ret2==-1) { + // # nocov start + int errwrite = errno; // capture write errno now incase close fails with a different errno + CLOSE(f); + free(buff); + free(zbuff); + if (ret1) STOP("Compress gzip error: %d", ret1); + else STOP("%s: '%s'", strerror(errwrite), args.filename); + // # nocov end } - } else if (WRITE(f, buff, (int)(ch-buff)) == -1) { - errwrite=errno; // # nocov - } - - if (errwrite) { - // # nocov start - CLOSE(f); - free(buff); - free(zbuff); - STOP("%s: '%s'", strerror(errwrite), args.filename); - // # nocov end } } From 42711f2c2465c53286c4e7d21028bf83e8737213 Mon Sep 17 00:00:00 2001 From: mattdowle Date: Fri, 19 Apr 2019 23:29:46 -0700 Subject: [PATCH 54/62] interm: no sample for 
maxLineLen, reorganized --- src/fwrite.c | 166 ++++++++++++++------------------------------------ src/fwriteR.c | 4 ++ 2 files changed, 49 insertions(+), 121 deletions(-) diff --git a/src/fwrite.c b/src/fwrite.c index c4aafafb74..3864613e67 100644 --- a/src/fwrite.c +++ b/src/fwrite.c @@ -6,7 +6,7 @@ #include // INT32_MIN #include // isfinite, isnan #include // abs -#include // strnlen, strerror +#include // strlen, strerror #ifdef WIN32 #include @@ -36,6 +36,7 @@ static bool qmethodEscape=false; // when quoting fields, how to escape dou static bool squashDateTime=false; // 0=ISO(yyyy-mm-dd) 1=squash(yyyymmdd) extern const char *getString(void *, int); +extern const int getStringLen(void *, int); extern const char *getCategString(void *, int); extern double wallclock(void); @@ -587,7 +588,6 @@ void fwriteMain(fwriteMainArgs args) { double startTime = wallclock(); double nextTime = startTime+2; // start printing progress meter in 2 sec if not completed by then - double t0 = startTime; na = args.na; sep = args.sep; @@ -602,33 +602,11 @@ void fwriteMain(fwriteMainArgs args) qmethodEscape = args.qmethodEscape; squashDateTime = args.squashDateTime; - - // alloc one buffMB here. Keep rewriting each field to it, to sum up the size. Restriction: one field can't be - // greater that minimumum buffMB (1MB = 1 million characters). Otherwise unbounded overwrite. Possible with very - // very long single strings, or very long list column values. - // The caller guarantees no field with be longer than this. If so, it can set buffMB larger. It might know - // due to some stats it has maintained on each column or in the environment generally. - // However, a single field being longer than 1 million characters is considered a very reasonable restriction. - // Once we have a good line length estimate, we may increase the buffer size a lot anyway. - // The default buffMB is 8MB, so it's really 8 million character limit by default. 
1MB is because user might set - // buffMB to 1, say if they have 512 CPUs or more, perhaps. - if (args.buffMB<1 || args.buffMB>1024) STOP("buffMB=%d outside [1,1024]", args.buffMB); size_t buffSize = (size_t)1024*1024*args.buffMB; - size_t buffLimit = (size_t) 9 * buffSize / 10; // set buffer limit for thread = 90% - size_t buffSecure = (size_t) 5 * buffSize / 10; // maxLineLen in initial sample must be under this value - - char *buff = malloc(buffSize); - if (!buff) STOP("Unable to allocate %d MiB for buffer: %s", buffSize / 1024 / 1024, strerror(errno)); - size_t zbuffSize = 0; - void *zbuff = NULL; - - if (args.is_gzip) { - zbuffSize = buffSize + buffSize/10 + 16; - zbuff = malloc(zbuffSize); - if (!zbuff) STOP("Unable to allocate %d MiB for zbuffer: %s", zbuffSize / 1024 / 1024, strerror(errno)); - } + int eolLen=strlen(args.eol), naLen=strlen(args.na); + if (eolLen<=0) STOP("eol must be 1 or more bytes (usually either \\n or \\r\\n) but is length %d", eolLen); if (args.verbose) { DTPRINT("Column writers: "); @@ -639,88 +617,9 @@ void fwriteMain(fwriteMainArgs args) DTPRINT("... "); for (int j=args.ncol-10; j0) { - maxHeaderLen += 1; /* for sep */ - } - maxHeaderLen += strnlen(getString(args.colNames, j), buffLimit); - } - maxHeaderLen += eolLen; /*eol*/ - } - if (args.verbose) - DTPRINT("maxHeaderLen=%d\n", maxHeaderLen); - - if (maxHeaderLen >= buffLimit) { - STOP("Error : header line is greater than buffer limit. Try to increase buffMB option. Example 'buffMB = %d'\n", 2*maxHeaderLen/1024/1024 + 1); // # nocov - } - - // Estimate max line length of a 1000 row sample (100 rows in 10 places). - // 'Estimate' even of this sample because quote='auto' may add quotes and escape embedded quotes. - // Buffers will be resized later if there are too many line lengths outside the sample, anyway. - // maxLineLen is required to determine a reasonable rowsPerBatch. - - int maxLineLen = 0; - // Cold section as only 1,000 rows. Speed not an issue issue here. 
- // Overestimating line length is ok. - if (args.verbose) { DTPRINT("\nargs.doRowNames=%d args.rowNames=%d doQuote=%d args.nrow=%d args.ncol=%d eolLen=%d\n", args.doRowNames, args.rowNames, doQuote, args.nrow, args.ncol, eolLen); } - for (int64_t i = 0; i < args.nrow; i += args.nrow / 1000 + 1) { - int thisLineLen=0; - if (args.doRowNames) { - if (args.rowNames) { - thisLineLen += (int)strnlen(getString(args.rowNames, i), buffLimit); - } else { - thisLineLen += 1+(int)log10(args.nrow); // the width of the row number - } - thisLineLen += 2*(doQuote!=0/*NA('auto') or true*/) + 1/*sep*/; - } - - for (int j=0; j 1 million bytes - writeList(args.columns[j], i, &ch); - thisLineLen += (int)(ch-buff) + 1; - } - } - thisLineLen += eolLen; - if (thisLineLen > maxLineLen) maxLineLen = thisLineLen; - // stop if buffer is too low - if (maxLineLen >= buffSecure) { - STOP("Error : max line length is greater than buffer secure limit. Try to increase buffMB option. Example 'buffMB = %d'\n", 2*maxLineLen/1024/1024 + 1); // # nocov - } - } - - if (args.verbose) DTPRINT("maxLineLen=%d from sample. Found in %.3fs\n", maxLineLen, 1.0*(wallclock()-t0)); int f=0; if (*args.filename=='\0') { @@ -736,7 +635,6 @@ void fwriteMain(fwriteMainArgs args) f = open(args.filename, O_WRONLY | O_CREAT | (args.append ? O_APPEND : O_TRUNC), 0666); // There is no binary/text mode distinction on Linux and Mac #endif - if (f == -1) { // # nocov start int erropen = errno; @@ -748,15 +646,17 @@ void fwriteMain(fwriteMainArgs args) } } - t0=wallclock(); - + double t0 = wallclock(); if (args.verbose) { DTPRINT("Writing column names ... 
"); if (f==-1) DTPRINT("\n"); } if (args.colNames) { - // We have tested this line length and verify that buffer was big enough - // So no more buffer tests + size_t headerLen = 0; + for (int j=0; j> column name) + char *buff = malloc(headerLen); + if (!buff) STOP("Unable to allocate %d MiB for header: %s", headerLen / 1024 / 1024, strerror(errno)); char *ch = buff; if (args.doRowNames) { // Unusual: the extra blank column name when row_names are added as the first column @@ -764,39 +664,39 @@ void fwriteMain(fwriteMainArgs args) *ch++ = sep; } for (int j=0; j0) *ch++ = sep; writeString(args.colNames, j, &ch); + *ch++ = sep; } + ch--; // backup over the last sep write_chars(args.eol, &ch); - if (f==-1) { *ch = '\0'; DTPRINT(buff); + free(buff); } else { int ret1=0, ret2=0; if (args.is_gzip) { + size_t zbuffSize = headerLen + headerLen/10 + 16; + char *zbuff = malloc(zbuffSize); + if (!zbuff) {free(buff); STOP("Unable to allocate %d MiB for zbuffer: %s", zbuffSize / 1024 / 1024, strerror(errno));} size_t zbuffUsed = zbuffSize; - ret1 = compressbuff(zbuff, &zbuffUsed, buff, (int)(ch - buff)); - ret2 = WRITE(f, zbuff, (int)zbuffUsed); + ret1 = compressbuff(zbuff, &zbuffUsed, buff, (int)(ch-buff)); + if (ret1==0) ret2 = WRITE(f, zbuff, (int)zbuffUsed); + free(zbuff); } else { ret2 = WRITE(f, buff, (int)(ch-buff)); } + free(buff); if (ret1 || ret2==-1) { // # nocov start int errwrite = errno; // capture write errno now incase close fails with a different errno CLOSE(f); - free(buff); - free(zbuff); if (ret1) STOP("Compress gzip error: %d", ret1); else STOP("%s: '%s'", strerror(errwrite), args.filename); // # nocov end } } } - - free(buff); // TODO: also to be free'd in cleanup when there's an error opening file above - free(zbuff); - if (args.verbose) DTPRINT("done in %.3fs\n", 1.0*(wallclock()-t0)); if (args.nrow == 0) { if (args.verbose) DTPRINT("No data rows present (nrow==0)\n"); @@ -807,7 +707,31 @@ void fwriteMain(fwriteMainArgs args) // Decide buffer size and 
rowsPerBatch for each thread // Once rowsPerBatch is decided it can't be changed - rowsPerBatch = buffLimit / maxLineLen; + // Calculate upper bound for line length. Numbers use a fixed maximum (e.g. 12 for integer) while strings find the longest + // string in each column. Upper bound is then the sum of the columns' max widths. + // This upper bound is required to determine a reasonable rowsPerBatch. It also saves needing to grow the buffers which + // is especially tricky when compressing, and saves needing to check/limit the buffer writing because we know + // up front the buffer does have sufficient capacity. + // A large overestimate (e.g. 2-5x too big) is ok, provided it is not so large that the buffers can't be allocated. + + size_t maxLineLen = eolLen + args.ncol*(2*(doQuote!=0) + 1/*sep*/); + if (args.doRowNames) { + maxLineLen += args.rowNames ? getMaxStringLen(args.rowNames)*2 : 1+(int)log10(args.nrow); // the width of the row number + maxLineLen += 2*(doQuote!=0/*NA('auto') or true*/) + 1/*sep*/; + } + for (int j=0; jbuffsize) { buffSize=2*maxLineLen; rowsPerBatch=2; } + else rowsPerBatch = buffSize / maxLineLen; if (rowsPerBatch > args.nrow) rowsPerBatch = args.nrow; if (rowsPerBatch < 1) rowsPerBatch = 1; int numBatches = (args.nrow-1)/rowsPerBatch + 1; diff --git a/src/fwriteR.c b/src/fwriteR.c index 9e8e691d54..81fe345713 100644 --- a/src/fwriteR.c +++ b/src/fwriteR.c @@ -19,6 +19,10 @@ const char *getString(SEXP *col, int64_t row) { // TODO: inline for use in fwr return x==NA_STRING ? 
NULL : CHAR(x); } +const int getStringLen(SEXP *col, int64_t row) { + return LENGTH(x); // LENGTH of CHARSXP is nchar +} + const char *getCategString(SEXP col, int64_t row) { // the only writer that needs to have the header of the SEXP column, to get to the levels int x = INTEGER(col)[row]; From 034149b21a3838950b094e434d62a16450ac715c Mon Sep 17 00:00:00 2001 From: mattdowle Date: Mon, 22 Apr 2019 19:01:39 -0700 Subject: [PATCH 55/62] hot loop hot again; passes all tests --- src/fwrite.c | 72 +++++++++++++++++++-------------------------------- src/fwrite.h | 17 ++++++++++++ src/fwriteR.c | 59 ++++++++++++++++++++++++++++++++++++++++- 3 files changed, 102 insertions(+), 46 deletions(-) diff --git a/src/fwrite.c b/src/fwrite.c index 3864613e67..2f08a89256 100644 --- a/src/fwrite.c +++ b/src/fwrite.c @@ -37,6 +37,9 @@ static bool squashDateTime=false; // 0=ISO(yyyy-mm-dd) 1=squash(yyyymmdd) extern const char *getString(void *, int); extern const int getStringLen(void *, int); +extern const int getMaxStringLen(void *, int64_t); +extern const int getMaxCategLen(void *); +extern const int getMaxListItemLen(void *, int64_t); extern const char *getCategString(void *, int); extern double wallclock(void); @@ -530,23 +533,6 @@ void writeCategString(void *col, int64_t row, char **pch) write_string(getCategString(col, row), pch); } -int writer_len[] = { // max field length for calculating max line length - 5, //&writeBool8 "false" - 5, //&writeBool32 - 5, //&writeBool32AsString - 11, //&writeInt32 "-2147483647" - 20, //&writeInt64 "-9223372036854775807" - 24, //&writeFloat64 "-3.141592653589793115998" with max options(digits=22) - 32, //&writeITime - 16, //&writeDateInt32 - 16, //&writeDateFloat64 - 32, //&writePOSIXct - 48, //&writeNanotime - 0, //&writeString - 0, //&writeCategString - 0, //&writeList -}; - int compressbuff(void* dest, size_t *destLen, const void* source, size_t sourceLen) { z_stream stream; @@ -716,12 +702,26 @@ void fwriteMain(fwriteMainArgs args) size_t 
maxLineLen = eolLen + args.ncol*(2*(doQuote!=0) + 1/*sep*/); if (args.doRowNames) { - maxLineLen += args.rowNames ? getMaxStringLen(args.rowNames)*2 : 1+(int)log10(args.nrow); // the width of the row number + maxLineLen += args.rowNames ? getMaxStringLen(args.rowNames, args.nrow)*2 : 1+(int)log10(args.nrow); // the width of the row number maxLineLen += 2*(doQuote!=0/*NA('auto') or true*/) + 1/*sep*/; } for (int j=0; jbuffsize) { buffSize=2*maxLineLen; rowsPerBatch=2; } + if (maxLineLen*2>buffSize) { buffSize=2*maxLineLen; rowsPerBatch=2; } else rowsPerBatch = buffSize / maxLineLen; if (rowsPerBatch > args.nrow) rowsPerBatch = args.nrow; if (rowsPerBatch < 1) rowsPerBatch = 1; @@ -795,29 +795,11 @@ void fwriteMain(fwriteMainArgs args) } // Hot loop for (int j=0; j= buffLimit) { - failed = -1; // # nocov - break; // # nocov - } else { - (args.funs[args.whichFun[j]])(args.columns[j], i, &ch); - *ch++ = sep; - // Test if buffer to low - if ( (int)(ch - myBuff) >= buffLimit ) { - failed = -1; // # nocov - break; // # nocov ; stop writing - } - } + //printf("j=%d args.ncol=%d myBuff='%.*s' ch=%p\n", j, args.ncol, 20, myBuff, ch); + (args.funs[args.whichFun[j]])(args.columns[j], i, &ch); + //printf(" j=%d args.ncol=%d myBuff='%.*s' ch=%p\n", j, args.ncol, 20, myBuff, ch); + *ch++ = sep; + //printf(" j=%d args.ncol=%d myBuff='%.*s' ch=%p\n", j, args.ncol, 20, myBuff, ch); } // Tepid again (once at the end of each line) ch--; // backup onto the last sep after the last column. ncol>=1 because 0-columns was caught earlier. 
@@ -827,7 +809,7 @@ void fwriteMain(fwriteMainArgs args) // compress buffer if gzip if (args.is_gzip) { myzbuffUsed = myzbuffSize; - failed = compressbuff(myzBuff, &myzbuffUsed, myBuff, (int)(ch - myBuff)); + failed = compressbuff(myzBuff, &myzbuffUsed, myBuff, (int)(ch-myBuff)); } #pragma omp ordered { diff --git a/src/fwrite.h b/src/fwrite.h index 7cf92052a3..406cdf9d7d 100644 --- a/src/fwrite.h +++ b/src/fwrite.h @@ -43,6 +43,23 @@ typedef enum { // same order as fun[] above WF_List } WFs; +static const int writerMaxLen[] = { // same order as fun[] and WFs above; max field width used for calculating upper bound line length + 5, //&writeBool8 "false" + 5, //&writeBool32 "false" + 5, //&writeBool32AsString "false" + 11, //&writeInt32 "-2147483647" + 20, //&writeInt64 "-9223372036854775807" + 29, //&writeFloat64 "-3.141592653589793115998E-123" [max sf 22 consistent with options()$digits] + 32, //&writeITime + 16, //&writeDateInt32 + 16, //&writeDateFloat64 + 32, //&writePOSIXct + 48, //&writeNanotime + 0, //&writeString + 0, //&writeCategString + 0, //&writeList +}; + typedef struct fwriteMainArgs { // Name of the file to open (a \0-terminated C string). 
If the file name diff --git a/src/fwriteR.c b/src/fwriteR.c index 81fe345713..fc813e57db 100644 --- a/src/fwriteR.c +++ b/src/fwriteR.c @@ -20,7 +20,26 @@ const char *getString(SEXP *col, int64_t row) { // TODO: inline for use in fwr } const int getStringLen(SEXP *col, int64_t row) { - return LENGTH(x); // LENGTH of CHARSXP is nchar + return LENGTH(col[row]); // LENGTH of CHARSXP is nchar +} + +const int getMaxStringLen(const SEXP *col, const int64_t n) { + int max=0; + SEXP last=NULL; + for (int i=0; imax) max=thisnchar; + last = this; + } + return max; +} + +const int getMaxCategLen(SEXP col) { + col = getAttrib(col, R_LevelsSymbol); + if (!isString(col)) error("Internal error: col passed to getMaxCategLen is missing levels"); + return getMaxStringLen( STRING_PTR(col), LENGTH(col) ); } const char *getCategString(SEXP col, int64_t row) { @@ -68,6 +87,44 @@ void writeList(SEXP *col, int64_t row, char **pch) { *pch = ch; } +const int getMaxListItemLen(const SEXP *col, const int64_t n) { + int max=0; + SEXP last=NULL; + for (int i=0; i=1) width+=LENGTH(STRING_ELT(levels, *id-1)); + } break; + case WF_List: // error above should have happened, fall through to default + default: + STOP("Internal error: row %d of list column has no max length method implemented", i+1); + } + } else { + width = (length(this)+1) * width; // +1 for sep2 + } + if (width>max) max=width; + last = this; + } + return max; +} + static int32_t whichWriter(SEXP column) { // int32_t is returned here just so the caller can output nice context-full error message should INT32_MIN be returned // the caller then passes uint8_t to fwriteMain From 96d62305adfb4dd3fe8b1028f5510ad085216f7d Mon Sep 17 00:00:00 2001 From: mattdowle Date: Mon, 22 Apr 2019 19:15:48 -0700 Subject: [PATCH 56/62] revert unrelated indentation fix to reduce diff (will do post-merge) --- src/fwrite.c | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/src/fwrite.c b/src/fwrite.c index 
2f08a89256..a09c984848 100644 --- a/src/fwrite.c +++ b/src/fwrite.c @@ -252,19 +252,19 @@ void writeFloat64(double *col, int64_t row, char **pch) // 30460 => l=3046, sf=4, exp=4 dr=0; dl0=1; width=5 // 0.0072 => l=72, sf=2, exp=-3 dr=4; dl0=1; width=6 if (width <= sf + (sf>1) + 2 + (abs(exp)>99?3:2)) { - // ^^^^ to not include 1 char for dec in -7e-04 where sf==1 - // ^ 2 for 'e+'/'e-' - // decimal format ... - ch += width-1; - if (dr) { - while (dr && sf) { *ch--='0'+l%10; l/=10; dr--; sf--; } - while (dr) { *ch--='0'; dr--; } - *ch-- = dec; - } - while (dl0) { *ch--='0'; dl0--; } - while (sf) { *ch--='0'+l%10; l/=10; sf--; } - // ch is now 1 before the first char of the field so position it afterward again, and done - ch += width+1; + // ^^^^ to not include 1 char for dec in -7e-04 where sf==1 + // ^ 2 for 'e+'/'e-' + // decimal format ... + ch += width-1; + if (dr) { + while (dr && sf) { *ch--='0'+l%10; l/=10; dr--; sf--; } + while (dr) { *ch--='0'; dr--; } + *ch-- = dec; + } + while (dl0) { *ch--='0'; dl0--; } + while (sf) { *ch--='0'+l%10; l/=10; sf--; } + // ch is now 1 before the first char of the field so position it afterward again, and done + ch += width+1; } else { // scientific ... 
ch += sf; // sf-1 + 1 for dec From a593abd95d706538597afbb3eb943fb5faccece4 Mon Sep 17 00:00:00 2001 From: mattdowle Date: Mon, 22 Apr 2019 21:28:43 -0700 Subject: [PATCH 57/62] coverage --- inst/tests/tests.Rraw | 7 ++++ src/fwrite.c | 76 ++++++++++++++++++++++--------------------- src/fwriteR.c | 31 +++++------------- 3 files changed, 55 insertions(+), 59 deletions(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 462727fc60..6bd0acbb0f 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -9488,6 +9488,13 @@ if (.Platform$OS.type=="unix") { unlink(f) } +DT = data.table(a=1:3, b=list(1:4, c(3.14, 100e10), c("foo", "bar", "baz"))) +test(1658.40, fwrite(DT), output=c("a,b","1,1|2|3|4","2,3.14|1e+12","3,foo|bar|baz")) +DT[3,b:=c(3i,4i,5i)] +test(1658.41, fwrite(DT), error="Row 3 of list column is type 'complex'") +DT[3,b:=factor(letters[1:3])] +test(1658.42, fwrite(DT), error="Row 3 of list column is type 'factor'") + ## End fwrite tests # tests for #679, inrange(), FR #707 diff --git a/src/fwrite.c b/src/fwrite.c index a09c984848..4fb888c382 100644 --- a/src/fwrite.c +++ b/src/fwrite.c @@ -6,7 +6,7 @@ #include // INT32_MIN #include // isfinite, isnan #include // abs -#include // strlen, strerror +#include // strnlen (n for codacy), strerror #ifdef WIN32 #include @@ -591,7 +591,7 @@ void fwriteMain(fwriteMainArgs args) if (args.buffMB<1 || args.buffMB>1024) STOP("buffMB=%d outside [1,1024]", args.buffMB); size_t buffSize = (size_t)1024*1024*args.buffMB; - int eolLen=strlen(args.eol), naLen=strlen(args.na); + int eolLen=strnlen(args.eol, 1024), naLen=strnlen(args.na, 1024); // strnlen required by Codacy if (eolLen<=0) STOP("eol must be 1 or more bytes (usually either \\n or \\r\\n) but is length %d", eolLen); if (args.verbose) { @@ -607,6 +607,43 @@ void fwriteMain(fwriteMainArgs args) args.doRowNames, args.rowNames, doQuote, args.nrow, args.ncol, eolLen); } + // Calculate upper bound for line length. Numbers use a fixed maximum (e.g. 
12 for integer) while strings find the longest + // string in each column. Upper bound is then the sum of the columns' max widths. + // This upper bound is required to determine a reasonable rowsPerBatch. It also saves needing to grow the buffers which + // is especially tricky when compressing, and saves needing to check/limit the buffer writing because we know + // up front the buffer does have sufficient capacity. + // A large overestimate (e.g. 2-5x too big) is ok, provided it is not so large that the buffers can't be allocated. + // Do this first so that, for example, any unsupported types in list columns happen first before opening file (which + // could be console output) and writing column names to it. + + double t0 = wallclock(); + size_t maxLineLen = eolLen + args.ncol*(2*(doQuote!=0) + 1/*sep*/); + if (args.doRowNames) { + maxLineLen += args.rowNames ? getMaxStringLen(args.rowNames, args.nrow)*2 : 1+(int)log10(args.nrow); // the width of the row number + maxLineLen += 2*(doQuote!=0/*NA('auto') or true*/) + 1/*sep*/; + } + for (int j=0; j=1) width+=LENGTH(STRING_ELT(levels, *id-1)); - } break; - case WF_List: // error above should have happened, fall through to default - default: - STOP("Internal error: row %d of list column has no max length method implemented", i+1); - } + if (wf!=WF_String) STOP("Internal error: row %d of list column has no max length method implemented", i+1); // # nocov + SEXP *elem = STRING_PTR(this); + const int l = LENGTH(this); + for (int j=0; j Date: Tue, 23 Apr 2019 11:54:40 -0700 Subject: [PATCH 58/62] compress tests made cross-platform, and example added to news item --- NEWS.md | 9 ++++++++- inst/tests/tests.Rraw | 37 +++++++++++++------------------------ 2 files changed, 21 insertions(+), 25 deletions(-) diff --git a/NEWS.md b/NEWS.md index cf96ca691b..df61dfcab9 100644 --- a/NEWS.md +++ b/NEWS.md @@ -10,7 +10,14 @@ * now skips embedded `NUL` (`\0`), [#3400](https://github.com/Rdatatable/data.table/issues/3400). 
Thanks to Marcus Davy for reporting with examples, and Roy Storey for the initial PR. 3. `fwrite()`: - * now writes compressed `.gz` files directly, [#2016](https://github.com/Rdatatable/data.table/issues/2016). Either set `compress="gzip"` or use a `.gz` filename extension. Compression, like `fwrite`, is multithreaded and compresses each chunk on-the-fly (a full size file is not created first). Many thanks to Philippe Chataignon for the PR. + * now writes compressed `.gz` files directly, [#2016](https://github.com/Rdatatable/data.table/issues/2016). Compression, like `fwrite()`, is multithreaded and compresses each chunk on-the-fly (a full size intermediate file is not created). Use a ".gz" extension, or the new `compress=` option. Many thanks to Philippe Chataignon for the PR. For example: + + ```R + DT = data.table(A=rep(1:2,each=100), B=rep(1:4,each=25)) + fwrite(DT, "data.csv"); file.size("data.csv") # 804 + fwrite(DT, "data.csv.gz"); file.size("data.csv.gz") # 74 + identical(DT, fread("data.csv.gz")) + ``` 4. Assigning to one item of a list column no longer requires the RHS to be wrapped with `list` or `.()`, [#950](https://github.com/Rdatatable/data.table/issues/950). 
```R diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 6bd0acbb0f..8ad591f4b2 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -9467,33 +9467,22 @@ test(1658.34, fwrite(matrix(1:4, nrow=2, ncol=2), quote = TRUE), output = '"V1", test(1658.35, fwrite(matrix(1:3, nrow=3, ncol=1), quote = TRUE), output = '"V1"\n.*1\n2\n3', message = "x being coerced from class: matrix to data.table") test(1658.36, fwrite(matrix(1:4, nrow=2, ncol=2, dimnames = list(c("ra","rb"),c("ca","cb"))), quote = TRUE), output = '"ca","cb"\n.*1,3\n2,4', message = "x being coerced from class: matrix to data.table") -# fwrite output to console ignore compress -test(1658.37, fwrite(data.table(a=c(1:3), b=c(1:3)), compress="gzip"), - output='a,b\n1,1\n2,2\n3,3') - -# fwrite force gzipped output -if (.Platform$OS.type=="unix") { - f <- tempfile() - fwrite(data.table(a=c(1:3), b=c(1:3)), file=f, compress="gzip") - test(1658.38, system(paste("zcat", f), intern=T), output='[1] "a,b" "1,1" "2,2" "3,3"') - unlink(f) -} - - -# fwrite force csv output -if (.Platform$OS.type=="unix") { - f <- tempfile() - fwrite(data.table(a=c(1:3), b=c(1:3)), file=f, compress="none") - test(1658.39, system(paste("cat", f), intern=TRUE), output='[1] "a,b" "1,1" "2,2" "3,3"') - unlink(f) -} - +# fwrite compress +test(1658.37, fwrite(data.table(a=c(1:3), b=c(1:3)), compress="gzip"), output='a,b\n1,1\n2,2\n3,3') # compress ignored on console +DT = data.table(a=rep(1:2,each=100), b=rep(1:4,each=25)) +fwrite(DT, file=f1<-tempfile(fileext=".gz")) +fwrite(DT, file=f2<-tempfile()) +test(1658.38, file.size(f1) Date: Tue, 23 Apr 2019 11:58:27 -0700 Subject: [PATCH 59/62] simpler example in news item --- NEWS.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/NEWS.md b/NEWS.md index df61dfcab9..78f666824b 100644 --- a/NEWS.md +++ b/NEWS.md @@ -14,8 +14,8 @@ ```R DT = data.table(A=rep(1:2,each=100), B=rep(1:4,each=25)) - fwrite(DT, "data.csv"); file.size("data.csv") # 804 - fwrite(DT, 
"data.csv.gz"); file.size("data.csv.gz") # 74 + fwrite(DT, "data.csv") # 804 bytes + fwrite(DT, "data.csv.gz") # 74 bytes identical(DT, fread("data.csv.gz")) ``` From ee3e595edf0ef45773ecd3fc636f437bd9656d02 Mon Sep 17 00:00:00 2001 From: mattdowle Date: Tue, 23 Apr 2019 12:50:41 -0700 Subject: [PATCH 60/62] tidy --- src/fwrite.c | 21 +++++++-------------- 1 file changed, 7 insertions(+), 14 deletions(-) diff --git a/src/fwrite.c b/src/fwrite.c index 4fb888c382..ad6d2c8531 100644 --- a/src/fwrite.c +++ b/src/fwrite.c @@ -728,10 +728,6 @@ void fwriteMain(fwriteMainArgs args) // Decide buffer size and rowsPerBatch for each thread // Once rowsPerBatch is decided it can't be changed - - //size_t buffLimit = (size_t) 9 * buffSize / 10; // set buffer limit for thread = 90% - //size_t buffSecure = (size_t) 5 * buffSize / 10; // maxLineLen in initial sample must be under this value - if (maxLineLen*2>buffSize) { buffSize=2*maxLineLen; rowsPerBatch=2; } else rowsPerBatch = buffSize / maxLineLen; if (rowsPerBatch > args.nrow) rowsPerBatch = args.nrow; @@ -761,7 +757,7 @@ void fwriteMain(fwriteMainArgs args) size_t myzbuffSize = 0; void *myzBuff = NULL; - if(args.is_gzip){ + if(args.is_gzip && !failed){ myzbuffSize = buffSize + buffSize/10 + 16; myzBuff = malloc(myzbuffSize); if (myzBuff==NULL) failed=-errno; @@ -797,11 +793,8 @@ void fwriteMain(fwriteMainArgs args) } // Hot loop for (int j=0; j=1 because 0-columns was caught earlier. @@ -809,7 +802,7 @@ void fwriteMain(fwriteMainArgs args) if (failed) break; // this thread stop writing rows; fall through to clear up and STOP() below } // compress buffer if gzip - if (args.is_gzip) { + if (args.is_gzip && !failed) { myzbuffUsed = myzbuffSize; failed = compressbuff(myzBuff, &myzbuffUsed, myBuff, (int)(ch-myBuff)); } @@ -876,12 +869,10 @@ void fwriteMain(fwriteMainArgs args) // Finished parallel region and can call R API safely now. 
if (hasPrinted) { // # nocov start - if (!failed) { - // clear the progress meter + if (!failed) { // clear the progress meter DTPRINT("\r " " \r"); - } else { - // unless failed as we'd like to see anyBufferGrown and maxBuffUsedPC + } else { // don't clear any potentially helpful output before error DTPRINT("\n"); } // # nocov end @@ -894,9 +885,11 @@ void fwriteMain(fwriteMainArgs args) // '&& !failed' is to not report the error as just 'closing file' but the next line for more detail // from the original error. if (failed<0) { - STOP("Error : one or more threads failed to malloc or buffer was too small. Try to increase buffMB option. Example 'buffMB = %d'\n", 2 * args.buffMB); // # nocov + STOP("Error %d: one or more threads failed to allocate buffers or there was a compression error." + " Please try again with verbose=TRUE and try searching online for this error message.\n", failed); // # nocov } else if (failed>0) { STOP("%s: '%s'", strerror(failed), args.filename); // # nocov } return; } + From 989740c5876edeb89dfbb07bca44a3ccb516bd69 Mon Sep 17 00:00:00 2001 From: mattdowle Date: Tue, 23 Apr 2019 13:31:03 -0700 Subject: [PATCH 61/62] news item for bug fixes with links --- NEWS.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/NEWS.md b/NEWS.md index 78f666824b..75d23fddf4 100644 --- a/NEWS.md +++ b/NEWS.md @@ -10,7 +10,7 @@ * now skips embedded `NUL` (`\0`), [#3400](https://github.com/Rdatatable/data.table/issues/3400). Thanks to Marcus Davy for reporting with examples, and Roy Storey for the initial PR. 3. `fwrite()`: - * now writes compressed `.gz` files directly, [#2016](https://github.com/Rdatatable/data.table/issues/2016). Compression, like `fwrite()`, is multithreaded and compresses each chunk on-the-fly (a full size intermediate file is not created). Use a ".gz" extension, or the new `compress=` option. Many thanks to Philippe Chataignon for the PR. 
For example: + * now writes compressed `.gz` files directly, [#2016](https://github.com/Rdatatable/data.table/issues/2016). Compression, like `fwrite()`, is multithreaded and compresses each chunk on-the-fly (a full size intermediate file is not created). Use a ".gz" extension, or the new `compress=` option. Many thanks to Philippe Chataignon for the significant PR. For example: ```R DT = data.table(A=rep(1:2,each=100), B=rep(1:4,each=25)) @@ -53,6 +53,8 @@ 3. A missing item in `j` such as `j=.(colA, )` now gives a helpful error (`Item 2 of the .() or list() passed to j is missing`) rather than the unhelpful error `argument "this_jsub" is missing, with no default` (v1.12.2) or `argument 2 is empty` (v1.12.0 and before), [#3507](https://github.com/Rdatatable/data.table/issues/3507). Thanks to @eddelbuettel for the report. +4. `fwrite()` could crash when writing very long strings such as 30 million characters, [#2974](https://github.com/Rdatatable/data.table/issues/2974), and could be unstable in memory constrained environments, [#2612](https://github.com/Rdatatable/data.table/issues/2612). Thanks to @logworthy and @zachokeeffe for reporting and Philippe Chataignon for fixing in PR [#3288](https://github.com/Rdatatable/data.table/pull/3288). + #### NOTES 1. `rbindlist`'s `use.names="check"` now emits its message for automatic column names (`"V[0-9]+"`) too, [#3484](https://github.com/Rdatatable/data.table/pull/3484). See news item 5 of v1.12.2 below. From 7665a8db5c5227a1a7422f4652a924cec27bdf6e Mon Sep 17 00:00:00 2001 From: mattdowle Date: Tue, 23 Apr 2019 13:48:37 -0700 Subject: [PATCH 62/62] coverage --- src/fwrite.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/fwrite.c b/src/fwrite.c index ad6d2c8531..9156f69b1e 100644 --- a/src/fwrite.c +++ b/src/fwrite.c @@ -885,7 +885,7 @@ void fwriteMain(fwriteMainArgs args) // '&& !failed' is to not report the error as just 'closing file' but the next line for more detail // from the original error. 
if (failed<0) { - STOP("Error %d: one or more threads failed to allocate buffers or there was a compression error." + STOP("Error %d: one or more threads failed to allocate buffers or there was a compression error." // # nocov " Please try again with verbose=TRUE and try searching online for this error message.\n", failed); // # nocov } else if (failed>0) { STOP("%s: '%s'", strerror(failed), args.filename); // # nocov