From 93f6db2f13ad1e2489ab1866b3eac38531241115 Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger Date: Sat, 28 Aug 2021 16:54:01 +0200 Subject: [PATCH 01/30] fread: turn off sampling for fill --- R/fread.R | 10 +++++----- inst/tests/tests.Rraw | 3 +++ man/fread.Rd | 3 ++- src/fread.c | 13 +++++++++++-- src/fread.h | 3 +++ src/freadR.c | 4 +++- 6 files changed, 27 insertions(+), 9 deletions(-) diff --git a/R/fread.R b/R/fread.R index 12f46b57ea..2e25243223 100644 --- a/R/fread.R +++ b/R/fread.R @@ -2,7 +2,7 @@ fread = function( input="", file=NULL, text=NULL, cmd=NULL, sep="auto", sep2="auto", dec=".", quote="\"", nrows=Inf, header="auto", na.strings=getOption("datatable.na.strings","NA"), stringsAsFactors=FALSE, verbose=getOption("datatable.verbose",FALSE), skip="__auto__", select=NULL, drop=NULL, colClasses=NULL, integer64=getOption("datatable.integer64","integer64"), -col.names, check.names=FALSE, encoding="unknown", strip.white=TRUE, fill=FALSE, blank.lines.skip=FALSE, key=NULL, index=NULL, +col.names, check.names=FALSE, encoding="unknown", strip.white=TRUE, fill=FALSE, sample.fill=TRUE, blank.lines.skip=FALSE, key=NULL, index=NULL, showProgress=getOption("datatable.showProgress",interactive()), data.table=getOption("datatable.fread.datatable",TRUE), nThread=getDTthreads(verbose), logical01=getOption("datatable.logical01",FALSE), keepLeadingZeros=getOption("datatable.keepLeadingZeros",FALSE), yaml=FALSE, autostart=NA, tmpdir=tempdir(), tz="UTC") @@ -22,7 +22,7 @@ yaml=FALSE, autostart=NA, tmpdir=tempdir(), tz="UTC") stopf("Argument 'encoding' must be 'unknown', 'UTF-8' or 'Latin-1'.") } stopifnot( - isTRUEorFALSE(strip.white), isTRUEorFALSE(blank.lines.skip), isTRUEorFALSE(fill), isTRUEorFALSE(showProgress), + isTRUEorFALSE(strip.white), isTRUEorFALSE(blank.lines.skip), isTRUEorFALSE(fill), isTRUEorFALSE(sample.fill), isTRUEorFALSE(showProgress), isTRUEorFALSE(verbose), isTRUEorFALSE(check.names), isTRUEorFALSE(logical01), isTRUEorFALSE(keepLeadingZeros), isTRUEorFALSE(yaml), isTRUEorFALSE(stringsAsFactors) || (is.double(stringsAsFactors) && length(stringsAsFactors)==1L && 0.0<=stringsAsFactors && stringsAsFactors<=1.0), is.numeric(nrows), length(nrows)==1L @@ -79,7 +79,6 @@ yaml=FALSE, autostart=NA, tmpdir=tempdir(), tz="UTC") if (w<=2L) { # https: or ftps: if (!requireNamespace("curl", quietly = TRUE)) stopf("URL requires https:// connection for which fread() requires 'curl' package which cannot be found. Please install 'curl' using 'install.packages('curl')'.") # nocov - curl::curl_download(file, tmpFile, mode="wb", quiet = !showProgress) } else { method = if (w==5L) "internal" # force 'auto' when file: to ensure we don't use an invalid option (e.g. wget), #1668 @@ -146,6 +145,7 @@ yaml=FALSE, autostart=NA, tmpdir=tempdir(), tz="UTC") } # whitespace at the beginning or end of na.strings is checked at C level and is an error there; test 1804 } + if (sample.fill & !fill) ("sample.fill=TRUE cannot be used without fill=TRUE.") if (yaml) { if (!requireNamespace('yaml', quietly = TRUE)) stopf("'data.table' relies on the package 'yaml' to parse the file header; please add this to your library with install.packages('yaml') and try again.") # nocov @@ -262,8 +262,8 @@ yaml=FALSE, autostart=NA, tmpdir=tempdir(), tz="UTC") if (identical(tt,"") || is_utc(tt)) # empty TZ env variable ("") means UTC in C library, unlike R; _unset_ TZ means local tz="UTC" } - ans = .Call(CfreadR,input,identical(input,file),sep,dec,quote,header,nrows,skip,na.strings,strip.white,blank.lines.skip, - fill,showProgress,nThread,verbose,warnings2errors,logical01,select,drop,colClasses,integer64,encoding,keepLeadingZeros,tz=="UTC") + ans = .Call(CfreadR,input,identical(input,file),sep,dec,quote,header,nrows,skip,na.strings,strip.white,blank.lines.skip,fill, + sample.fill,showProgress,nThread,verbose,warnings2errors,logical01,select,drop,colClasses,integer64,encoding,keepLeadingZeros,tz=="UTC") if (!length(ans)) return(null.data.table()) # test 1743.308 drops all columns nr = length(ans[[1L]]) require_bit64_if_needed(ans) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 769703c7e5..23ccbd4976 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -18121,3 +18121,6 @@ if (base::getRversion() >= "4.1.0") { # precision powers of 10^(-n), #4461 test(2213, identical(fread(text="A\n0.8060667366\n")$A, 0.8060667366)) +# turning off sampling for detecting the number of columns #2691 #1812 #4130 #3436 #2727 +test(2214.1, fread(text = paste(rep(c("1,2\n", "1,2,3\n"), each=100), collapse=""), fill=TRUE, sample.fill=FALSE), data.table(1L, 2L, rep(c(NA,3L), each=100))) +test(2214.2, fread(text = paste(rep(c("1,2\n", "1,2,3\n"), each=100), collapse=""), fill=TRUE, sample.fill=FALSE, select = 1:2), data.table(1L, rep(2L, 200))) diff --git a/man/fread.Rd b/man/fread.Rd index c7b7da8566..871d3540bb 100644 --- a/man/fread.Rd +++ b/man/fread.Rd @@ -17,7 +17,7 @@ skip="__auto__", select=NULL, drop=NULL, colClasses=NULL, integer64=getOption("datatable.integer64", "integer64"), col.names, check.names=FALSE, encoding="unknown", -strip.white=TRUE, fill=FALSE, blank.lines.skip=FALSE, +strip.white=TRUE, fill=FALSE, sample.fill=TRUE, blank.lines.skip=FALSE, key=NULL, index=NULL, showProgress=getOption("datatable.showProgress", interactive()), data.table=getOption("datatable.fread.datatable", TRUE), @@ -54,6 +54,7 @@ yaml=FALSE, autostart=NA, tmpdir=tempdir(), tz="UTC" \item{quote}{ By default (\code{"\""}), if a field starts with a double quote, \code{fread} handles embedded quotes robustly as explained under \code{Details}. If it fails, then another attempt is made to read the field \emph{as is}, i.e., as if quotes are disabled. By setting \code{quote=""}, the field is always read as if quotes are disabled. It is not expected to ever need to pass anything other than \"\" to quote; i.e., to turn it off. } \item{strip.white}{ default is \code{TRUE}. Strips leading and trailing whitespaces of unquoted fields. If \code{FALSE}, only header trailing spaces are removed. } \item{fill}{logical (default is \code{FALSE}). If \code{TRUE} then in case the rows have unequal length, blank fields are implicitly filled.} + \item{sample.fill}{logical (default is \code{TRUE}). Only applicable if fill=TRUE. If \code{FALSE} then all rows are used for detecting number of columns. } \item{blank.lines.skip}{\code{logical}, default is \code{FALSE}. If \code{TRUE} blank lines in the input are ignored.} \item{key}{Character vector of one or more column names which is passed to \code{\link{setkey}}. It may be a single comma separated string such as \code{key="x,y,z"}, or a vector of names such as \code{key=c("x","y","z")}. Only valid when argument \code{data.table=TRUE}. Where applicable, this should refer to column names given in \code{col.names}. } \item{index}{ Character vector or list of character vectors of one or more column names which is passed to \code{\link{setindexv}}. As with \code{key}, comma-separated notation like \code{index="x,y,z"} is accepted for convenience. Only valid when argument \code{data.table=TRUE}. Where applicable, this should refer to column names given in \code{col.names}. } diff --git a/src/fread.c b/src/fread.c index e0a32d3e14..86f2a8a75e 100644 --- a/src/fread.c +++ b/src/fread.c @@ -55,6 +55,7 @@ static bool any_number_like_NAstrings=false; static bool blank_is_a_NAstring=false; static bool stripWhite=true; // only applies to character columns; numeric fields always stripped static bool skipEmptyLines=false, fill=false; +static bool sampleFill=true; // turn off sampling for determining number of columns static double NA_FLOAT64; // takes fread.h:NA_FLOAT64_VALUE @@ -162,6 +163,7 @@ bool freadCleanup(void) skipEmptyLines = false; eol_one_r = false; fill = false; + sampleFill = true; // following are borrowed references: do not free sof = eof = NULL; NAstrings = NULL; @@ -1328,6 +1330,7 @@ int freadMain(freadMainArgs _args) { if (quote == dec) STOP(_("quote == dec ('%c') is not allowed"), dec); // since quote=='\0' when user passed quote="", the logic in this file uses '*ch==quote && quote' otherwise // the ending \0 at eof could be treated as a quote (test xxx) + sampleFill = args.sampleFill; // File parsing context: pointer to the start of file, and to the end of // the file. The `sof` pointer may be shifted in order to skip over @@ -1578,7 +1581,8 @@ int freadMain(freadMainArgs _args) { int ncol; // Detected number of columns in the file const char *firstJumpEnd=NULL; // remember where the winning jumpline from jump 0 ends, to know its size excluding header const char *prevStart = NULL; // the start of the non-empty line before the first not-ignored row (for warning message later, or taking as column names) - int jumpLines = (int)umin(100,nrowLimit); // how many lines from each jump point to use. If nrowLimit is supplied, nJumps is later set to 1 as well. + int jumpLines = sampleFill ? (int)umin(100,nrowLimit) : INT32_MAX; // how many lines from each jump point to use and whether sampling should be used or not. + // If nrowLimit is supplied, nJumps is later set to 1 as well. { if (verbose) DTPRINT(_("[06] Detect separator, quoting rule, and ncolumns\n")); @@ -2590,8 +2594,13 @@ int freadMain(freadMainArgs _args) { else { ch = headPos; int tt = countfields(&ch); - DTWARN(_("Stopped early on line %"PRIu64". Expected %d fields but found %d. Consider fill=TRUE and comment.char=. First discarded non-empty line: <<%s>>"), + if (fill) { + DTWARN(_("Stopped early on line %"PRIu64". Expected %d fields but found %d. Consider additional sample.fill=FALSE. First discarded non-empty line: <<%s>>"), + (uint64_t)DTi+row1line, ncol, tt, strlim(skippedFooter,500)); + } else { + DTWARN(_("Stopped early on line %"PRIu64". Expected %d fields but found %d. Consider fill=TRUE and comment.char=. First discarded non-empty line: <<%s>>"), (uint64_t)DTi+row1line, ncol, tt, strlim(skippedFooter,500)); + } } } } diff --git a/src/fread.h b/src/fread.h index 446da18e4b..321cb3af8e 100644 --- a/src/fread.h +++ b/src/fread.h @@ -126,6 +126,9 @@ typedef struct freadMainArgs // all ragged rows will be filled with NAs on the right. bool fill; + // If True, then a sample will be used at fill for detecting ncol. Otherwise all rows will be used for the detection. + bool sampleFill; + // If True, then emit progress messages during the parsing. bool showProgress; diff --git a/src/freadR.c b/src/freadR.c index 97fe691aa1..2dc8b8e0af 100644 --- a/src/freadR.c +++ b/src/freadR.c @@ -61,6 +61,7 @@ SEXP freadR( SEXP stripWhiteArg, SEXP skipEmptyLinesArg, SEXP fillArg, + SEXP sampleFillArg, SEXP showProgressArg, SEXP nThreadArg, SEXP verboseArg, @@ -82,7 +83,7 @@ SEXP freadR( freadMainArgs args; ncol = 0; dtnrows = 0; - + if (!isString(inputArg) || LENGTH(inputArg)!=1) error(_("Internal error: freadR input not a single character string: a filename or the data itself. Should have been caught at R level.")); // # nocov const char *ch = (const char *)CHAR(STRING_ELT(inputArg,0)); @@ -153,6 +154,7 @@ SEXP freadR( args.stripWhite = LOGICAL(stripWhiteArg)[0]; args.skipEmptyLines = LOGICAL(skipEmptyLinesArg)[0]; args.fill = LOGICAL(fillArg)[0]; + args.sampleFill = LOGICAL(sampleFillArg)[0]; args.showProgress = LOGICAL(showProgressArg)[0]; if (INTEGER(nThreadArg)[0]<1) error(_("nThread(%d)<1"), INTEGER(nThreadArg)[0]); args.nth = (uint32_t)INTEGER(nThreadArg)[0]; From 23ce31e4bbb9af9b8968b84df20fe6df0f87ce75 Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger Date: Sat, 28 Aug 2021 17:05:53 +0200 Subject: [PATCH 02/30] fixed stop --- R/fread.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/fread.R b/R/fread.R index 2e25243223..1ca314cbe7 100644 --- a/R/fread.R +++ b/R/fread.R @@ -145,7 +145,7 @@ yaml=FALSE, autostart=NA, tmpdir=tempdir(), tz="UTC") } # whitespace at the beginning or end of na.strings is checked at C level and is an error there; test 1804 } - if (sample.fill & !fill) ("sample.fill=TRUE cannot be used without fill=TRUE.") + if (!sample.fill & !fill) ("sample.fill=FALSE cannot be used without fill=TRUE.") if (yaml) { if (!requireNamespace('yaml', quietly = TRUE)) stopf("'data.table' relies on the package 'yaml' to parse the file header; please add this to your library with install.packages('yaml') and try again.") # nocov From 117bc4bab367b98e0b77d2ca38cc700a5c5987ee Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger Date: Sat, 28 Aug 2021 17:07:20 +0200 Subject: [PATCH 03/30] add stopf --- R/fread.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/fread.R b/R/fread.R index 1ca314cbe7..c56ece3a66 100644 --- a/R/fread.R +++ b/R/fread.R @@ -145,7 +145,7 @@ yaml=FALSE, autostart=NA, tmpdir=tempdir(), tz="UTC") } # whitespace at the beginning or end of na.strings is checked at C level and is an error there; test 1804 } - if (!sample.fill & !fill) ("sample.fill=FALSE cannot be used without fill=TRUE.") + if (!sample.fill & !fill) stopf("sample.fill=FALSE cannot be used without fill=TRUE.") if (yaml) { if (!requireNamespace('yaml', quietly = TRUE)) stopf("'data.table' relies on the package 'yaml' to parse the file header; please add this to your library with install.packages('yaml') and try again.") # nocov From cb3d03bd6cbcff9c6be1532b976adefab7494672 Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger Date: Sat, 28 Aug 2021 17:23:02 +0200 Subject: [PATCH 04/30] fread: turn off sampling for fill --- R/fread.R | 10 +++++----- inst/tests/tests.Rraw | 3 +++ man/fread.Rd | 3 ++- src/fread.c | 13 +++++++++++-- src/fread.h | 3 +++ src/freadR.c | 4 +++- 6 files changed, 27 insertions(+), 9 deletions(-) diff --git a/R/fread.R b/R/fread.R index 12f46b57ea..c56ece3a66 100644 --- a/R/fread.R +++ b/R/fread.R @@ -2,7 +2,7 @@ fread = function( input="", file=NULL, text=NULL, cmd=NULL, sep="auto", sep2="auto", dec=".", quote="\"", nrows=Inf, header="auto", na.strings=getOption("datatable.na.strings","NA"), stringsAsFactors=FALSE, verbose=getOption("datatable.verbose",FALSE), skip="__auto__", select=NULL, drop=NULL, colClasses=NULL, integer64=getOption("datatable.integer64","integer64"), -col.names, check.names=FALSE, encoding="unknown", strip.white=TRUE, fill=FALSE, blank.lines.skip=FALSE, key=NULL, index=NULL, +col.names, check.names=FALSE, encoding="unknown", strip.white=TRUE, fill=FALSE, sample.fill=TRUE, blank.lines.skip=FALSE, key=NULL, index=NULL, showProgress=getOption("datatable.showProgress",interactive()), data.table=getOption("datatable.fread.datatable",TRUE), nThread=getDTthreads(verbose), logical01=getOption("datatable.logical01",FALSE), keepLeadingZeros=getOption("datatable.keepLeadingZeros",FALSE), yaml=FALSE, autostart=NA, tmpdir=tempdir(), tz="UTC") @@ -22,7 +22,7 @@ yaml=FALSE, autostart=NA, tmpdir=tempdir(), tz="UTC") stopf("Argument 'encoding' must be 'unknown', 'UTF-8' or 'Latin-1'.") } stopifnot( - isTRUEorFALSE(strip.white), isTRUEorFALSE(blank.lines.skip), isTRUEorFALSE(fill), isTRUEorFALSE(showProgress), + isTRUEorFALSE(strip.white), isTRUEorFALSE(blank.lines.skip), isTRUEorFALSE(fill), isTRUEorFALSE(sample.fill), isTRUEorFALSE(showProgress), isTRUEorFALSE(verbose), isTRUEorFALSE(check.names), isTRUEorFALSE(logical01), isTRUEorFALSE(keepLeadingZeros), isTRUEorFALSE(yaml), isTRUEorFALSE(stringsAsFactors) || (is.double(stringsAsFactors) && length(stringsAsFactors)==1L && 0.0<=stringsAsFactors && stringsAsFactors<=1.0), is.numeric(nrows), length(nrows)==1L @@ -79,7 +79,6 @@ yaml=FALSE, autostart=NA, tmpdir=tempdir(), tz="UTC") if (w<=2L) { # https: or ftps: if (!requireNamespace("curl", quietly = TRUE)) stopf("URL requires https:// connection for which fread() requires 'curl' package which cannot be found. Please install 'curl' using 'install.packages('curl')'.") # nocov - curl::curl_download(file, tmpFile, mode="wb", quiet = !showProgress) } else { method = if (w==5L) "internal" # force 'auto' when file: to ensure we don't use an invalid option (e.g. wget), #1668 @@ -146,6 +145,7 @@ yaml=FALSE, autostart=NA, tmpdir=tempdir(), tz="UTC") } # whitespace at the beginning or end of na.strings is checked at C level and is an error there; test 1804 } + if (!sample.fill & !fill) stopf("sample.fill=FALSE cannot be used without fill=TRUE.") if (yaml) { if (!requireNamespace('yaml', quietly = TRUE)) stopf("'data.table' relies on the package 'yaml' to parse the file header; please add this to your library with install.packages('yaml') and try again.") # nocov @@ -262,8 +262,8 @@ yaml=FALSE, autostart=NA, tmpdir=tempdir(), tz="UTC") if (identical(tt,"") || is_utc(tt)) # empty TZ env variable ("") means UTC in C library, unlike R; _unset_ TZ means local tz="UTC" } - ans = .Call(CfreadR,input,identical(input,file),sep,dec,quote,header,nrows,skip,na.strings,strip.white,blank.lines.skip, - fill,showProgress,nThread,verbose,warnings2errors,logical01,select,drop,colClasses,integer64,encoding,keepLeadingZeros,tz=="UTC") + ans = .Call(CfreadR,input,identical(input,file),sep,dec,quote,header,nrows,skip,na.strings,strip.white,blank.lines.skip,fill, + sample.fill,showProgress,nThread,verbose,warnings2errors,logical01,select,drop,colClasses,integer64,encoding,keepLeadingZeros,tz=="UTC") if (!length(ans)) return(null.data.table()) # test 1743.308 drops all columns nr = length(ans[[1L]]) require_bit64_if_needed(ans) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 769703c7e5..23ccbd4976 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -18121,3 +18121,6 @@ if (base::getRversion() >= "4.1.0") { # precision powers of 10^(-n), #4461 test(2213, identical(fread(text="A\n0.8060667366\n")$A, 0.8060667366)) +# turning off sampling for detecting the number of columns #2691 #1812 #4130 #3436 #2727 +test(2214.1, fread(text = paste(rep(c("1,2\n", "1,2,3\n"), each=100), collapse=""), fill=TRUE, sample.fill=FALSE), data.table(1L, 2L, rep(c(NA,3L), each=100))) +test(2214.2, fread(text = paste(rep(c("1,2\n", "1,2,3\n"), each=100), collapse=""), fill=TRUE, sample.fill=FALSE, select = 1:2), data.table(1L, rep(2L, 200))) diff --git a/man/fread.Rd b/man/fread.Rd index c7b7da8566..871d3540bb 100644 --- a/man/fread.Rd +++ b/man/fread.Rd @@ -17,7 +17,7 @@ skip="__auto__", select=NULL, drop=NULL, colClasses=NULL, integer64=getOption("datatable.integer64", "integer64"), col.names, check.names=FALSE, encoding="unknown", -strip.white=TRUE, fill=FALSE, blank.lines.skip=FALSE, +strip.white=TRUE, fill=FALSE, sample.fill=TRUE, blank.lines.skip=FALSE, key=NULL, index=NULL, showProgress=getOption("datatable.showProgress", interactive()), data.table=getOption("datatable.fread.datatable", TRUE), @@ -54,6 +54,7 @@ yaml=FALSE, autostart=NA, tmpdir=tempdir(), tz="UTC" \item{quote}{ By default (\code{"\""}), if a field starts with a double quote, \code{fread} handles embedded quotes robustly as explained under \code{Details}. If it fails, then another attempt is made to read the field \emph{as is}, i.e., as if quotes are disabled. By setting \code{quote=""}, the field is always read as if quotes are disabled. It is not expected to ever need to pass anything other than \"\" to quote; i.e., to turn it off. } \item{strip.white}{ default is \code{TRUE}. Strips leading and trailing whitespaces of unquoted fields. If \code{FALSE}, only header trailing spaces are removed. } \item{fill}{logical (default is \code{FALSE}). If \code{TRUE} then in case the rows have unequal length, blank fields are implicitly filled.} + \item{sample.fill}{logical (default is \code{TRUE}). Only applicable if fill=TRUE. If \code{FALSE} then all rows are used for detecting number of columns. } \item{blank.lines.skip}{\code{logical}, default is \code{FALSE}. If \code{TRUE} blank lines in the input are ignored.} \item{key}{Character vector of one or more column names which is passed to \code{\link{setkey}}. It may be a single comma separated string such as \code{key="x,y,z"}, or a vector of names such as \code{key=c("x","y","z")}. Only valid when argument \code{data.table=TRUE}. Where applicable, this should refer to column names given in \code{col.names}. } \item{index}{ Character vector or list of character vectors of one or more column names which is passed to \code{\link{setindexv}}. As with \code{key}, comma-separated notation like \code{index="x,y,z"} is accepted for convenience. Only valid when argument \code{data.table=TRUE}. Where applicable, this should refer to column names given in \code{col.names}. } diff --git a/src/fread.c b/src/fread.c index e0a32d3e14..86f2a8a75e 100644 --- a/src/fread.c +++ b/src/fread.c @@ -55,6 +55,7 @@ static bool any_number_like_NAstrings=false; static bool blank_is_a_NAstring=false; static bool stripWhite=true; // only applies to character columns; numeric fields always stripped static bool skipEmptyLines=false, fill=false; +static bool sampleFill=true; // turn off sampling for determining number of columns static double NA_FLOAT64; // takes fread.h:NA_FLOAT64_VALUE @@ -162,6 +163,7 @@ bool freadCleanup(void) skipEmptyLines = false; eol_one_r = false; fill = false; + sampleFill = true; // following are borrowed references: do not free sof = eof = NULL; NAstrings = NULL; @@ -1328,6 +1330,7 @@ int freadMain(freadMainArgs _args) { if (quote == dec) STOP(_("quote == dec ('%c') is not allowed"), dec); // since quote=='\0' when user passed quote="", the logic in this file uses '*ch==quote && quote' otherwise // the ending \0 at eof could be treated as a quote (test xxx) + sampleFill = args.sampleFill; // File parsing context: pointer to the start of file, and to the end of // the file. The `sof` pointer may be shifted in order to skip over @@ -1578,7 +1581,8 @@ int freadMain(freadMainArgs _args) { int ncol; // Detected number of columns in the file const char *firstJumpEnd=NULL; // remember where the winning jumpline from jump 0 ends, to know its size excluding header const char *prevStart = NULL; // the start of the non-empty line before the first not-ignored row (for warning message later, or taking as column names) - int jumpLines = (int)umin(100,nrowLimit); // how many lines from each jump point to use. If nrowLimit is supplied, nJumps is later set to 1 as well. + int jumpLines = sampleFill ? (int)umin(100,nrowLimit) : INT32_MAX; // how many lines from each jump point to use and whether sampling should be used or not. + // If nrowLimit is supplied, nJumps is later set to 1 as well. { if (verbose) DTPRINT(_("[06] Detect separator, quoting rule, and ncolumns\n")); @@ -2590,8 +2594,13 @@ int freadMain(freadMainArgs _args) { else { ch = headPos; int tt = countfields(&ch); - DTWARN(_("Stopped early on line %"PRIu64". Expected %d fields but found %d. Consider fill=TRUE and comment.char=. First discarded non-empty line: <<%s>>"), + if (fill) { + DTWARN(_("Stopped early on line %"PRIu64". Expected %d fields but found %d. Consider additional sample.fill=FALSE. First discarded non-empty line: <<%s>>"), + (uint64_t)DTi+row1line, ncol, tt, strlim(skippedFooter,500)); + } else { + DTWARN(_("Stopped early on line %"PRIu64". Expected %d fields but found %d. Consider fill=TRUE and comment.char=. First discarded non-empty line: <<%s>>"), (uint64_t)DTi+row1line, ncol, tt, strlim(skippedFooter,500)); + } } } } diff --git a/src/fread.h b/src/fread.h index 446da18e4b..321cb3af8e 100644 --- a/src/fread.h +++ b/src/fread.h @@ -126,6 +126,9 @@ typedef struct freadMainArgs // all ragged rows will be filled with NAs on the right. bool fill; + // If True, then a sample will be used at fill for detecting ncol. Otherwise all rows will be used for the detection. + bool sampleFill; + // If True, then emit progress messages during the parsing. bool showProgress; diff --git a/src/freadR.c b/src/freadR.c index 97fe691aa1..2dc8b8e0af 100644 --- a/src/freadR.c +++ b/src/freadR.c @@ -61,6 +61,7 @@ SEXP freadR( SEXP stripWhiteArg, SEXP skipEmptyLinesArg, SEXP fillArg, + SEXP sampleFillArg, SEXP showProgressArg, SEXP nThreadArg, SEXP verboseArg, @@ -82,7 +83,7 @@ SEXP freadR( freadMainArgs args; ncol = 0; dtnrows = 0; - + if (!isString(inputArg) || LENGTH(inputArg)!=1) error(_("Internal error: freadR input not a single character string: a filename or the data itself. Should have been caught at R level.")); // # nocov const char *ch = (const char *)CHAR(STRING_ELT(inputArg,0)); @@ -153,6 +154,7 @@ SEXP freadR( args.stripWhite = LOGICAL(stripWhiteArg)[0]; args.skipEmptyLines = LOGICAL(skipEmptyLinesArg)[0]; args.fill = LOGICAL(fillArg)[0]; + args.sampleFill = LOGICAL(sampleFillArg)[0]; args.showProgress = LOGICAL(showProgressArg)[0]; if (INTEGER(nThreadArg)[0]<1) error(_("nThread(%d)<1"), INTEGER(nThreadArg)[0]); args.nth = (uint32_t)INTEGER(nThreadArg)[0]; From 6dc2c9db43fe981038ee7a392b01fb18956c4671 Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger Date: Sat, 28 Aug 2021 21:34:40 +0200 Subject: [PATCH 05/30] added coverage --- inst/tests/tests.Rraw | 1 + 1 file changed, 1 insertion(+) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 23ccbd4976..4bf152d0b1 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -18124,3 +18124,4 @@ test(2213, identical(fread(text="A\n0.8060667366\n")$A, 0.8060667366)) # turning off sampling for detecting the number of columns #2691 #1812 #4130 #3436 #2727 test(2214.1, fread(text = paste(rep(c("1,2\n", "1,2,3\n"), each=100), collapse=""), fill=TRUE, sample.fill=FALSE), data.table(1L, 2L, rep(c(NA,3L), each=100))) test(2214.2, fread(text = paste(rep(c("1,2\n", "1,2,3\n"), each=100), collapse=""), fill=TRUE, sample.fill=FALSE, select = 1:2), data.table(1L, rep(2L, 200))) +test(2214.3, fread(text = paste(rep(c("1,2\n", "1,2,3\n"), each=100), collapse=""), fill=FALSE, sample.fill=FALSE), error="sample.fill=FALSE cannot be used without fill=TRUE.") From a3e5864dcf274e8444352779ca1f367e314b96a3 Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger Date: Sat, 28 Aug 2021 22:11:13 +0200 Subject: [PATCH 06/30] coverage --- inst/tests/tests.Rraw | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 4bf152d0b1..77c73d2738 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -18122,6 +18122,7 @@ if (base::getRversion() >= "4.1.0") { test(2213, identical(fread(text="A\n0.8060667366\n")$A, 0.8060667366)) # turning off sampling for detecting the number of columns #2691 #1812 #4130 #3436 #2727 -test(2214.1, fread(text = paste(rep(c("1,2\n", "1,2,3\n"), each=100), collapse=""), fill=TRUE, sample.fill=FALSE), data.table(1L, 2L, rep(c(NA,3L), each=100))) -test(2214.2, fread(text = paste(rep(c("1,2\n", "1,2,3\n"), each=100), collapse=""), fill=TRUE, sample.fill=FALSE, select = 1:2), data.table(1L, rep(2L, 200))) -test(2214.3, fread(text = paste(rep(c("1,2\n", "1,2,3\n"), each=100), collapse=""), fill=FALSE, sample.fill=FALSE), error="sample.fill=FALSE cannot be used without fill=TRUE.") +test(2214.1, fread(text = paste(rep(c("1,2\n", "1,2,3\n"), each=100), collapse=""), fill=TRUE, sample.fill=TRUE), data.table(1L, rep(2L, 100)), warning="Stopped early on line 101.*First discarded non-empty line: <<1,2,3>>") +test(2214.2, fread(text = paste(rep(c("1,2\n", "1,2,3\n"), each=100), collapse=""), fill=TRUE, sample.fill=FALSE), data.table(1L, 2L, rep(c(NA,3L), each=100))) +test(2214.3, fread(text = paste(rep(c("1,2\n", "1,2,3\n"), each=100), collapse=""), fill=TRUE, sample.fill=FALSE, select = 1:2), data.table(1L, rep(2L, 200))) +test(2214.4, fread(text = paste(rep(c("1,2\n", "1,2,3\n"), each=100), collapse=""), fill=FALSE, sample.fill=FALSE), error="sample.fill=FALSE cannot be used without fill=TRUE.") From 9b6bdb3f3d30b975e0ed34f1daa41157bea89117 Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger <52290390+ben-schwen@users.noreply.github.com> Date: Tue, 31 Aug 2021 18:33:09 +0200 Subject: [PATCH 07/30] revert additional argument --- R/fread.R | 9 ++++----- inst/tests/tests.Rraw | 8 ++++---- man/fread.Rd | 3 +-- src/fread.c | 15 ++++++--------- src/fread.h | 9 ++++----- src/freadR.c | 4 +--- 6 files changed, 20 insertions(+), 28 deletions(-) diff --git a/R/fread.R b/R/fread.R index c56ece3a66..6f9f1f30de 100644 --- a/R/fread.R +++ b/R/fread.R @@ -2,7 +2,7 @@ fread = function( input="", file=NULL, text=NULL, cmd=NULL, sep="auto", sep2="auto", dec=".", quote="\"", nrows=Inf, header="auto", na.strings=getOption("datatable.na.strings","NA"), stringsAsFactors=FALSE, verbose=getOption("datatable.verbose",FALSE), skip="__auto__", select=NULL, drop=NULL, colClasses=NULL, integer64=getOption("datatable.integer64","integer64"), -col.names, check.names=FALSE, encoding="unknown", strip.white=TRUE, fill=FALSE, sample.fill=TRUE, blank.lines.skip=FALSE, key=NULL, index=NULL, +col.names, check.names=FALSE, encoding="unknown", strip.white=TRUE, fill=FALSE, blank.lines.skip=FALSE, key=NULL, index=NULL, showProgress=getOption("datatable.showProgress",interactive()), data.table=getOption("datatable.fread.datatable",TRUE), nThread=getDTthreads(verbose), logical01=getOption("datatable.logical01",FALSE), keepLeadingZeros=getOption("datatable.keepLeadingZeros",FALSE), yaml=FALSE, autostart=NA, tmpdir=tempdir(), tz="UTC") @@ -22,7 +22,7 @@ yaml=FALSE, autostart=NA, tmpdir=tempdir(), tz="UTC") stopf("Argument 'encoding' must be 'unknown', 'UTF-8' or 'Latin-1'.") } stopifnot( - isTRUEorFALSE(strip.white), isTRUEorFALSE(blank.lines.skip), isTRUEorFALSE(fill), isTRUEorFALSE(sample.fill), isTRUEorFALSE(showProgress), + isTRUEorFALSE(strip.white), isTRUEorFALSE(blank.lines.skip), (isTRUEorFALSE(fill)||is.numeric(fill)), isTRUEorFALSE(showProgress), isTRUEorFALSE(verbose), isTRUEorFALSE(check.names), isTRUEorFALSE(logical01), isTRUEorFALSE(keepLeadingZeros), isTRUEorFALSE(yaml), isTRUEorFALSE(stringsAsFactors) || (is.double(stringsAsFactors) && length(stringsAsFactors)==1L && 0.0<=stringsAsFactors && stringsAsFactors<=1.0), is.numeric(nrows), length(nrows)==1L @@ -145,7 +145,6 @@ yaml=FALSE, autostart=NA, tmpdir=tempdir(), tz="UTC") } # whitespace at the beginning or end of na.strings is checked at C level and is an error there; test 1804 } - if (!sample.fill & !fill) stopf("sample.fill=FALSE cannot be used without fill=TRUE.") if (yaml) { if (!requireNamespace('yaml', quietly = TRUE)) stopf("'data.table' relies on the package 'yaml' to parse the file header; please add this to your library with install.packages('yaml') and try again.") # nocov @@ -262,8 +261,8 @@ yaml=FALSE, autostart=NA, tmpdir=tempdir(), tz="UTC") if (identical(tt,"") || is_utc(tt)) # empty TZ env variable ("") means UTC in C library, unlike R; _unset_ TZ means local tz="UTC" } - ans = .Call(CfreadR,input,identical(input,file),sep,dec,quote,header,nrows,skip,na.strings,strip.white,blank.lines.skip,fill, - sample.fill,showProgress,nThread,verbose,warnings2errors,logical01,select,drop,colClasses,integer64,encoding,keepLeadingZeros,tz=="UTC") + ans = .Call(CfreadR,input,identical(input,file),sep,dec,quote,header,nrows,skip,na.strings,strip.white,blank.lines.skip, + fill,showProgress,nThread,verbose,warnings2errors,logical01,select,drop,colClasses,integer64,encoding,keepLeadingZeros,tz=="UTC") if (!length(ans)) return(null.data.table()) # test 1743.308 drops all columns nr = length(ans[[1L]]) require_bit64_if_needed(ans) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 77c73d2738..5f3ca8275b 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -18122,7 +18122,7 @@ if (base::getRversion() >= "4.1.0") { test(2213, identical(fread(text="A\n0.8060667366\n")$A, 0.8060667366)) # turning off sampling for detecting the number of columns #2691 #1812 #4130 #3436 #2727 -test(2214.1, fread(text = paste(rep(c("1,2\n", "1,2,3\n"), each=100), collapse=""), fill=TRUE, sample.fill=TRUE), data.table(1L, rep(2L, 100)), warning="Stopped early on line 101.*First discarded non-empty line: <<1,2,3>>") -test(2214.2, fread(text = paste(rep(c("1,2\n", "1,2,3\n"), each=100), collapse=""), fill=TRUE, sample.fill=FALSE), data.table(1L, 2L, rep(c(NA,3L), each=100))) -test(2214.3, fread(text = paste(rep(c("1,2\n", "1,2,3\n"), each=100), collapse=""), fill=TRUE, sample.fill=FALSE, select = 1:2), data.table(1L, rep(2L, 200))) -test(2214.4, fread(text = paste(rep(c("1,2\n", "1,2,3\n"), each=100), collapse=""), fill=FALSE, sample.fill=FALSE), error="sample.fill=FALSE cannot be used without fill=TRUE.") +test(2214.1, fread(text = paste(rep(c("1,2\n", "1,2,3\n"), each=100), collapse=""), fill=TRUE), data.table(1L, rep(2L, 100)), warning="Stopped early on line 101.*First discarded non-empty line: <<1,2,3>>") +test(2214.2, fread(text = paste(rep(c("1,2\n", "1,2,3\n"), each=100), collapse=""), fill=TRUE), data.table(1L, 2L, rep(c(NA,3L), each=100))) +test(2214.3, fread(text = paste(rep(c("1,2\n", "1,2,3\n"), each=100), collapse=""), fill=TRUE, select = 1:2), data.table(1L, rep(2L, 200))) +test(2214.4, fread(text = paste(rep(c("1,2\n", "1,2,3\n"), each=100), collapse=""), fill=FALSE), error="sample.fill=FALSE cannot be used without fill=TRUE.") diff --git a/man/fread.Rd b/man/fread.Rd index 871d3540bb..c7b7da8566 100644 --- a/man/fread.Rd +++ b/man/fread.Rd @@ -17,7 +17,7 @@ skip="__auto__", select=NULL, drop=NULL, colClasses=NULL, integer64=getOption("datatable.integer64", "integer64"), col.names, check.names=FALSE, encoding="unknown", -strip.white=TRUE, fill=FALSE, sample.fill=TRUE, blank.lines.skip=FALSE, +strip.white=TRUE, fill=FALSE, blank.lines.skip=FALSE, key=NULL, index=NULL, showProgress=getOption("datatable.showProgress", interactive()), data.table=getOption("datatable.fread.datatable", TRUE), @@ -54,7 +54,6 @@ yaml=FALSE, autostart=NA, tmpdir=tempdir(), tz="UTC" \item{quote}{ By default (\code{"\""}), if a field starts with a double quote, \code{fread} handles embedded quotes robustly as explained under \code{Details}. If it fails, then another attempt is made to read the field \emph{as is}, i.e., as if quotes are disabled. By setting \code{quote=""}, the field is always read as if quotes are disabled. It is not expected to ever need to pass anything other than \"\" to quote; i.e., to turn it off. } \item{strip.white}{ default is \code{TRUE}. Strips leading and trailing whitespaces of unquoted fields. If \code{FALSE}, only header trailing spaces are removed. } \item{fill}{logical (default is \code{FALSE}). If \code{TRUE} then in case the rows have unequal length, blank fields are implicitly filled.} - \item{sample.fill}{logical (default is \code{TRUE}). Only applicable if fill=TRUE. If \code{FALSE} then all rows are used for detecting number of columns. } \item{blank.lines.skip}{\code{logical}, default is \code{FALSE}. If \code{TRUE} blank lines in the input are ignored.} \item{key}{Character vector of one or more column names which is passed to \code{\link{setkey}}. It may be a single comma separated string such as \code{key="x,y,z"}, or a vector of names such as \code{key=c("x","y","z")}. Only valid when argument \code{data.table=TRUE}. Where applicable, this should refer to column names given in \code{col.names}. } \item{index}{ Character vector or list of character vectors of one or more column names which is passed to \code{\link{setindexv}}. As with \code{key}, comma-separated notation like \code{index="x,y,z"} is accepted for convenience. Only valid when argument \code{data.table=TRUE}. Where applicable, this should refer to column names given in \code{col.names}. } diff --git a/src/fread.c b/src/fread.c index 86f2a8a75e..23eea87fed 100644 --- a/src/fread.c +++ b/src/fread.c @@ -54,8 +54,8 @@ static const char* const* NAstrings; static bool any_number_like_NAstrings=false; static bool blank_is_a_NAstring=false; static bool stripWhite=true; // only applies to character columns; numeric fields always stripped -static bool skipEmptyLines=false, fill=false; -static bool sampleFill=true; // turn off sampling for determining number of columns +static bool skipEmptyLines=false; +static int fill=0L; static double NA_FLOAT64; // takes fread.h:NA_FLOAT64_VALUE @@ -162,8 +162,7 @@ bool freadCleanup(void) stripWhite = true; skipEmptyLines = false; eol_one_r = false; - fill = false; - sampleFill = true; + fill = 0L; // following are borrowed references: do not free sof = eof = NULL; NAstrings = NULL; @@ -1330,7 +1329,6 @@ int freadMain(freadMainArgs _args) { if (quote == dec) STOP(_("quote == dec ('%c') is not allowed"), dec); // since quote=='\0' when user passed quote="", the logic in this file uses '*ch==quote && quote' otherwise // the ending \0 at eof could be treated as a quote (test xxx) - sampleFill = args.sampleFill; // File parsing context: pointer to the start of file, and to the end of // the file. The `sof` pointer may be shifted in order to skip over @@ -1581,8 +1579,7 @@ int freadMain(freadMainArgs _args) { int ncol; // Detected number of columns in the file const char *firstJumpEnd=NULL; // remember where the winning jumpline from jump 0 ends, to know its size excluding header const char *prevStart = NULL; // the start of the non-empty line before the first not-ignored row (for warning message later, or taking as column names) - int jumpLines = sampleFill ? (int)umin(100,nrowLimit) : INT32_MAX; // how many lines from each jump point to use and whether sampling should be used or not. - // If nrowLimit is supplied, nJumps is later set to 1 as well. + int jumpLines = (int)umin(100,nrowLimit); // how many lines from each jump point to use. If nrowLimit is supplied, nJumps is later set to 1 as well. { if (verbose) DTPRINT(_("[06] Detect separator, quoting rule, and ncolumns\n")); @@ -1599,7 +1596,7 @@ int freadMain(freadMainArgs _args) { if (eol(&ch)) ch++; } firstJumpEnd = ch; // size of first 100 lines in bytes is used later for nrow estimate - fill = true; // so that blank lines are read as empty + fill = 1L; // so that blank lines are read as empty ch = pos; } else { int nseps; @@ -1731,7 +1728,7 @@ int freadMain(freadMainArgs _args) { } sep = topSep; whiteChar = (sep==' ' ? '\t' : (sep=='\t' ? ' ' : 0)); - ncol = topNumFields; + ncol = fill > 1L ? fill : topNumFields; if (fill || sep==127) { // leave pos on the first populated line; that is start of data ch = pos; diff --git a/src/fread.h b/src/fread.h index 321cb3af8e..002c3d5b46 100644 --- a/src/fread.h +++ b/src/fread.h @@ -123,11 +123,10 @@ typedef struct freadMainArgs bool skipEmptyLines; // If True, then rows are allowed to have variable number of columns, and - // all ragged rows will be filled with NAs on the right. - bool fill; - - // If True, then a sample will be used at fill for detecting ncol. Otherwise all rows will be used for the detection. - bool sampleFill; + // all ragged rows will be filled with NAs on the right. Supplying integer + // argument > 1 results in setting an upper bound estimate for the number + // of columns. + int fill; // If True, then emit progress messages during the parsing. bool showProgress; diff --git a/src/freadR.c b/src/freadR.c index 2dc8b8e0af..efc1b0eaa9 100644 --- a/src/freadR.c +++ b/src/freadR.c @@ -61,7 +61,6 @@ SEXP freadR( SEXP stripWhiteArg, SEXP skipEmptyLinesArg, SEXP fillArg, - SEXP sampleFillArg, SEXP showProgressArg, SEXP nThreadArg, SEXP verboseArg, @@ -153,8 +152,7 @@ SEXP freadR( // here we use bool and rely on fread at R level to check these do not contain NA_LOGICAL args.stripWhite = LOGICAL(stripWhiteArg)[0]; args.skipEmptyLines = LOGICAL(skipEmptyLinesArg)[0]; - args.fill = LOGICAL(fillArg)[0]; - args.sampleFill = LOGICAL(sampleFillArg)[0]; + args.fill = INTEGER(fillArg)[0]; args.showProgress = LOGICAL(showProgressArg)[0]; if (INTEGER(nThreadArg)[0]<1) error(_("nThread(%d)<1"), INTEGER(nThreadArg)[0]); args.nth = (uint32_t)INTEGER(nThreadArg)[0]; From 96f6a8dd7b5ac3636da41a87c8c50610eb92e4d3 Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger <52290390+ben-schwen@users.noreply.github.com> Date: Tue, 31 Aug 2021 18:52:24 +0200 Subject: [PATCH 08/30] fill upperbound --- R/fread.R | 1 + inst/tests/tests.Rraw | 10 ++++++---- man/fread.Rd | 2 +- src/fread.c | 2 +- 4 files changed, 9 insertions(+), 6 deletions(-) diff --git a/R/fread.R b/R/fread.R index 6f9f1f30de..1bbba9bf4b 100644 --- a/R/fread.R +++ b/R/fread.R @@ -79,6 +79,7 @@ yaml=FALSE, autostart=NA, tmpdir=tempdir(), tz="UTC") if (w<=2L) { # https: or ftps: if (!requireNamespace("curl", quietly = TRUE)) stopf("URL requires https:// connection for which fread() requires 'curl' package which cannot be found. Please install 'curl' using 'install.packages('curl')'.") # nocov + curl::curl_download(file, tmpFile, mode="wb", quiet = !showProgress) } else { method = if (w==5L) "internal" # force 'auto' when file: to ensure we don't use an invalid option (e.g. wget), #1668 diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 5f3ca8275b..c2fedb3450 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -18122,7 +18122,9 @@ if (base::getRversion() >= "4.1.0") { test(2213, identical(fread(text="A\n0.8060667366\n")$A, 0.8060667366)) # turning off sampling for detecting the number of columns #2691 #1812 #4130 #3436 #2727 -test(2214.1, fread(text = paste(rep(c("1,2\n", "1,2,3\n"), each=100), collapse=""), fill=TRUE), data.table(1L, rep(2L, 100)), warning="Stopped early on line 101.*First discarded non-empty line: <<1,2,3>>") -test(2214.2, fread(text = paste(rep(c("1,2\n", "1,2,3\n"), each=100), collapse=""), fill=TRUE), data.table(1L, 2L, rep(c(NA,3L), each=100))) -test(2214.3, fread(text = paste(rep(c("1,2\n", "1,2,3\n"), each=100), collapse=""), fill=TRUE, select = 1:2), data.table(1L, rep(2L, 200))) -test(2214.4, fread(text = paste(rep(c("1,2\n", "1,2,3\n"), each=100), collapse=""), fill=FALSE), error="sample.fill=FALSE cannot be used without fill=TRUE.") +test(2214.1, fread(text = paste(rep(c("1,2\n", "1,2,3\n"), each=100), collapse=""), fill=FALSE), data.table(1L, rep(2L, 100)), warning="Stopped early on line 101.*First discarded non-empty line: <<1,2,3>>") +test(2214.3, fread(text = paste(rep(c("1,2\n", "1,2,3\n"), each=100), collapse=""), fill=TRUE), data.table(1L, rep(2L, 100)), warning="Stopped early on line 101.*First discarded non-empty line: <<1,2,3>>") +test(2214.3, fread(text = paste(rep(c("1,2\n", "1,2,3\n"), each=100), collapse=""), fill=2L), data.table(1L, rep(2L, 100)), warning="Stopped early on line 101.*First discarded non-empty line: <<1,2,3>>") +test(2214.4, fread(text = paste(rep(c("1,2\n", "1,2,3\n"), each=100), collapse=""), fill=3L), data.table(1L, 2L, rep(c(NA,3L), each=100))) +test(2214.5, fread(text = paste(rep(c("1,2\n", "1,2,3\n"), each=100), collapse=""), fill=3L), data.table(1L, 2L, rep(c(NA,3L), each=100), NA, NA)) # too optimistic bound +test(2214.6, fread(text = paste(rep(c("1,2\n", "1,2,3\n"), each=100), collapse=""), fill=3L, select = 1:2), data.table(1L, rep(2L, 200))) diff --git a/man/fread.Rd b/man/fread.Rd index c7b7da8566..b61693cc63 100644 --- a/man/fread.Rd +++ b/man/fread.Rd @@ -53,7 +53,7 @@ yaml=FALSE, autostart=NA, tmpdir=tempdir(), tz="UTC" \item{encoding}{ default is \code{"unknown"}. Other possible options are \code{"UTF-8"} and \code{"Latin-1"}. Note: it is not used to re-encode the input, rather enables handling of encoded strings in their native encoding. } \item{quote}{ By default (\code{"\""}), if a field starts with a double quote, \code{fread} handles embedded quotes robustly as explained under \code{Details}. If it fails, then another attempt is made to read the field \emph{as is}, i.e., as if quotes are disabled. By setting \code{quote=""}, the field is always read as if quotes are disabled. It is not expected to ever need to pass anything other than \"\" to quote; i.e., to turn it off. } \item{strip.white}{ default is \code{TRUE}. Strips leading and trailing whitespaces of unquoted fields. If \code{FALSE}, only header trailing spaces are removed. } - \item{fill}{logical (default is \code{FALSE}). If \code{TRUE} then in case the rows have unequal length, blank fields are implicitly filled.} + \item{fill}{logical or integer (default is \code{FALSE}). If \code{TRUE} then in case the rows have unequal length, number of columns is estimated and blank fields are implicitly filled. If an integer is provided this is used as upper bound guess for the number of columns. } \item{blank.lines.skip}{\code{logical}, default is \code{FALSE}. If \code{TRUE} blank lines in the input are ignored.} \item{key}{Character vector of one or more column names which is passed to \code{\link{setkey}}. It may be a single comma separated string such as \code{key="x,y,z"}, or a vector of names such as \code{key=c("x","y","z")}. Only valid when argument \code{data.table=TRUE}. Where applicable, this should refer to column names given in \code{col.names}. } \item{index}{ Character vector or list of character vectors of one or more column names which is passed to \code{\link{setindexv}}. As with \code{key}, comma-separated notation like \code{index="x,y,z"} is accepted for convenience. Only valid when argument \code{data.table=TRUE}. Where applicable, this should refer to column names given in \code{col.names}. } diff --git a/src/fread.c b/src/fread.c index 23eea87fed..2077938195 100644 --- a/src/fread.c +++ b/src/fread.c @@ -1728,7 +1728,7 @@ int freadMain(freadMainArgs _args) { } sep = topSep; whiteChar = (sep==' ' ? '\t' : (sep=='\t' ? ' ' : 0)); - ncol = fill > 1L ? fill : topNumFields; + ncol = fill > topNumFields ? fill : topNumFields; // overwrite user guess if (fill || sep==127) { // leave pos on the first populated line; that is start of data ch = pos; From 99303e20200a261954af8481525ccbf18a4bfe3e Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger <52290390+ben-schwen@users.noreply.github.com> Date: Tue, 31 Aug 2021 20:50:57 +0200 Subject: [PATCH 09/30] integer as fill argument --- R/fread.R | 3 ++- inst/tests/tests.Rraw | 12 ++++++------ src/fread.c | 6 +++--- 3 files changed, 11 insertions(+), 10 deletions(-) diff --git a/R/fread.R b/R/fread.R index 1bbba9bf4b..2b06e810e9 100644 --- a/R/fread.R +++ b/R/fread.R @@ -22,11 +22,12 @@ yaml=FALSE, autostart=NA, tmpdir=tempdir(), tz="UTC") stopf("Argument 'encoding' must be 'unknown', 'UTF-8' or 'Latin-1'.") } stopifnot( - isTRUEorFALSE(strip.white), isTRUEorFALSE(blank.lines.skip), (isTRUEorFALSE(fill)||is.numeric(fill)), isTRUEorFALSE(showProgress), + isTRUEorFALSE(strip.white), isTRUEorFALSE(blank.lines.skip), isTRUEorFALSE(fill) || is.numeric(fill) && length(fill)==1L && fill >= 0.0, isTRUEorFALSE(showProgress), isTRUEorFALSE(verbose), isTRUEorFALSE(check.names), isTRUEorFALSE(logical01), isTRUEorFALSE(keepLeadingZeros), isTRUEorFALSE(yaml), isTRUEorFALSE(stringsAsFactors) || (is.double(stringsAsFactors) && length(stringsAsFactors)==1L && 0.0<=stringsAsFactors && stringsAsFactors<=1.0), is.numeric(nrows), length(nrows)==1L ) + fill=as.integer(fill) nrows=as.double(nrows) #4686 if (is.na(nrows) || nrows<0) nrows=Inf # accept -1 to mean Inf, as read.table does if (identical(header,"auto")) header=NA diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 75d79e65cc..dc5b62477e 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -18142,9 +18142,9 @@ test(2215.1, DT["b", B], 5L) # has worked forever test(2215.2, DT[factor("b"), B], 5L) # now works too, joining fact/fact, char/fact and fact/char have plenty of tests # fread(...,fill) can also be used to specify a guess on the maximum number of columns #2691 #1812 #4130 #3436 #2727 -test(2216.1, fread(text = paste(rep(c("1,2\n", "1,2,3\n"), each=100), collapse=""), fill=FALSE), data.table(1L, rep(2L, 100)), warning="Stopped early on line 101.*First discarded non-empty line: <<1,2,3>>") -test(2216.3, fread(text = paste(rep(c("1,2\n", "1,2,3\n"), each=100), collapse=""), fill=TRUE), data.table(1L, rep(2L, 100)), warning="Stopped early on line 101.*First discarded non-empty line: <<1,2,3>>") -test(2216.3, fread(text = paste(rep(c("1,2\n", "1,2,3\n"), each=100), collapse=""), fill=2L), data.table(1L, rep(2L, 100)), warning="Stopped early on line 101.*First discarded non-empty line: <<1,2,3>>") -test(2216.4, fread(text = paste(rep(c("1,2\n", "1,2,3\n"), each=100), collapse=""), fill=3L), data.table(1L, 2L, rep(c(NA,3L), each=100))) -test(2216.5, fread(text = paste(rep(c("1,2\n", "1,2,3\n"), each=100), collapse=""), fill=3L), data.table(1L, 2L, rep(c(NA,3L), each=100), NA, NA)) # too optimistic bound -test(2216.6, fread(text = paste(rep(c("1,2\n", "1,2,3\n"), each=100), collapse=""), fill=3L, select = 1:2), data.table(1L, rep(2L, 200))) +dt_str = paste(rep(c("1,2\n", "1,2,3\n"), each=100), collapse="") +test(2216.1, fread(text = dt_str, fill=FALSE), data.table(1L, rep(2L, 100)), warning="Stopped early on line 101.*First discarded non-empty line: <<1,2,3>>") +test(2216.2, fread(text = dt_str, fill=TRUE), data.table(1L, rep(2L, 100)), warning="Stopped early on line 101.*First discarded non-empty line: <<1,2,3>>") +test(2216.3, fread(text = dt_str, fill=2L), data.table(1L, rep(2L, 100)), warning="Stopped early on line 101.*First discarded non-empty line: <<1,2,3>>") +test(2216.4, fread(text = dt_str, fill=3), data.table(1L, 2L, rep(c(NA,3L), each=100))) +test(2216.5, fread(text = dt_str, fill=5L), data.table(1L, 2L, rep(c(NA,3L), each=100), NA, NA)) # too optimistic bound diff --git a/src/fread.c b/src/fread.c index 0a89f6f198..60aced58c6 100644 --- a/src/fread.c +++ b/src/fread.c @@ -2591,9 +2591,9 @@ int freadMain(freadMainArgs _args) { else { ch = headPos; int tt = countfields(&ch); - if (fill) { - DTWARN(_("Stopped early on line %"PRIu64". Expected %d fields but found %d. Consider additional sample.fill=FALSE. First discarded non-empty line: <<%s>>"), - (uint64_t)DTi+row1line, ncol, tt, strlim(skippedFooter,500)); + if (fill==1L) { + DTWARN(_("Stopped early on line %"PRIu64". Expected %d fields but found %d. Consider fill=%d or higher number. First discarded non-empty line: <<%s>>"), + (uint64_t)DTi+row1line, ncol, tt, tt, strlim(skippedFooter,500)); } else { DTWARN(_("Stopped early on line %"PRIu64". Expected %d fields but found %d. Consider fill=TRUE and comment.char=. First discarded non-empty line: <<%s>>"), (uint64_t)DTi+row1line, ncol, tt, strlim(skippedFooter,500)); From 7bc34e3c88aebb327dbf92fa29abdfe29a211976 Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger <52290390+ben-schwen@users.noreply.github.com> Date: Tue, 31 Aug 2021 20:58:13 +0200 Subject: [PATCH 10/30] fix typo --- src/fread.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/fread.c b/src/fread.c index 60aced58c6..44946e6b69 100644 --- a/src/fread.c +++ b/src/fread.c @@ -1728,7 +1728,7 @@ int freadMain(freadMainArgs _args) { } sep = topSep; whiteChar = (sep==' ' ? '\t' : (sep=='\t' ? ' ' : 0)); - ncol = fill > topNumFields ? fill : topNumFields; // overwrite user guess if estimated number if higher + ncol = fill > topNumFields ? fill : topNumFields; // overwrite user guess if estimated number is higher if (fill || sep==127) { // leave pos on the first populated line; that is start of data ch = pos; From 62ea4e7ba0d3d19ff458f37a0a3e1b228cbca0db Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger <52290390+ben-schwen@users.noreply.github.com> Date: Tue, 31 Aug 2021 21:11:28 +0200 Subject: [PATCH 11/30] fix L --- src/fread.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/fread.c b/src/fread.c index 44946e6b69..e9a4b6c71d 100644 --- a/src/fread.c +++ b/src/fread.c @@ -55,7 +55,7 @@ static bool any_number_like_NAstrings=false; static bool blank_is_a_NAstring=false; static bool stripWhite=true; // only applies to character columns; numeric fields always stripped static bool skipEmptyLines=false; -static int fill=0L; +static int fill=0; static double NA_FLOAT64; // takes fread.h:NA_FLOAT64_VALUE @@ -162,7 +162,7 @@ bool freadCleanup(void) stripWhite = true; skipEmptyLines = false; eol_one_r = false; - fill = 0L; + fill = 0; // following are borrowed references: do not free sof = eof = NULL; NAstrings = NULL; @@ -1596,7 +1596,7 @@ int freadMain(freadMainArgs _args) { if (eol(&ch)) ch++; } firstJumpEnd = ch; // size of first 100 lines in bytes is used later for nrow estimate - fill = 1L; // so that blank lines are read as empty + fill = 1; // so that blank lines are read as empty ch = pos; } else { int nseps; @@ -2591,7 +2591,7 @@ int freadMain(freadMainArgs _args) { else { ch = headPos; int tt = countfields(&ch); - if (fill==1L) { + if (fill==1) { DTWARN(_("Stopped early on line %"PRIu64". Expected %d fields but found %d. Consider fill=%d or higher number. First discarded non-empty line: <<%s>>"), (uint64_t)DTi+row1line, ncol, tt, tt, strlim(skippedFooter,500)); } else { From c12bb77279d207ce07a0fe3353c4da55ba4f3677 Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger <52290390+ben-schwen@users.noreply.github.com> Date: Tue, 31 Aug 2021 21:32:14 +0200 Subject: [PATCH 12/30] add NEWS --- NEWS.md | 2 ++ src/fread.c | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/NEWS.md b/NEWS.md index 30866667b9..0801b47145 100644 --- a/NEWS.md +++ b/NEWS.md @@ -141,6 +141,8 @@ 26. `base::droplevels()` gains a fast method for `data.table`, [#647](https://github.com/Rdatatable/data.table/issues/647). Thanks to Steve Lianoglou for requesting, and Jan Gorecki and Benjamin Schwendinger for the PR. `fdroplevels()` for use on vectors has also been added. +27. `fread(..., fill=FALSE)` now also accepts an `integer` in addition to boolean values. `fread` with `fill=TRUE` stops reading when the automacially estimated number of columns is too low. Providing an `integer` as argument for `fill` serves as estimate for the number of columns. Thanks to @christellacaze for requesting, and Benjamin Schwendinger for the PR. + ## BUG FIXES 1. `by=.EACHI` when `i` is keyed but `on=` different columns than `i`'s key could create an invalidly keyed result, [#4603](https://github.com/Rdatatable/data.table/issues/4603) [#4911](https://github.com/Rdatatable/data.table/issues/4911). Thanks to @myoung3 and @adamaltmejd for reporting, and @ColeMiller1 for the PR. An invalid key is where a `data.table` is marked as sorted by the key columns but the data is not sorted by those columns, leading to incorrect results from subsequent queries. diff --git a/src/fread.c b/src/fread.c index e9a4b6c71d..4c2b97a73a 100644 --- a/src/fread.c +++ b/src/fread.c @@ -2592,7 +2592,7 @@ int freadMain(freadMainArgs _args) { ch = headPos; int tt = countfields(&ch); if (fill==1) { - DTWARN(_("Stopped early on line %"PRIu64". Expected %d fields but found %d. Consider fill=%d or higher number. First discarded non-empty line: <<%s>>"), + DTWARN(_("Stopped early on line %"PRIu64". Expected %d fields but found %d. Consider fill=%d or even higher ncol estimate. First discarded non-empty line: <<%s>>"), (uint64_t)DTi+row1line, ncol, tt, tt, strlim(skippedFooter,500)); } else { DTWARN(_("Stopped early on line %"PRIu64". Expected %d fields but found %d. Consider fill=TRUE and comment.char=. First discarded non-empty line: <<%s>>"), From a189b734cb9401af7801184a8ef41e49496c7a51 Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger <52290390+ben-schwen@users.noreply.github.com> Date: Tue, 31 Aug 2021 21:53:08 +0200 Subject: [PATCH 13/30] update verbose --- src/fread.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/fread.c b/src/fread.c index 4c2b97a73a..d321e0a897 100644 --- a/src/fread.c +++ b/src/fread.c @@ -1747,7 +1747,11 @@ int freadMain(freadMainArgs _args) { DTPRINT(_(" Detected %d columns on line %d. This line is either column names or first data row. Line starts as: <<%s>>\n"), tt, row1line, strlim(pos, 30)); DTPRINT(_(" Quote rule picked = %d\n"), quoteRule); - DTPRINT(_(" fill=%s and the most number of columns found is %d\n"), fill?"true":"false", ncol); + if (fill > 1) { + DTPRINT(_(" fill=%d was provided by the user and the most number of columns found is %d\n"), fill, topNumFields); + } else { + DTPRINT(_(" fill=%s and the most number of columns found is %d\n"), fill?"true":"false", ncol); + } } if (ncol==1 && lastEOLreplaced && (eof[-1]=='\n' || eof[-1]=='\r')) { From de8ff8554afbdfdfa36861d2fe670f56e0bb9810 Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger <52290390+ben-schwen@users.noreply.github.com> Date: Tue, 31 Aug 2021 22:02:32 +0200 Subject: [PATCH 14/30] undo verbose --- src/fread.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/src/fread.c b/src/fread.c index d321e0a897..4c2b97a73a 100644 --- a/src/fread.c +++ b/src/fread.c @@ -1747,11 +1747,7 @@ int freadMain(freadMainArgs _args) { DTPRINT(_(" Detected %d columns on line %d. This line is either column names or first data row. Line starts as: <<%s>>\n"), tt, row1line, strlim(pos, 30)); DTPRINT(_(" Quote rule picked = %d\n"), quoteRule); - if (fill > 1) { - DTPRINT(_(" fill=%d was provided by the user and the most number of columns found is %d\n"), fill, topNumFields); - } else { - DTPRINT(_(" fill=%s and the most number of columns found is %d\n"), fill?"true":"false", ncol); - } + DTPRINT(_(" fill=%s and the most number of columns found is %d\n"), fill?"true":"false", ncol); } if (ncol==1 && lastEOLreplaced && (eof[-1]=='\n' || eof[-1]=='\r')) { From d363f94d2607c6d3b670f48ae0fa9234bfa821e4 Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger <52290390+ben-schwen@users.noreply.github.com> Date: Sun, 31 Oct 2021 17:47:53 +0100 Subject: [PATCH 15/30] init cleanup --- src/fread.c | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/src/fread.c b/src/fread.c index 4c2b97a73a..04fe777aa1 100644 --- a/src/fread.c +++ b/src/fread.c @@ -2104,6 +2104,7 @@ int freadMain(freadMainArgs _args) { int nTypeBump=0, nTypeBumpCols=0; double tRead=0, tReread=0; double thRead=0, thPush=0; // reductions of timings within the parallel region + int max_col=0; char *typeBumpMsg=NULL; size_t typeBumpMsgSize=0; int typeCounts[NUMTYPE]; // used for verbose output; needs populating after first read and before reread (if any) -- see later comment #define internalErrSize 1000 @@ -2197,7 +2198,7 @@ int freadMain(freadMainArgs _args) { } prepareThreadContext(&ctx); - #pragma omp for ordered schedule(dynamic) reduction(+:thRead,thPush) + #pragma omp for ordered schedule(dynamic) reduction(+:thRead,thPush) reduction(max:max_col) for (int jump = jump0; jump < nJumps; jump++) { if (stopTeam) continue; // must continue and not break. We desire not to depend on (relatively new) omp cancel directive, yet double tLast = 0.0; // thread local wallclock time at last measuring point for verbose mode only. @@ -2278,6 +2279,7 @@ int freadMain(freadMainArgs _args) { tch++; j++; } + if (j > max_col) max_col = j; //*** END HOT. START TEPID ***// if (tch==tLineStart) { skip_white(&tch); // skips \0 before eof @@ -2289,6 +2291,7 @@ int freadMain(freadMainArgs _args) { int8_t thisSize = size[j]; if (thisSize) ((char **) targets)[thisSize] += thisSize; j++; + if (j > max_col) max_col = j; if (j==ncol) { tch++; myNrow++; continue; } // next line. Back up to while (tch1 && max_col Date: Sun, 31 Oct 2021 17:51:04 +0100 Subject: [PATCH 16/30] fix typo news --- NEWS.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/NEWS.md b/NEWS.md index 79c601d94b..ebecde579e 100644 --- a/NEWS.md +++ b/NEWS.md @@ -207,7 +207,7 @@ # v1.14.4 0.4826 0.5586 0.6586 0.6329 0.7348 1.318 100 ``` -27. `fread(..., fill=FALSE)` now also accepts an `integer` in addition to boolean values. `fread` with `fill=TRUE` stops reading when the automacially estimated number of columns is too low. Providing an `integer` as argument for `fill` serves as estimate for the number of columns. Thanks to @christellacaze for requesting, and Benjamin Schwendinger for the PR. +27. `fread(..., fill=FALSE)` now also accepts an `integer` in addition to boolean values. `fread` with `fill=TRUE` stops reading when the automatically estimated number of columns is too low. Providing an `integer` as argument for `fill` serves as estimate for the number of columns. Thanks to @christellacaze for requesting, and Benjamin Schwendinger for the PR. ## BUG FIXES From 826c29bc8e760d28afc11e33dd100a794ff28491 Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger <52290390+ben-schwen@users.noreply.github.com> Date: Sun, 31 Oct 2021 17:53:58 +0100 Subject: [PATCH 17/30] renum NEWS --- NEWS.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/NEWS.md b/NEWS.md index ebecde579e..ab119b1e29 100644 --- a/NEWS.md +++ b/NEWS.md @@ -207,7 +207,7 @@ # v1.14.4 0.4826 0.5586 0.6586 0.6329 0.7348 1.318 100 ``` -27. `fread(..., fill=FALSE)` now also accepts an `integer` in addition to boolean values. `fread` with `fill=TRUE` stops reading when the automatically estimated number of columns is too low. Providing an `integer` as argument for `fill` serves as estimate for the number of columns. Thanks to @christellacaze for requesting, and Benjamin Schwendinger for the PR. +31. `fread(..., fill=FALSE)` now also accepts an `integer` in addition to boolean values. `fread` with `fill=TRUE` stops reading when the automatically estimated number of columns is too low. Providing an `integer` as argument for `fill` serves as estimate for the number of columns. Thanks to @christellacaze for requesting, and Benjamin Schwendinger for the PR. ## BUG FIXES From 2b1df578faa65db8bd0149479ff43dfce129ff54 Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger <52290390+ben-schwen@users.noreply.github.com> Date: Sun, 31 Oct 2021 20:13:11 +0100 Subject: [PATCH 18/30] add proper cleanup of overallocated columns --- inst/tests/tests.Rraw | 3 ++- src/fread.c | 13 +++++++++++-- src/fread.h | 5 +++++ src/freadR.c | 12 +++++++++++- 4 files changed, 29 insertions(+), 4 deletions(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index ec14bb7873..953fddaeaa 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -18354,4 +18354,5 @@ test(2226.1, fread(text = dt_str, fill=FALSE), data.table(1L, rep(2L, 100)), war test(2226.2, fread(text = dt_str, fill=TRUE), data.table(1L, rep(2L, 100)), warning="Stopped early on line 101.*First discarded non-empty line: <<1,2,3>>") test(2226.3, fread(text = dt_str, fill=2L), data.table(1L, rep(2L, 100)), warning="Stopped early on line 101.*First discarded non-empty line: <<1,2,3>>") test(2226.4, fread(text = dt_str, fill=3), data.table(1L, 2L, rep(c(NA,3L), each=100))) -test(2226.5, fread(text = dt_str, fill=5L), data.table(1L, 2L, rep(c(NA,3L), each=100), NA, NA)) # too optimistic bound +test(2226.5, fread(text = dt_str, fill=5L), data.table(1L, 2L, rep(c(NA,3L), each=100))) +test(2226.6, fread(text = dt_str, fill=1000L), data.table(1L, 2L, rep(c(NA,3L), each=100))) diff --git a/src/fread.c b/src/fread.c index b505acd639..51c016ec28 100644 --- a/src/fread.c +++ b/src/fread.c @@ -56,6 +56,7 @@ static bool blank_is_a_NAstring=false; static bool stripWhite=true; // only applies to character columns; numeric fields always stripped static bool skipEmptyLines=false; static int fill=0; +static int *dropFill = NULL; static double NA_FLOAT64; // takes fread.h:NA_FLOAT64_VALUE @@ -136,6 +137,7 @@ bool freadCleanup(void) free(tmpType); tmpType = NULL; free(size); size = NULL; free(colNames); colNames = NULL; + free(dropFill); dropFill = NULL; if (mmp != NULL) { // Important to unmap as OS keeps internal reference open on file. Process is not exiting as // we're a .so/.dll here. If this was a process exiting we wouldn't need to unmap. @@ -2494,14 +2496,21 @@ int freadMain(freadMainArgs _args) { // cleanup since fill argument for number of columns was too high if (fill>1 && max_col Date: Sun, 31 Oct 2021 22:02:47 +0100 Subject: [PATCH 19/30] add tests and coverage --- inst/tests/tests.Rraw | 20 ++++++++++++++------ src/fread.c | 3 ++- 2 files changed, 16 insertions(+), 7 deletions(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 953fddaeaa..1fc2fb1ba2 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -18350,9 +18350,17 @@ test(2225.2, groupingsets(data.table(iris), j=mean(Sepal.Length), by=c('Sp'='Spe # fread(...,fill) can also be used to specify a guess on the maximum number of columns #2691 #1812 #4130 #3436 #2727 dt_str = paste(rep(c("1,2\n", "1,2,3\n"), each=100), collapse="") -test(2226.1, fread(text = dt_str, fill=FALSE), data.table(1L, rep(2L, 100)), warning="Stopped early on line 101.*First discarded non-empty line: <<1,2,3>>") -test(2226.2, fread(text = dt_str, fill=TRUE), data.table(1L, rep(2L, 100)), warning="Stopped early on line 101.*First discarded non-empty line: <<1,2,3>>") -test(2226.3, fread(text = dt_str, fill=2L), data.table(1L, rep(2L, 100)), warning="Stopped early on line 101.*First discarded non-empty line: <<1,2,3>>") -test(2226.4, fread(text = dt_str, fill=3), data.table(1L, 2L, rep(c(NA,3L), each=100))) -test(2226.5, fread(text = dt_str, fill=5L), data.table(1L, 2L, rep(c(NA,3L), each=100))) -test(2226.6, fread(text = dt_str, fill=1000L), data.table(1L, 2L, rep(c(NA,3L), each=100))) +test(2226.01, fread(text = dt_str, fill=FALSE), data.table(1L, rep(2L, 100)), warning="Stopped early on line 101.*First discarded non-empty line: <<1,2,3>>") +test(2226.02, fread(text = dt_str, fill=TRUE), data.table(1L, rep(2L, 100)), warning="Stopped early on line 101.*First discarded non-empty line: <<1,2,3>>") +test(2226.03, fread(text = dt_str, fill=2L), data.table(1L, rep(2L, 100)), warning="Stopped early on line 101.*First discarded non-empty line: <<1,2,3>>") +test(2226.04, fread(text = dt_str, fill=3), data.table(1L, 2L, rep(c(NA,3L), each=100))) +test(2226.05, fread(text = dt_str, fill=5L, verbose=TRUE), data.table(1L, 2L, rep(c(NA,3L), each=100)), output="Provided number of fill columns: 5 but only found 3\n Dropping 2 overallocated columns") # user guess slightly too big +test(2226.06, fread(text = dt_str, fill=1000L), data.table(1L, 2L, rep(c(NA,3L), each=100))) # user guess much too big +# 2691 +text = "12223, University\n12227, bridge, Sky\n12828, Sunset\n13801, Ground\n14853, Tranceamerica\n14854, San Francisco\n15595, shibuya, Shrine\n16126, fog, San Francisco\n16520, California, ocean, summer, golden gate, beach, San Francisco\n" +test(2226.07, dim(fread(text)), c(6L, 3L), warning=c("fill=TRUE", "Discarded")) +test(2226.08, dim(fread(text, fill=TRUE)), c(9L, 9L)) +text = "12223, University\n12227, bridge, Sky\n12828, Sunset\n13801, Ground\n14853, Tranceamerica\n16520, California, ocean, summer, golden gate, beach, San Francisco\n14854, San Francisco\n15595, shibuya, Shrine\n16126, fog, San Francisco\n" +test(2226.09, dim(fread(text)), c(3L, 3L), warning=c("fill=TRUE", "fill=7")) +test(2226.10, dim(fread(text, fill=TRUE)), c(9L, 9L)) +test(2226.11, dim(fread(text, fill=7)), c(9L, 9L)) diff --git a/src/fread.c b/src/fread.c index 51c016ec28..3d028f4494 100644 --- a/src/fread.c +++ b/src/fread.c @@ -2400,6 +2400,7 @@ int freadMain(freadMainArgs _args) { int8_t thisSize = size[j]; if (thisSize) ((char**) targets)[size[j]] += size[j]; // 'if' to avoid undefined NULL+=0 when rereading j++; + if (j > max_col) max_col = j; if (*tch==sep) { tch++; continue; } if (fill && (*tch=='\n' || *tch=='\r' || tch==eof) && j0) { DTWARN(_("Stopped early on line %"PRIu64". Expected %d fields but found %d. Consider fill=%d or even higher ncol estimate. First discarded non-empty line: <<%s>>"), (uint64_t)DTi+row1line, ncol, tt, tt, strlim(skippedFooter,500)); } else { From 386a6819eeb5d55626bf431c68e3b29beb351e12 Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger <52290390+ben-schwen@users.noreply.github.com> Date: Sun, 31 Oct 2021 22:52:32 +0100 Subject: [PATCH 20/30] fix tests --- inst/tests/tests.Rraw | 8 -------- src/fread.c | 2 +- 2 files changed, 1 insertion(+), 9 deletions(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 1fc2fb1ba2..ca2f7eaef3 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -18356,11 +18356,3 @@ test(2226.03, fread(text = dt_str, fill=2L), data.table(1L, rep(2L, 100)), warni test(2226.04, fread(text = dt_str, fill=3), data.table(1L, 2L, rep(c(NA,3L), each=100))) test(2226.05, fread(text = dt_str, fill=5L, verbose=TRUE), data.table(1L, 2L, rep(c(NA,3L), each=100)), output="Provided number of fill columns: 5 but only found 3\n Dropping 2 overallocated columns") # user guess slightly too big test(2226.06, fread(text = dt_str, fill=1000L), data.table(1L, 2L, rep(c(NA,3L), each=100))) # user guess much too big -# 2691 -text = "12223, University\n12227, bridge, Sky\n12828, Sunset\n13801, Ground\n14853, Tranceamerica\n14854, San Francisco\n15595, shibuya, Shrine\n16126, fog, San Francisco\n16520, California, ocean, summer, golden gate, beach, San Francisco\n" -test(2226.07, dim(fread(text)), c(6L, 3L), warning=c("fill=TRUE", "Discarded")) -test(2226.08, dim(fread(text, fill=TRUE)), c(9L, 9L)) -text = "12223, University\n12227, bridge, Sky\n12828, Sunset\n13801, Ground\n14853, Tranceamerica\n16520, California, ocean, summer, golden gate, beach, San Francisco\n14854, San Francisco\n15595, shibuya, Shrine\n16126, fog, San Francisco\n" -test(2226.09, dim(fread(text)), c(3L, 3L), warning=c("fill=TRUE", "fill=7")) -test(2226.10, dim(fread(text, fill=TRUE)), c(9L, 9L)) -test(2226.11, dim(fread(text, fill=7)), c(9L, 9L)) diff --git a/src/fread.c b/src/fread.c index 3d028f4494..01bc9342ac 100644 --- a/src/fread.c +++ b/src/fread.c @@ -175,6 +175,7 @@ bool freadCleanup(void) static inline uint64_t umax(uint64_t a, uint64_t b) { return a > b ? a : b; } static inline uint64_t umin(uint64_t a, uint64_t b) { return a < b ? a : b; } static inline int64_t imin( int64_t a, int64_t b) { return a < b ? a : b; } +static inline int i32min( int a, int b) { return a < b ? a : b; } /** Return value of `x` clamped to the range [upper, lower] */ static inline int64_t clamp_szt(int64_t x, int64_t lower, int64_t upper) { @@ -2400,7 +2401,6 @@ int freadMain(freadMainArgs _args) { int8_t thisSize = size[j]; if (thisSize) ((char**) targets)[size[j]] += size[j]; // 'if' to avoid undefined NULL+=0 when rereading j++; - if (j > max_col) max_col = j; if (*tch==sep) { tch++; continue; } if (fill && (*tch=='\n' || *tch=='\r' || tch==eof) && j Date: Sun, 31 Oct 2021 22:57:44 +0100 Subject: [PATCH 21/30] add tests --- inst/tests/tests.Rraw | 10 ++++++++++ src/fread.c | 2 +- 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index ca2f7eaef3..2bdafe2b8c 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -18356,3 +18356,13 @@ test(2226.03, fread(text = dt_str, fill=2L), data.table(1L, rep(2L, 100)), warni test(2226.04, fread(text = dt_str, fill=3), data.table(1L, 2L, rep(c(NA,3L), each=100))) test(2226.05, fread(text = dt_str, fill=5L, verbose=TRUE), data.table(1L, 2L, rep(c(NA,3L), each=100)), output="Provided number of fill columns: 5 but only found 3\n Dropping 2 overallocated columns") # user guess slightly too big test(2226.06, fread(text = dt_str, fill=1000L), data.table(1L, 2L, rep(c(NA,3L), each=100))) # user guess much too big +# 2691 +text = "12223, University\n12227, bridge, Sky\n12828, Sunset\n13801, Ground\n14853, Tranceamerica\n14854, San Francisco\n15595, shibuya, Shrine\n16126, fog, San Francisco\n16520, California, ocean, summer, golden gate, beach, San Francisco\n" +test(2226.07, dim(fread(text)), c(6L, 3L), warning=c("fill=TRUE", "Discarded")) +test(2226.08, dim(fread(text, fill=TRUE)), c(9L, 9L)) +text = "12223, University\n12227, bridge, Sky\n12828, Sunset\n13801, Ground\n14853, Tranceamerica\n16520, California, ocean, summer, golden gate, beach, San Francisco\n14854, San Francisco\n15595, shibuya, Shrine\n16126, fog, San Francisco\n" +test(2226.09, dim(fread(text)), c(3L, 3L), warning=c("fill=TRUE", "fill=7")) +test(2226.10, dim(fread(text, fill=TRUE)), c(9L, 9L)) +test(2226.11, dim(fread(text, fill=7)), c(9L, 9L)) +test(2226.12, dim(fread(text, fill=9)), c(9L, 9L)) +test(2226.13, dim(fread(text, fill=20)), c(9L, 20L)) # clean up currently only kicks in if sep!=' ' diff --git a/src/fread.c b/src/fread.c index 01bc9342ac..0eafaa108d 100644 --- a/src/fread.c +++ b/src/fread.c @@ -2496,7 +2496,7 @@ int freadMain(freadMainArgs _args) { //-- end parallel ------------------ // cleanup since fill argument for number of columns was too high - if (fill>1 && max_col1 && max_col0) { int ndropFill = ncol - max_col; if (verbose) { DTPRINT(_(" Provided number of fill columns: %d but only found %d\n"), ncol, max_col); From 1aa07125454c69e542c6fab7615c6369ccfbf7f6 Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger <52290390+ben-schwen@users.noreply.github.com> Date: Mon, 1 Nov 2021 00:54:07 +0100 Subject: [PATCH 22/30] cleanup --- src/fread.c | 1 - 1 file changed, 1 deletion(-) diff --git a/src/fread.c b/src/fread.c index 0eafaa108d..08424c825a 100644 --- a/src/fread.c +++ b/src/fread.c @@ -175,7 +175,6 @@ bool freadCleanup(void) static inline uint64_t umax(uint64_t a, uint64_t b) { return a > b ? a : b; } static inline uint64_t umin(uint64_t a, uint64_t b) { return a < b ? a : b; } static inline int64_t imin( int64_t a, int64_t b) { return a < b ? a : b; } -static inline int i32min( int a, int b) { return a < b ? a : b; } /** Return value of `x` clamped to the range [upper, lower] */ static inline int64_t clamp_szt(int64_t x, int64_t lower, int64_t upper) { From aa2c3aaeb078f87e6350971df62206c8b6b0352c Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger Date: Fri, 5 Jan 2024 14:21:56 +0100 Subject: [PATCH 23/30] update NEWS --- NEWS.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/NEWS.md b/NEWS.md index bc7f467cd3..8fcdf05c90 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,5 @@ +99. `fread(..., fill=FALSE)` now also accepts an `integer` in addition to boolean values. `fread` with `fill=TRUE` stops reading when the automatically estimated number of columns is too low, [#2727](https://github.com/Rdatatable/data.table/issues/2727) [#2691](https://github.com/Rdatatable/data.table/issues/2691) [#4130](https://github.com/Rdatatable/data.table/issues/4130) [#3436](https://github.com/Rdatatable/data.table/issues/3436). Providing an `integer` as argument for `fill` serves as estimate for the number of columns, [#1812](https://github.com/Rdatatable/data.table/issues/1812) [#5378](https://github.com/Rdatatable/data.table/issues/5378). Thanks to @jangorecki, @christellacaze, @Yiguan, @alexdthomas, @ibombonato, @Befrancesco, @TobiasGold for reporting/requesting, and Benjamin Schwendinger for the PR. + **If you are viewing this file on CRAN, please check [latest news on GitHub](https://github.com/Rdatatable/data.table/blob/master/NEWS.md) where the formatting is also better.** # data.table [v1.14.99](https://github.com/Rdatatable/data.table/milestone/29) (in development) From 2066bda514f3fe9dfc189116f92b77bd3f48d86f Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger Date: Fri, 5 Jan 2024 14:22:05 +0100 Subject: [PATCH 24/30] update tests --- inst/tests/tests.Rraw | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 7c1ad4074e..1680f897f3 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -18248,13 +18248,13 @@ test(2243.38, dt[, sd(y, na.rm=as.logical(j)), g, verbose=TRUE], data.table( # fread(...,fill) can also be used to specify a guess on the maximum number of columns #2691 #1812 #4130 #3436 #2727 dt_str = paste(rep(c("1,2\n", "1,2,3\n"), each=100), collapse="") -test(2244.01, fread(text = dt_str, fill=FALSE), data.table(1L, rep(2L, 100)), warning="Stopped early on line 101.*First discarded non-empty line: <<1,2,3>>") -test(2244.02, fread(text = dt_str, fill=TRUE), data.table(1L, rep(2L, 100)), warning="Stopped early on line 101.*First discarded non-empty line: <<1,2,3>>") -test(2244.03, fread(text = dt_str, fill=2L), data.table(1L, rep(2L, 100)), warning="Stopped early on line 101.*First discarded non-empty line: <<1,2,3>>") -test(2244.04, fread(text = dt_str, fill=3), data.table(1L, 2L, rep(c(NA,3L), each=100))) +test(2244.01, fread(text = dt_str, fill=FALSE), data.table(1L, rep(2L, 100)), warning=".*Consider fill=TRUE.*") +test(2244.02, fread(text = dt_str, fill=TRUE), data.table(1L, rep(2L, 100)), warning=".*Consider fill=3.*") +test(2244.03, fread(text = dt_str, fill=2L), data.table(1L, rep(2L, 100)), warning=".*Consider fill=3.*") +test(2244.04, fread(text = dt_str, fill=3L), data.table(1L, 2L, rep(c(NA,3L), each=100))) test(2244.05, fread(text = dt_str, fill=5L, verbose=TRUE), data.table(1L, 2L, rep(c(NA,3L), each=100)), output="Provided number of fill columns: 5 but only found 3\n Dropping 2 overallocated columns") # user guess slightly too big test(2244.06, fread(text = dt_str, fill=1000L), data.table(1L, 2L, rep(c(NA,3L), each=100))) # user guess much too big -# 2691 +# example from 2691 text = "12223, University\n12227, bridge, Sky\n12828, Sunset\n13801, Ground\n14853, Tranceamerica\n14854, San Francisco\n15595, shibuya, Shrine\n16126, fog, San Francisco\n16520, California, ocean, summer, golden gate, beach, San Francisco\n" test(2244.07, dim(fread(text)), c(6L, 3L), warning=c("fill=TRUE", "Discarded")) test(2244.08, dim(fread(text, fill=TRUE)), c(9L, 9L)) From 7ec8dc872b79b72a3bda2e6227bd18dd60f7f8e0 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Fri, 15 Mar 2024 12:03:47 -0700 Subject: [PATCH 25/30] Refine NEWS --- NEWS.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/NEWS.md b/NEWS.md index 429de97d8a..89520ec86c 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,3 @@ -99. `fread(..., fill=FALSE)` now also accepts an `integer` in addition to boolean values. `fread` with `fill=TRUE` stops reading when the automatically estimated number of columns is too low, [#2727](https://github.com/Rdatatable/data.table/issues/2727) [#2691](https://github.com/Rdatatable/data.table/issues/2691) [#4130](https://github.com/Rdatatable/data.table/issues/4130) [#3436](https://github.com/Rdatatable/data.table/issues/3436). Providing an `integer` as argument for `fill` serves as estimate for the number of columns, [#1812](https://github.com/Rdatatable/data.table/issues/1812) [#5378](https://github.com/Rdatatable/data.table/issues/5378). Thanks to @jangorecki, @christellacaze, @Yiguan, @alexdthomas, @ibombonato, @Befrancesco, @TobiasGold for reporting/requesting, and Benjamin Schwendinger for the PR. - **If you are viewing this file on CRAN, please check [latest news on GitHub](https://github.com/Rdatatable/data.table/blob/master/NEWS.md) where the formatting is also better.** # data.table [v1.15.99](https://github.com/Rdatatable/data.table/milestone/30) (in development) @@ -22,6 +20,8 @@ 4. Namespace-qualifying `data.table::shift()`, `data.table::first()`, or `data.table::last()` will not deactivate GForce, [#5942](https://github.com/Rdatatable/data.table/issues/5942). Thanks @MichaelChirico for the proposal and fix. Namespace-qualifying other calls like `stats::sum()`, `base::prod()`, etc., continue to work as an escape valve to avoid GForce, e.g. to ensure S3 method dispatch. +5. `fread`'s `fill` argument now also accepts an `integer` in addition to boolean values. `fread` always guesses the number of columns based on reading a sample of rows in the file. When `fill=TRUE`, `fread` stops reading when this estimate winds up too low, e.g. when the sampled rows happen to exclude some rows that are even wider, [#2727](https://github.com/Rdatatable/data.table/issues/2727) [#2691](https://github.com/Rdatatable/data.table/issues/2691) [#4130](https://github.com/Rdatatable/data.table/issues/4130) [#3436](https://github.com/Rdatatable/data.table/issues/3436). Providing an `integer` as argument for `fill` allows for a manual estimate of the number of columns instead, [#1812](https://github.com/Rdatatable/data.table/issues/1812) [#5378](https://github.com/Rdatatable/data.table/issues/5378). Thanks to @jangorecki, @christellacaze, @Yiguan, @alexdthomas, @ibombonato, @Befrancesco, @TobiasGold for reporting/requesting, and Benjamin Schwendinger for the PR. + ## BUG FIXES 1. `unique()` returns a copy the case when `nrows(x) <= 1` instead of a mutable alias, [#5932](https://github.com/Rdatatable/data.table/pull/5932). This is consistent with existing `unique()` behavior when the input has no duplicates but more than one row. Thanks to @brookslogan for the report and @dshemetov for the fix. From e50508a66c842ee4917fa38d216c05edbacd99e6 Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger <52290390+ben-schwen@users.noreply.github.com> Date: Thu, 21 Mar 2024 00:16:27 +0100 Subject: [PATCH 26/30] use integer for fill Co-authored-by: Michael Chirico --- R/fread.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/fread.R b/R/fread.R index 936f01490b..b4086d155f 100644 --- a/R/fread.R +++ b/R/fread.R @@ -22,7 +22,7 @@ yaml=FALSE, autostart=NA, tmpdir=tempdir(), tz="UTC") stopf("Argument 'encoding' must be 'unknown', 'UTF-8' or 'Latin-1'.") } stopifnot( - isTRUEorFALSE(strip.white), isTRUEorFALSE(blank.lines.skip), isTRUEorFALSE(fill) || is.numeric(fill) && length(fill)==1L && fill >= 0.0, isTRUEorFALSE(showProgress), + isTRUEorFALSE(strip.white), isTRUEorFALSE(blank.lines.skip), isTRUEorFALSE(fill) || is.numeric(fill) && length(fill)==1L && fill >= 0L, isTRUEorFALSE(showProgress), isTRUEorFALSE(verbose), isTRUEorFALSE(check.names), isTRUEorFALSE(logical01), isTRUEorFALSE(keepLeadingZeros), isTRUEorFALSE(yaml), isTRUEorFALSE(stringsAsFactors) || (is.double(stringsAsFactors) && length(stringsAsFactors)==1L && 0.0<=stringsAsFactors && stringsAsFactors<=1.0), is.numeric(nrows), length(nrows)==1L From c10ddd1a0f89946dbf7b10ef95ce26c83002a314 Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger <52290390+ben-schwen@users.noreply.github.com> Date: Thu, 21 Mar 2024 00:17:35 +0100 Subject: [PATCH 27/30] refine warning Co-authored-by: Michael Chirico --- src/fread.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/fread.c b/src/fread.c index 3a9c97d3ab..a1521fb371 100644 --- a/src/fread.c +++ b/src/fread.c @@ -2637,7 +2637,7 @@ int freadMain(freadMainArgs _args) { ch = headPos; int tt = countfields(&ch); if (fill>0) { - DTWARN(_("Stopped early on line %"PRIu64". Expected %d fields but found %d. Consider fill=%d or even higher ncol estimate. First discarded non-empty line: <<%s>>"), + DTWARN(_("Stopped early on line %"PRIu64". Expected %d fields but found %d. Consider fill=%d or even more based on your knowledge of the input file. First discarded non-empty line: <<%s>>"), (uint64_t)DTi+row1line, ncol, tt, tt, strlim(skippedFooter,500)); } else { DTWARN(_("Stopped early on line %"PRIu64". Expected %d fields but found %d. Consider fill=TRUE and comment.char=. First discarded non-empty line: <<%s>>"), From 0616651aaabac943c96a9d1f49bc1a1156cf098e Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger <52290390+ben-schwen@users.noreply.github.com> Date: Thu, 21 Mar 2024 00:17:56 +0100 Subject: [PATCH 28/30] wording Co-authored-by: Michael Chirico --- man/fread.Rd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/man/fread.Rd b/man/fread.Rd index d9d111f9af..b431969dc6 100644 --- a/man/fread.Rd +++ b/man/fread.Rd @@ -53,7 +53,7 @@ yaml=FALSE, autostart=NA, tmpdir=tempdir(), tz="UTC" \item{encoding}{ default is \code{"unknown"}. Other possible options are \code{"UTF-8"} and \code{"Latin-1"}. Note: it is not used to re-encode the input, rather enables handling of encoded strings in their native encoding. } \item{quote}{ By default (\code{"\""}), if a field starts with a double quote, \code{fread} handles embedded quotes robustly as explained under \code{Details}. If it fails, then another attempt is made to read the field \emph{as is}, i.e., as if quotes are disabled. By setting \code{quote=""}, the field is always read as if quotes are disabled. It is not expected to ever need to pass anything other than \"\" to quote; i.e., to turn it off. } \item{strip.white}{ default is \code{TRUE}. Strips leading and trailing whitespaces of unquoted fields. If \code{FALSE}, only header trailing spaces are removed. } - \item{fill}{logical or integer (default is \code{FALSE}). If \code{TRUE} then in case the rows have unequal length, number of columns is estimated and blank fields are implicitly filled. If an integer is provided this is used as upper bound guess for the number of columns. } + \item{fill}{logical or integer (default is \code{FALSE}). If \code{TRUE} then in case the rows have unequal length, number of columns is estimated and blank fields are implicitly filled. If an integer is provided it is used as an upper bound for the number of columns. } \item{blank.lines.skip}{\code{logical}, default is \code{FALSE}. If \code{TRUE} blank lines in the input are ignored.} \item{key}{Character vector of one or more column names which is passed to \code{\link{setkey}}. It may be a single comma separated string such as \code{key="x,y,z"}, or a vector of names such as \code{key=c("x","y","z")}. Only valid when argument \code{data.table=TRUE}. Where applicable, this should refer to column names given in \code{col.names}. } \item{index}{ Character vector or list of character vectors of one or more column names which is passed to \code{\link{setindexv}}. As with \code{key}, comma-separated notation like \code{index="x,y,z"} is accepted for convenience. Only valid when argument \code{data.table=TRUE}. Where applicable, this should refer to column names given in \code{col.names}. } From 55054e028d3a2ccc5b9e535909cf5852be6d3856 Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger Date: Thu, 21 Mar 2024 00:29:04 +0100 Subject: [PATCH 29/30] test readability --- inst/tests/tests.Rraw | 29 ++++++++++++++++++++--------- 1 file changed, 20 insertions(+), 9 deletions(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index d874d15756..d220c91ab3 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -18333,17 +18333,28 @@ if (test_bit64) { # fread(...,fill) can also be used to specify a guess on the maximum number of columns #2691 #1812 #4130 #3436 #2727 dt_str = paste(rep(c("1,2\n", "1,2,3\n"), each=100), collapse="") -test(2249.01, fread(text = dt_str, fill=FALSE), data.table(1L, rep(2L, 100)), warning=".*Consider fill=TRUE.*") -test(2249.02, fread(text = dt_str, fill=TRUE), data.table(1L, rep(2L, 100)), warning=".*Consider fill=3.*") -test(2249.03, fread(text = dt_str, fill=2L), data.table(1L, rep(2L, 100)), warning=".*Consider fill=3.*") -test(2249.04, fread(text = dt_str, fill=3L), data.table(1L, 2L, rep(c(NA,3L), each=100))) -test(2249.05, fread(text = dt_str, fill=5L, verbose=TRUE), data.table(1L, 2L, rep(c(NA,3L), each=100)), output="Provided number of fill columns: 5 but only found 3\n Dropping 2 overallocated columns") # user guess slightly too big -test(2249.06, fread(text = dt_str, fill=1000L), data.table(1L, 2L, rep(c(NA,3L), each=100))) # user guess much too big -# example from 2691 -text = "12223, University\n12227, bridge, Sky\n12828, Sunset\n13801, Ground\n14853, Tranceamerica\n14854, San Francisco\n15595, shibuya, Shrine\n16126, fog, San Francisco\n16520, California, ocean, summer, golden gate, beach, San Francisco\n" +ans = data.table(1L, 2L, rep(c(NA, 3L), each=100L)) +test(2249.01, fread(text = dt_str, fill=FALSE), ans[1:100, -3L], warning=".*Consider fill=TRUE.*") +test(2249.02, fread(text = dt_str, fill=TRUE), ans[1:100, -3L], warning=".*Consider fill=3.*") +test(2249.03, fread(text = dt_str, fill=2L), ans[1:100, -3L], warning=".*Consider fill=3.*") +test(2249.04, fread(text = dt_str, fill=3L), ans) +test(2249.05, fread(text = dt_str, fill=5L, verbose=TRUE), ans, output="Provided number of fill columns: 5 but only found 3\n Dropping 2 overallocated columns") # user guess slightly too big +test(2249.06, fread(text = dt_str, fill=1000L), ans) # user guess much too big +lines = c( + "12223, University", + "12227, bridge, Sky", + "12828, Sunset", + "13801, Ground", + "14853, Tranceamerica", + "14854, San Francisco", + "15595, shibuya, Shrine", + "16126, fog, San Francisco", + "16520, California, ocean, summer, golden gate, beach, San Francisco", + "") +text = paste(lines, collapse="\n") test(2249.07, dim(fread(text)), c(6L, 3L), warning=c("fill=TRUE", "Discarded")) test(2249.08, dim(fread(text, fill=TRUE)), c(9L, 9L)) -text = "12223, University\n12227, bridge, Sky\n12828, Sunset\n13801, Ground\n14853, Tranceamerica\n16520, California, ocean, summer, golden gate, beach, San Francisco\n14854, San Francisco\n15595, shibuya, Shrine\n16126, fog, San Francisco\n" +text = paste(lines[c(1:5, 9L, 6:8, 10L)], collapse="\n") test(2249.09, dim(fread(text)), c(3L, 3L), warning=c("fill=TRUE", "fill=7")) test(2249.10, dim(fread(text, fill=TRUE)), c(9L, 9L)) test(2249.11, dim(fread(text, fill=7)), c(9L, 9L)) From 5b96e1b29f45de12a9ceadfae4fb172670726252 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Wed, 20 Mar 2024 23:31:16 -0700 Subject: [PATCH 30/30] small tweak to NEWS --- NEWS.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/NEWS.md b/NEWS.md index c46e815415..7110f10e08 100644 --- a/NEWS.md +++ b/NEWS.md @@ -24,7 +24,7 @@ 6. Using `dt[, names(.SD) := lapply(.SD, fx)]` now works, [#795](https://github.com/Rdatatable/data.table/issues/795) -- one of our [most-requested issues (see #3189)](https://github.com/Rdatatable/data.table/issues/3189). Thanks to @brodieG for the report, 20 or so others for chiming in, and @ColeMiller1 for PR. -7. `fread`'s `fill` argument now also accepts an `integer` in addition to boolean values. `fread` always guesses the number of columns based on reading a sample of rows in the file. When `fill=TRUE`, `fread` stops reading when this estimate winds up too low, e.g. when the sampled rows happen to exclude some rows that are even wider, [#2727](https://github.com/Rdatatable/data.table/issues/2727) [#2691](https://github.com/Rdatatable/data.table/issues/2691) [#4130](https://github.com/Rdatatable/data.table/issues/4130) [#3436](https://github.com/Rdatatable/data.table/issues/3436). Providing an `integer` as argument for `fill` allows for a manual estimate of the number of columns instead, [#1812](https://github.com/Rdatatable/data.table/issues/1812) [#5378](https://github.com/Rdatatable/data.table/issues/5378). Thanks to @jangorecki, @christellacaze, @Yiguan, @alexdthomas, @ibombonato, @Befrancesco, @TobiasGold for reporting/requesting, and Benjamin Schwendinger for the PR. +7. `fread`'s `fill` argument now also accepts an `integer` in addition to boolean values. `fread` always guesses the number of columns based on reading a sample of rows in the file. When `fill=TRUE`, `fread` stops reading and ignores subsequent rows when this estimate winds up too low, e.g. when the sampled rows happen to exclude some rows that are even wider, [#2727](https://github.com/Rdatatable/data.table/issues/2727) [#2691](https://github.com/Rdatatable/data.table/issues/2691) [#4130](https://github.com/Rdatatable/data.table/issues/4130) [#3436](https://github.com/Rdatatable/data.table/issues/3436). Providing an `integer` as argument for `fill` allows for a manual estimate of the number of columns instead, [#1812](https://github.com/Rdatatable/data.table/issues/1812) [#5378](https://github.com/Rdatatable/data.table/issues/5378). Thanks to @jangorecki, @christellacaze, @Yiguan, @alexdthomas, @ibombonato, @Befrancesco, @TobiasGold for reporting/requesting, and Benjamin Schwendinger for the PR. ## BUG FIXES