diff --git a/NEWS.md b/NEWS.md index d751b16526..7110f10e08 100644 --- a/NEWS.md +++ b/NEWS.md @@ -22,7 +22,9 @@ 5. `transpose` gains `list.cols=` argument, [#5639](https://github.com/Rdatatable/data.table/issues/5639). Use this to return output with list columns and avoids type promotion (an exception is `factor` columns which are promoted to `character` for consistency between `list.cols=TRUE` and `list.cols=FALSE`). This is convenient for creating a row-major representation of a table. Thanks to @MLopez-Ibanez for the request, and Benjamin Schwendinger for the PR. -4. Using `dt[, names(.SD) := lapply(.SD, fx)]` now works, [#795](https://github.com/Rdatatable/data.table/issues/795) -- one of our [most-requested issues (see #3189)](https://github.com/Rdatatable/data.table/issues/3189). Thanks to @brodieG for the report, 20 or so others for chiming in, and @ColeMiller1 for PR. +6. Using `dt[, names(.SD) := lapply(.SD, fx)]` now works, [#795](https://github.com/Rdatatable/data.table/issues/795) -- one of our [most-requested issues (see #3189)](https://github.com/Rdatatable/data.table/issues/3189). Thanks to @brodieG for the report, 20 or so others for chiming in, and @ColeMiller1 for PR. + +7. `fread`'s `fill` argument now also accepts an `integer` in addition to boolean values. `fread` always guesses the number of columns based on reading a sample of rows in the file. When `fill=TRUE`, `fread` stops reading and ignores subsequent rows when this estimate winds up too low, e.g. when the sampled rows happen to exclude some rows that are even wider, [#2727](https://github.com/Rdatatable/data.table/issues/2727) [#2691](https://github.com/Rdatatable/data.table/issues/2691) [#4130](https://github.com/Rdatatable/data.table/issues/4130) [#3436](https://github.com/Rdatatable/data.table/issues/3436). Providing an `integer` as argument for `fill` allows for a manual estimate of the number of columns instead, [#1812](https://github.com/Rdatatable/data.table/issues/1812) [#5378](https://github.com/Rdatatable/data.table/issues/5378). Thanks to @jangorecki, @christellacaze, @Yiguan, @alexdthomas, @ibombonato, @Befrancesco, @TobiasGold for reporting/requesting, and Benjamin Schwendinger for the PR. ## BUG FIXES diff --git a/R/fread.R b/R/fread.R index 8e9a11b123..b4086d155f 100644 --- a/R/fread.R +++ b/R/fread.R @@ -22,11 +22,12 @@ yaml=FALSE, autostart=NA, tmpdir=tempdir(), tz="UTC") stopf("Argument 'encoding' must be 'unknown', 'UTF-8' or 'Latin-1'.") } stopifnot( - isTRUEorFALSE(strip.white), isTRUEorFALSE(blank.lines.skip), isTRUEorFALSE(fill), isTRUEorFALSE(showProgress), + isTRUEorFALSE(strip.white), isTRUEorFALSE(blank.lines.skip), isTRUEorFALSE(fill) || is.numeric(fill) && length(fill)==1L && fill >= 0L, isTRUEorFALSE(showProgress), isTRUEorFALSE(verbose), isTRUEorFALSE(check.names), isTRUEorFALSE(logical01), isTRUEorFALSE(keepLeadingZeros), isTRUEorFALSE(yaml), isTRUEorFALSE(stringsAsFactors) || (is.double(stringsAsFactors) && length(stringsAsFactors)==1L && 0.0<=stringsAsFactors && stringsAsFactors<=1.0), is.numeric(nrows), length(nrows)==1L ) + fill=as.integer(fill) nrows=as.double(nrows) #4686 if (is.na(nrows) || nrows<0) nrows=Inf # accept -1 to mean Inf, as read.table does if (identical(header,"auto")) header=NA diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 10f365332b..2e75c8c96a 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -18389,3 +18389,33 @@ test(2250.12, dt[, names(.SD) := lapply(.SD, \(x) x + b), .SDcols = "a"], data.t dt = data.table(a = 1L, b = 2L, c = 3L, d = 4L, e = 5L, f = 6L) test(2250.13, dt[, names(.SD)[1:5] := sum(.SD)], data.table(a = 21L, b = 21L, c = 21L, d = 21L, e = 21L, f = 6L)) + +# fread(...,fill) can also be used to specify a guess on the maximum number of columns #2691 #1812 #4130 #3436 #2727 +dt_str = paste(rep(c("1,2\n", "1,2,3\n"), each=100), collapse="") +ans = data.table(1L, 2L, rep(c(NA, 3L), each=100L)) +test(2251.01, fread(text = dt_str, fill=FALSE), ans[1:100, -3L], warning=".*Consider fill=TRUE.*") +test(2251.02, fread(text = dt_str, fill=TRUE), ans[1:100, -3L], warning=".*Consider fill=3.*") +test(2251.03, fread(text = dt_str, fill=2L), ans[1:100, -3L], warning=".*Consider fill=3.*") +test(2251.04, fread(text = dt_str, fill=3L), ans) +test(2251.05, fread(text = dt_str, fill=5L, verbose=TRUE), ans, output="Provided number of fill columns: 5 but only found 3\n Dropping 2 overallocated columns") # user guess slightly too big +test(2251.06, fread(text = dt_str, fill=1000L), ans) # user guess much too big +lines = c( + "12223, University", + "12227, bridge, Sky", + "12828, Sunset", + "13801, Ground", + "14853, Tranceamerica", + "14854, San Francisco", + "15595, shibuya, Shrine", + "16126, fog, San Francisco", + "16520, California, ocean, summer, golden gate, beach, San Francisco", + "") +text = paste(lines, collapse="\n") +test(2251.07, dim(fread(text)), c(6L, 3L), warning=c("fill=TRUE", "Discarded")) +test(2251.08, dim(fread(text, fill=TRUE)), c(9L, 9L)) +text = paste(lines[c(1:5, 9L, 6:8, 10L)], collapse="\n") +test(2251.09, dim(fread(text)), c(3L, 3L), warning=c("fill=TRUE", "fill=7")) +test(2251.10, dim(fread(text, fill=TRUE)), c(9L, 9L)) +test(2251.11, dim(fread(text, fill=7)), c(9L, 9L)) +test(2251.12, dim(fread(text, fill=9)), c(9L, 9L)) +test(2251.13, dim(fread(text, fill=20)), c(9L, 20L)) # clean up currently only kicks in if sep!=' ' diff --git a/man/fread.Rd b/man/fread.Rd index 09ed80bd58..b431969dc6 100644 --- a/man/fread.Rd +++ b/man/fread.Rd @@ -53,7 +53,7 @@ yaml=FALSE, autostart=NA, tmpdir=tempdir(), tz="UTC" \item{encoding}{ default is \code{"unknown"}. Other possible options are \code{"UTF-8"} and \code{"Latin-1"}. Note: it is not used to re-encode the input, rather enables handling of encoded strings in their native encoding. } \item{quote}{ By default (\code{"\""}), if a field starts with a double quote, \code{fread} handles embedded quotes robustly as explained under \code{Details}. If it fails, then another attempt is made to read the field \emph{as is}, i.e., as if quotes are disabled. By setting \code{quote=""}, the field is always read as if quotes are disabled. It is not expected to ever need to pass anything other than \"\" to quote; i.e., to turn it off. } \item{strip.white}{ default is \code{TRUE}. Strips leading and trailing whitespaces of unquoted fields. If \code{FALSE}, only header trailing spaces are removed. } - \item{fill}{logical (default is \code{FALSE}). If \code{TRUE} then in case the rows have unequal length, blank fields are implicitly filled.} + \item{fill}{logical or integer (default is \code{FALSE}). If \code{TRUE} then in case the rows have unequal length, number of columns is estimated and blank fields are implicitly filled. If an integer is provided it is used as an upper bound for the number of columns. } \item{blank.lines.skip}{\code{logical}, default is \code{FALSE}. If \code{TRUE} blank lines in the input are ignored.} \item{key}{Character vector of one or more column names which is passed to \code{\link{setkey}}. It may be a single comma separated string such as \code{key="x,y,z"}, or a vector of names such as \code{key=c("x","y","z")}. Only valid when argument \code{data.table=TRUE}. Where applicable, this should refer to column names given in \code{col.names}. } \item{index}{ Character vector or list of character vectors of one or more column names which is passed to \code{\link{setindexv}}. As with \code{key}, comma-separated notation like \code{index="x,y,z"} is accepted for convenience. Only valid when argument \code{data.table=TRUE}. Where applicable, this should refer to column names given in \code{col.names}. } diff --git a/src/fread.c b/src/fread.c index f27c17ff18..a1521fb371 100644 --- a/src/fread.c +++ b/src/fread.c @@ -55,7 +55,9 @@ static const char* const* NAstrings; static bool any_number_like_NAstrings=false; static bool blank_is_a_NAstring=false; static bool stripWhite=true; // only applies to character columns; numeric fields always stripped -static bool skipEmptyLines=false, fill=false; +static bool skipEmptyLines=false; +static int fill=0; +static int *dropFill = NULL; static double NA_FLOAT64; // takes fread.h:NA_FLOAT64_VALUE @@ -141,6 +143,7 @@ bool freadCleanup(void) free(tmpType); tmpType = NULL; free(size); size = NULL; free(colNames); colNames = NULL; + free(dropFill); dropFill = NULL; if (mmp != NULL) { // Important to unmap as OS keeps internal reference open on file. Process is not exiting as // we're a .so/.dll here. If this was a process exiting we wouldn't need to unmap. @@ -171,7 +174,7 @@ bool freadCleanup(void) stripWhite = true; skipEmptyLines = false; eol_one_r = false; - fill = false; + fill = 0; // following are borrowed references: do not free sof = eof = NULL; NAstrings = NULL; @@ -1618,7 +1621,7 @@ int freadMain(freadMainArgs _args) { if (eol(&ch)) ch++; } firstJumpEnd = ch; // size of first 100 lines in bytes is used later for nrow estimate - fill = true; // so that blank lines are read as empty + fill = 1; // so that blank lines are read as empty ch = pos; } else { int nseps; @@ -1750,7 +1753,7 @@ int freadMain(freadMainArgs _args) { } sep = topSep; whiteChar = (sep==' ' ? '\t' : (sep=='\t' ? ' ' : 0)); - ncol = topNumFields; + ncol = fill > topNumFields ? fill : topNumFields; // overwrite user guess if estimated number is higher if (fill || sep==127) { // leave pos on the first populated line; that is start of data ch = pos; @@ -2125,6 +2128,7 @@ int freadMain(freadMainArgs _args) { int nTypeBump=0, nTypeBumpCols=0; double tRead=0, tReread=0; double thRead=0, thPush=0; // reductions of timings within the parallel region + int max_col=0; char *typeBumpMsg=NULL; size_t typeBumpMsgSize=0; int typeCounts[NUMTYPE]; // used for verbose output; needs populating after first read and before reread (if any) -- see later comment #define internalErrSize 1000 @@ -2218,7 +2222,7 @@ int freadMain(freadMainArgs _args) { } prepareThreadContext(&ctx); - #pragma omp for ordered schedule(dynamic) reduction(+:thRead,thPush) + #pragma omp for ordered schedule(dynamic) reduction(+:thRead,thPush) reduction(max:max_col) for (int jump = jump0; jump < nJumps; jump++) { if (stopTeam) continue; // must continue and not break. We desire not to depend on (relatively new) omp cancel directive, yet double tLast = 0.0; // thread local wallclock time at last measuring point for verbose mode only. @@ -2299,6 +2303,7 @@ int freadMain(freadMainArgs _args) { tch++; j++; } + if (j > max_col) max_col = j; //*** END HOT. START TEPID ***// if (tch==tLineStart) { skip_white(&tch); // skips \0 before eof @@ -2310,6 +2315,7 @@ int freadMain(freadMainArgs _args) { int8_t thisSize = size[j]; if (thisSize) ((char **) targets)[thisSize] += thisSize; j++; + if (j > max_col) max_col = j; if (j==ncol) { tch++; myNrow++; continue; } // next line. Back up to while (tch1 && max_col0) { + int ndropFill = ncol - max_col; + if (verbose) { + DTPRINT(_(" Provided number of fill columns: %d but only found %d\n"), ncol, max_col); + DTPRINT(_(" Dropping %d overallocated columns\n"), ndropFill); + } + dropFill = (int *)malloc((size_t)ndropFill * sizeof(int)); + int i=0; + for (int j=max_col; j>"), + if (fill>0) { + DTWARN(_("Stopped early on line %"PRIu64". Expected %d fields but found %d. Consider fill=%d or even more based on your knowledge of the input file. First discarded non-empty line: <<%s>>"), + (uint64_t)DTi+row1line, ncol, tt, tt, strlim(skippedFooter,500)); + } else { + DTWARN(_("Stopped early on line %"PRIu64". Expected %d fields but found %d. Consider fill=TRUE and comment.char=. First discarded non-empty line: <<%s>>"), (uint64_t)DTi+row1line, ncol, tt, strlim(skippedFooter,500)); + } } } } diff --git a/src/fread.h b/src/fread.h index 7035615a55..1e27836433 100644 --- a/src/fread.h +++ b/src/fread.h @@ -124,8 +124,10 @@ typedef struct freadMainArgs bool skipEmptyLines; // If True, then rows are allowed to have variable number of columns, and - // all ragged rows will be filled with NAs on the right. - bool fill; + // all ragged rows will be filled with NAs on the right. Supplying integer + // argument > 1 results in setting an upper bound estimate for the number + // of columns. + int fill; // If True, then emit progress messages during the parsing. bool showProgress; @@ -348,6 +350,11 @@ void pushBuffer(ThreadLocalFreadParsingContext *ctx); void setFinalNrow(size_t nrows); +/** + * Called at the end to delete columns added due to too high user guess for fill. + */ +void dropFilledCols(int* dropArg, int ndrop); + /** * Free any srtuctures associated with the thread-local parsing context. */ diff --git a/src/freadR.c b/src/freadR.c index 6b12210f5f..97fbfadaca 100644 --- a/src/freadR.c +++ b/src/freadR.c @@ -45,7 +45,7 @@ static int64_t dtnrows = 0; static bool verbose = false; static bool warningsAreErrors = false; static bool oldNoDateTime = false; - +static int *dropFill; SEXP freadR( // params passed to freadMain @@ -82,7 +82,7 @@ SEXP freadR( freadMainArgs args; ncol = 0; dtnrows = 0; - + if (!isString(inputArg) || LENGTH(inputArg)!=1) error(_("Internal error: freadR input not a single character string: a filename or the data itself. Should have been caught at R level.")); // # nocov const char *ch = (const char *)CHAR(STRING_ELT(inputArg,0)); @@ -152,7 +152,7 @@ SEXP freadR( // here we use bool and rely on fread at R level to check these do not contain NA_LOGICAL args.stripWhite = LOGICAL(stripWhiteArg)[0]; args.skipEmptyLines = LOGICAL(skipEmptyLinesArg)[0]; - args.fill = LOGICAL(fillArg)[0]; + args.fill = INTEGER(fillArg)[0]; args.showProgress = LOGICAL(showProgressArg)[0]; if (INTEGER(nThreadArg)[0]<1) error(_("nThread(%d)<1"), INTEGER(nThreadArg)[0]); args.nth = (uint32_t)INTEGER(nThreadArg)[0]; @@ -533,6 +533,16 @@ void setFinalNrow(size_t nrow) { R_FlushConsole(); // # 2481. Just a convenient place; nothing per se to do with setFinalNrow() } +void dropFilledCols(int* dropArg, int ndelete) { + dropFill = dropArg; + int ndt=length(DT); + for (int i=0; i