From 505a034125a59d6002e40b417d80740e2b9a9065 Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Mon, 5 Feb 2018 11:53:21 -0800 Subject: [PATCH 01/14] Interim --- R/fread.R | 9 ++++++--- inst/tests/tests.Rraw | 13 ++++++++----- src/fread.c | 38 +++++++++++++++++++++++++++++++------- src/freadR.c | 8 +++----- 4 files changed, 48 insertions(+), 20 deletions(-) diff --git a/R/fread.R b/R/fread.R index f6064bdcf7..6bf53829bf 100644 --- a/R/fread.R +++ b/R/fread.R @@ -1,5 +1,5 @@ -fread <- function(input="",file,sep="auto",sep2="auto",dec=".",quote="\"",nrows=Inf,header="auto",na.strings="NA",stringsAsFactors=FALSE,verbose=getOption("datatable.verbose"),autostart=NA,skip=0,select=NULL,drop=NULL,colClasses=NULL,integer64=getOption("datatable.integer64"), col.names, check.names=FALSE, encoding="unknown", strip.white=TRUE, fill=FALSE, blank.lines.skip=FALSE, key=NULL, showProgress=interactive(),data.table=getOption("datatable.fread.datatable"),nThread=getDTthreads(),logical01=TRUE) +fread <- function(input="",file,sep="auto",sep2="auto",dec=".",quote="\"",nrows=Inf,header="auto",na.strings="NA",stringsAsFactors=FALSE,verbose=getOption("datatable.verbose"),skip="auto",select=NULL,drop=NULL,colClasses=NULL,integer64=getOption("datatable.integer64"), col.names, check.names=FALSE, encoding="unknown", strip.white=TRUE, fill=FALSE, blank.lines.skip=FALSE, key=NULL, showProgress=interactive(),data.table=getOption("datatable.fread.datatable"),nThread=getDTthreads(),logical01=TRUE,autostart=NA) { if (is.null(sep)) sep="\n" # C level knows that \n means \r\n on Windows, for example else { @@ -21,7 +21,6 @@ fread <- function(input="",file,sep="auto",sep2="auto",dec=".",quote="\"",nrows= if (is.na(nrows) || nrows<0) nrows=Inf # accept -1 to mean Inf, as read.table does if (identical(header,"auto")) header=NA stopifnot(isTrueFalseNA(header)) - stopifnot(length(skip)==1L) stopifnot(is.numeric(nThread) && length(nThread)==1L) nThread=as.integer(nThread) stopifnot(nThread>=1) @@ -87,7 +86,11 
@@ fread <- function(input="",file,sep="auto",sep2="auto",dec=".",quote="\"",nrows= colClasses = tapply(names(colClasses), colClasses, c, simplify=FALSE) } } - if (is.numeric(skip)) skip = as.integer(skip) + stopifnot(length(skip)==1L, !is.na(skip), is.character(skip) || is.numeric(skip)) + if (skip=="auto") skip=-1L + # so, skip="string" so long as "string" is not "auto". The skip="auto" default best conveys something + # is automatic there (better than skip=-1 or skip=NA). skip="string" is rarely used, so ok to treat "auto" specially. + if (is.double(skip)) skip = as.integer(skip) warnings2errors = getOption("warn") >= 2 ans = .Call(CfreadR,input,sep,dec,quote,header,nrows,skip,na.strings,strip.white,blank.lines.skip, fill,showProgress,nThread,verbose,warnings2errors,logical01,select,drop,colClasses,integer64,encoding) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 2d76a826c3..bc57f04975 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -11510,11 +11510,14 @@ test(1870.1, fread("A,100,200\n,300,400\n,500,600"), data.table(V1=c("A","",""), test(1870.2, fread("A,100,\n,,\n,500,600"), data.table(V1=c("A","",""), V2=c(100L,NA,500L), V3=c(NA,NA,600L))) test(1870.3, fread("A,B,\n,,\n,500,3.4"), data.table(A=NA, B=c(NA,500L), V3=c(NA,3.4))) -# nrows= now ignores errors after those rows as expected, #1267 -# txt = "V1, V2, V3\n1,2,3\nV4, V5, V6, V7\n4,5,6,7\n8,9,10,11\n" -# fread(txt) -# fread(txt, nrows = 1, header = TRUE, skip = 0) -# fread("1,2,3\n1,2", nrows=1) +# nrows= now ignores errors after those nrows as expected and skip= determines first row for sure, #1267 +txt = "V1, V2, V3\n2,3,4\nV4, V5, V6, V7\n4,5,6,7\n8,9,10,11\n" +test(1871.1, fread(txt), data.table(V4=INT(4,8), V5=INT(5,9), V6=INT(6,10), V7=INT(7,11))) +test(1871.2, fread(txt, nrows=1), data.table(V4=4L, V5=5L, V6=6L, V7=7L)) +test(1871.3, fread(txt, skip=0), ans<-data.table(V1=2L, V2=3L, V3=4L), warning="discarded line V4, V5") +test(1871.4, fread(txt, skip=0, 
nrows=1), ans) +test(1871.5, fread(txt, skip=0, nrows=1, header=TRUE), ans) +test(1871.6, fread(txt, skip=0, nrows=1, header=FALSE), data.table(V1=c("V1","2"), V2=c("V2","3"), V3=c("V3","4"))) # for ( i in 100:1) { # lines <- paste0(paste(rep("1,2,3", i), collapse='\n'), "\n1,2") # fread(lines, nrows=i) diff --git a/src/fread.c b/src/fread.c index fa59f3b3c5..a69c80d22b 100644 --- a/src/fread.c +++ b/src/fread.c @@ -1073,7 +1073,7 @@ int freadMain(freadMainArgs _args) { else DTPRINT(" None of the NAstrings look like numbers.\n"); } - if (args.skipNrow) DTPRINT(" skip num lines = %llu\n", (llu)args.skipNrow); + if (args.skipNrow >= 0) DTPRINT(" skip num lines = %llu\n", (llu)args.skipNrow); if (args.skipString) DTPRINT(" skip to string = <<%s>>\n", args.skipString); DTPRINT(" show progress = %d\n", args.showProgress); DTPRINT(" 0/1 column will be read as %s\n", args.logical01? "boolean" : "integer"); @@ -1269,6 +1269,7 @@ int freadMain(freadMainArgs _args) { //********************************************************************************************* const char *pos; // Location where the actual data in the file begins int row1line = 1; // The line number where the data starts. Normally row 1 is column names and row1line ends up == 2. + bool skipAuto = true; { // First, set 'LFpresent' for use by eol() to know if \r-only line ending is allowed, #2371 @@ -1298,12 +1299,14 @@ int freadMain(freadMainArgs _args) { if (verbose) DTPRINT("Found skip='%s' on line %llu. Taking this to be header row or first row of data.\n", args.skipString, (llu)row1line); ch = pos; + skipAuto = false; } - // Skip the first `skipNrow` lines of input. 
- else if (args.skipNrow>0) { + // Skip the first `skipNrow` lines of input, including 0 to force the first line to be the start + else if (args.skipNrow >= 0) { while (ch=eof) STOP("skip=%llu but the input only has %llu line%s", (llu)args.skipNrow, (llu)row1line, row1line>1?"s":""); pos = ch; + skipAuto = false; } // skip blank input at the start @@ -1383,7 +1386,12 @@ int freadMain(freadMainArgs _args) { while (ch0.0) args.skipNrow = (uint64_t)REAL(skipArg)[0]; } else if (isInteger(skipArg)) { - if (INTEGER(skipArg)[0]>0) args.skipNrow = (uint64_t)INTEGER(skipArg)[0]; - } else error("skip must be a single positive numeric (integer or double), or a string to search for"); + args.skipNrow = (int64_t)INTEGER(skipArg)[0]; + } else error("Internal error: skip not integer or string in freadR.c"); if (!isNull(NAstringsArg) && !isString(NAstringsArg)) error("'na.strings' is type '%s'. Must be either NULL or a character vector.", type2char(TYPEOF(NAstringsArg))); From 71b52e0789ca0431994434cea4ca4259c62b15f3 Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Fri, 9 Feb 2018 11:39:19 -0800 Subject: [PATCH 02/14] Interim --- src/fread.c | 70 +++++++++++++++++++++------------------------------- src/freadR.c | 2 +- 2 files changed, 29 insertions(+), 43 deletions(-) diff --git a/src/fread.c b/src/fread.c index a69c80d22b..dc249e8bd1 100644 --- a/src/fread.c +++ b/src/fread.c @@ -65,8 +65,6 @@ static bool skipEmptyLines=false, fill=false; static double NA_FLOAT64; // takes fread.h:NA_FLOAT64_VALUE -#define JUMPLINES 100 // at each of the 100 jumps how many lines to guess column types (10,000 sample lines) - // Private globals so they can be cleaned up both on error and on successful return static void *mmp = NULL; static size_t fileSize; @@ -166,6 +164,7 @@ bool freadCleanup(void) #define CEIL(x) ((size_t)(double)ceil(x)) static inline size_t umax(size_t a, size_t b) { return a > b ? a : b; } +static inline size_t umin(size_t a, size_t b) { return a < b ? 
a : b; } static inline int imin(int a, int b) { return a < b ? a : b; } /** Return value of `x` clamped to the range [upper, lower] */ @@ -1333,6 +1332,7 @@ int freadMain(freadMainArgs _args) { int ncol; // Detected number of columns in the file const char *firstJumpEnd=NULL; // remember where the winning jumpline from jump 0 ends, to know its size excluding header const char *prevStart = NULL; // the start of the non-empty line before the first not-ignored row (for warning message later, or taking as column names) + int jumpLines = (int)umin(100,nrowLimit); // how many lines from each jump point to use. If nrowLimit is supplied, nJumps is later set to 1 as well. { if (verbose) DTPRINT("[06] Detect separator, quoting rule, and ncolumns\n"); @@ -1364,26 +1364,28 @@ int freadMain(freadMainArgs _args) { // (when fill=true, the max is usually the header row and is the longest but there are more // lines of fewer) - // We will scan the input line-by-line (at most `JUMPLINES + 1` lines; "+1" + // We will scan the input line-by-line (at most 100+1 lines; "+1" // covers the header row, at this stage we don't know if it's present), and // detect the number of fields on each line. If several consecutive lines // have the same number of fields, we'll call them a "contiguous group of // lines". Arrays `numFields` and `numLines` contain information about each - // contiguous group of lines encountered while scanning the first JUMPLINES - // + 1 lines: 'numFields` gives the count of fields in each group, and - // `numLines` has the number of lines in each group. - int numFields[JUMPLINES+1]; - int numLines[JUMPLINES+1]; + // contiguous group of lines encountered while scanning the first 100+1 + // lines: 'numFields` gives the count of fields in each group, and + // `numLines` has the number of lines in each group. There is always a lot + // of unused space at the end of these vectors. 
They are only jumpLines+1 big + // for the worst case that no adjacent lines have the same number of fields. + int numFields[jumpLines+1]; + int numLines[jumpLines+1]; for (int s=0; s0) { if (jump0size*100*2 < sz) nJumps=100; // 100 jumps * 100 lines = 10,000 line sample else if (jump0size*10*2 < sz) nJumps=10; // *2 to get a good spacing. We don't want overlaps resulting in double counting. - // nJumps==1 means the whole (small) file will be sampled with one thread } nJumps++; // the extra sample at the very end (up to eof) is sampled and format checked but not jumped to when reading + if (nrowLimit in the last field of last row where finalByte=='A' and N caused bump to character (test 894.0221) - if (verbose) DTPRINT(" Reverted bump of final column from %d to %d on final field due to finalByte='%c'." - " If the bump was actually correct, there will be a reread. Finish the file properly with newline to avoid the reread.\n", - previousLastColType, tmpType[ncol-1], finalByte); - tmpType[ncol-1] = previousLastColType; - } + if (ch==eof && finalByte && tmpType[ncol-1]!=previousLastColType) { + // revert bump due to e.g. ,NA in the last field of last row where finalByte=='A' and N caused bump to character (test 894.0221) + if (verbose) DTPRINT(" Reverted bump of final column from %d to %d on final field due to finalByte='%c'." + " If the bump was actually correct, there will be a reread. Finish the file properly with newline to avoid the reread.\n", + previousLastColType, tmpType[ncol-1], finalByte); + tmpType[ncol-1] = previousLastColType; } if (!eol(&ch) && *ch!='\0') { if (jump==0) { @@ -1715,7 +1701,7 @@ int freadMain(freadMainArgs _args) { meanLineLen=0.0; // Average length (in bytes) of a single line in the input file bytesRead=0; // Bytes in the data section (i.e. 
excluding column names, header and footer, if any) - if (nJumps==1) { + if (sampleLines < jumpLines) { if (verbose) DTPRINT(" All rows were sampled since file is small so we know nrow=%llu exactly\n", (llu)sampleLines); estnrow = allocnrow = sampleLines; } else { @@ -1882,7 +1868,7 @@ int freadMain(freadMainArgs _args) { else if (nJumps>nth) nJumps = nth*(1+(nJumps-1)/nth); chunkBytes = bytesRead / (size_t)nJumps; } else { - nJumps = 1; + ASSERT(nJumps==1, "nJumps (%d) != 1", nJumps); } size_t initialBuffRows = allocnrow / (size_t)nJumps; diff --git a/src/freadR.c b/src/freadR.c index 17da5a92d9..ce969e5873 100644 --- a/src/freadR.c +++ b/src/freadR.c @@ -121,7 +121,7 @@ SEXP freadR( if (isReal(nrowLimitArg)) { if (R_FINITE(REAL(nrowLimitArg)[0]) && REAL(nrowLimitArg)[0]>=0.0) args.nrowLimit = (int64_t)(REAL(nrowLimitArg)[0]); } else { - if (INTEGER(nrowLimitArg)[0]>=0) args.nrowLimit = (int64_t)INTEGER(nrowLimitArg)[0]; + if (INTEGER(nrowLimitArg)[0]>=1) args.nrowLimit = (int64_t)INTEGER(nrowLimitArg)[0]; } args.logical01 = LOGICAL(logical01Arg)[0]; From 6e3841ed51291cfb3105a657ad9dcd3763aded57 Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Fri, 9 Feb 2018 13:23:44 -0800 Subject: [PATCH 03/14] Tidied test numbers --- inst/tests/tests.Rraw | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index a7d388fe54..3ff735564d 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -11515,9 +11515,9 @@ test(1869.6, fread(testDir("colnames4096.csv")), error="very unusual.*one single test(1869.7, fread(testDir("onecol4096.csv")), error="very unusual.*single column.*multiple of 4096.*ends with 2 or more end-of-line") # better colname detection by comparing potential column names to the whole sample not just the first row of the sample, #2526 -test(1871.1, fread("A,100,200\n,300,400\n,500,600"), data.table(V1=c("A","",""), V2=c(100L,300L,500L), V3=c(200L,400L,600L))) -test(1871.2, 
fread("A,100,\n,,\n,500,600"), data.table(V1=c("A","",""), V2=c(100L,NA,500L), V3=c(NA,NA,600L))) -test(1871.3, fread("A,B,\n,,\n,500,3.4"), data.table(A=NA, B=c(NA,500L), V3=c(NA,3.4))) +test(1870.1, fread("A,100,200\n,300,400\n,500,600"), data.table(V1=c("A","",""), V2=c(100L,300L,500L), V3=c(200L,400L,600L))) +test(1870.2, fread("A,100,\n,,\n,500,600"), data.table(V1=c("A","",""), V2=c(100L,NA,500L), V3=c(NA,NA,600L))) +test(1870.3, fread("A,B,\n,,\n,500,3.4"), data.table(A=NA, B=c(NA,500L), V3=c(NA,3.4))) # nrows= now ignores errors after those nrows as expected and skip= determines first row for sure, #1267 txt = "V1, V2, V3\n2,3,4\nV4, V5, V6, V7\n4,5,6,7\n8,9,10,11\n" @@ -11590,11 +11590,13 @@ DT = data.table(x=rep(c("b","a","c"),each=3), y=c(1,3,6), v=1:9) test(1872.14, DT[X, on=.(x, v>=v), verbose = TRUE], output = 'Non-equi join operators.*forder took.*group lengths.*done.*non-equi group ids.*done') - +# out-of-sample bump from int to quoted field containing comma, #2614 DT = data.table(A=rep(10L, 2200), B="20") DT[111, B:="3,456"] -fwrite(DT, f<-tempfile()) +fwrite(DT,f<-tempfile()) test(1873, fread(f), DT) +unlink(f) + ########################## From 0fdec61652baf13c91cc0e2fc41f63824f75ba79 Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Fri, 9 Feb 2018 14:15:00 -0800 Subject: [PATCH 04/14] Interim --- R/fread.R | 6 ++---- src/fread.c | 9 +++++---- 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/R/fread.R b/R/fread.R index 0f046b8cb5..f6c885aa49 100644 --- a/R/fread.R +++ b/R/fread.R @@ -1,5 +1,5 @@ -fread <- function(input="",file,sep="auto",sep2="auto",dec=".",quote="\"",nrows=Inf,header="auto",na.strings="NA",stringsAsFactors=FALSE,verbose=getOption("datatable.verbose"),skip="auto",select=NULL,drop=NULL,colClasses=NULL,integer64=getOption("datatable.integer64"), col.names, check.names=FALSE, encoding="unknown", strip.white=TRUE, fill=FALSE, blank.lines.skip=FALSE, key=NULL, 
showProgress=interactive(),data.table=getOption("datatable.fread.datatable"),nThread=getDTthreads(),logical01=TRUE,autostart=NA) +fread <- function(input="",file,sep="auto",sep2="auto",dec=".",quote="\"",nrows=Inf,header="auto",na.strings="NA",stringsAsFactors=FALSE,verbose=getOption("datatable.verbose"),skip="__auto__",select=NULL,drop=NULL,colClasses=NULL,integer64=getOption("datatable.integer64"), col.names, check.names=FALSE, encoding="unknown", strip.white=TRUE, fill=FALSE, blank.lines.skip=FALSE, key=NULL, showProgress=interactive(),data.table=getOption("datatable.fread.datatable"),nThread=getDTthreads(),logical01=TRUE,autostart=NA) { if (is.null(sep)) sep="\n" # C level knows that \n means \r\n on Windows, for example else { @@ -87,9 +87,7 @@ fread <- function(input="",file,sep="auto",sep2="auto",dec=".",quote="\"",nrows= } } stopifnot(length(skip)==1L, !is.na(skip), is.character(skip) || is.numeric(skip)) - if (skip=="auto") skip=-1L - # so, skip="string" so long as "string" is not "auto". The skip="auto" default best conveys something - # is automatic there (better than skip=-1 or skip=NA). skip="string" is rarely used, so ok to treat "auto" specially. + if (skip=="__auto__") skip=-1L # skip="string" so long as "string" is not "__auto__". Best conveys to user something is automatic there (than -1 or NA). if (is.double(skip)) skip = as.integer(skip) warnings2errors = getOption("warn") >= 2 ans = .Call(CfreadR,input,sep,dec,quote,header,nrows,skip,na.strings,strip.white,blank.lines.skip, diff --git a/src/fread.c b/src/fread.c index 841f53163b..ccb9780bf4 100644 --- a/src/fread.c +++ b/src/fread.c @@ -1518,7 +1518,7 @@ int freadMain(freadMainArgs _args) { // *2 to get a good spacing. We don't want overlaps resulting in double counting. 
} nJumps++; // the extra sample at the very end (up to eof) is sampled and format checked but not jumped to when reading - if (nrowLimit1) { + if (nJumps/*from sampling*/>2) { // ensure data size is split into same sized chunks (no remainder in last chunk) and a multiple of nth // when nth==1 we still split by chunk for consistency (testing) and code sanity nJumps = (int)(bytesRead/chunkBytes); @@ -1868,7 +1868,8 @@ int freadMain(freadMainArgs _args) { else if (nJumps>nth) nJumps = nth*(1+(nJumps-1)/nth); chunkBytes = bytesRead / (size_t)nJumps; } else { - ASSERT(nJumps==1, "nJumps (%d) != 1", nJumps); + ASSERT(nJumps==1 /*when nrowLimit supplied*/ || nJumps==2 /*small files*/, "nJumps (%d) != 1|2", nJumps); + nJumps=1; } size_t initialBuffRows = allocnrow / (size_t)nJumps; From f4bf93e229f56fa156527f37a7c6f767c31211b5 Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Fri, 9 Feb 2018 20:30:55 -0800 Subject: [PATCH 05/14] Interim --- inst/tests/tests.Rraw | 39 ++++++++++--------- src/fread.c | 89 +++++++++++++++++++++++++++++-------------- 2 files changed, 81 insertions(+), 47 deletions(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 3ff735564d..c97bd80f89 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -2746,9 +2746,9 @@ test(966, fread(input, colClasses=list(character=2:4)), error="Column number 4 ( test(967, nrow(fread( paste( rep('a\tb\n', 10000), collapse=''), header=FALSE)), 10000L) # Test fread warns about removal of any footer (and autostart skips up over it) -test(968, fread("A,B\n1,3\n2,4\n\nRowcount: 2\n"), data.table(A=1:2,B=3:4), warning="Found the last.*discarded.*Rowcount: 2") -test(969, fread("A,B\n1,3\n2,4\n\n\nRowcount: 2"), data.table(A=1:2,B=3:4), warning="Found the last.*discarded.*Rowcount: 2") -test(970, fread("A,B\n1,3\n2,4\n\n\nRowcount: 2\n\n"), data.table(A=1:2,B=3:4), warning="Found the last.*discarded.*Rowcount: 2") +test(968, fread("A,B\n1,3\n2,4\n\nRowcount: 2\n"), data.table(A=1:2,B=3:4), 
warning="Discarded footer.*Rowcount: 2") +test(969, fread("A,B\n1,3\n2,4\n\n\nRowcount: 2"), data.table(A=1:2,B=3:4), warning="Discarded footer.*Rowcount: 2") +test(970, fread("A,B\n1,3\n2,4\n\n\nRowcount: 2\n\n"), data.table(A=1:2,B=3:4), warning="Discarded footer.*Rowcount: 2") # fread skip override input = "some,bad,data\nA,B,C\n1,3,5\n2,4,6\n" @@ -2758,11 +2758,11 @@ test(973, fread(input, skip=2), data.table(V1=1:2,V2=3:4,V3=5:6)) test(974, fread(input, skip=2, header=TRUE), data.table("1"=2L,"3"=4L,"5"=6L)) test(975, fread(input, skip="B"), data.table(A=1:2,B=3:4,C=5:6)) input = "\n\nA,B\n1,3\n2,4\n\nC,D\n5,7\n6,8\n\nE,F\n9,11\n10,12\n" # 3 tables in one file -test(976, fread(input), data.table(A=1:2,B=3:4), warning="Found the last.*discarded.*C,D") -test(977, fread(input, skip="C"), data.table(C=5:6,D=7:8), warning="Found the last.*discarded.*E,F") -test(978.1, fread(input, skip="D"), data.table(C=5:6,D=7:8), warning="Found the last.*discarded.*E,F") -test(978.2, fread(input, skip=",F"), data.table(E=9:10,F=11:12)) -test(978.3, fread(input, skip=3), data.table(V1=1:2, V2=3:4), warning="Found the last.*discarded.*C,D") +test(976, fread(input), error="Line 6 has too few fields. Expecting 2 fields but found 0") +test(977, fread(input, skip="C"), error="Line 10 has too few fields. 
Expecting 2 fields but found 0") +test(978.1, fread(input, skip="D"), error="Line 10") +test(978.2, fread(input, skip=",F"), data.table(E=9:10, F=11:12)) +test(978.3, fread(input, skip=9), data.table(E=9:10, F=11:12)) # mixed add and update in same `:=` bug/crash, #2528 and #2778 DT = data.table(x=rep(1:2, c(3,2)), y=6:10) @@ -7409,8 +7409,9 @@ str2="YYYY MM DD HH mm 19490 40790 test(1555.14, fread(str1), fread(str2)) # fix for #1330 -test(1556.1, fread(testDir("issue_1330_fread.txt"), nrow=2), data.table(a=1:2, b=1:2), warning="Found.*discarded.*<<3.*3>>") -test(1556.2, fread(testDir("issue_1330_fread.txt"), nrow=4), data.table(a=1:2, b=1:2), warning="Found.*discarded.*<<3.*3>>") +test(1556.1, fread(testDir("issue_1330_fread.txt"), nrow=2), data.table(a=1:2, b=1:2)) +test(1556.2, fread(testDir("issue_1330_fread.txt"), nrow=3), error="Line 4 has too few fields") +test(1556.3, fread(testDir("issue_1330_fread.txt"), nrow=4), error="Line 4 has too few fields") # FR #768 str="1,2\n3,4\n" @@ -7766,7 +7767,7 @@ test(1585.2, f1(testDir("536_fread_fill_1.txt"), b=TRUE), f2(testDir("536_fread_ test(1585.3, f1(testDir("536_fread_fill_2.txt")), f2(testDir("536_fread_fill_2.txt"))) test(1585.4, f1(testDir("536_fread_fill_2.txt"), b=TRUE), f2(testDir("536_fread_fill_2.txt"), b=TRUE)) -test(1585.5, f1(testDir("536_fread_fill_3_extreme.txt")), f2(testDir("536_fread_fill_3_extreme.txt"))[-(7:9),]) +test(1585.5, f1(testDir("536_fread_fill_3_extreme.txt")), f2(testDir("536_fread_fill_3_extreme.txt"))[-9,]) test(1585.6, f1(testDir("536_fread_fill_3_extreme.txt"), b=TRUE), f2(testDir("536_fread_fill_3_extreme.txt"), b=TRUE)) # no warning about bumping type. when fill=TRUE, column type detection starts at first non-empty line (which makes sense). 
test(1585.7, f1(testDir("536_fread_fill_4.txt")), f2(testDir("536_fread_fill_4.txt"))[-29,]) @@ -10978,8 +10979,8 @@ test(1808.2, fread("A,B\r1,2\r3,4\r"), data.table(A=c(1L,3L),B=c(2L,4L))) cat("A,B\r1,2\r3,4",file=f<-tempfile()) test(1808.3, fread(f), data.table(A=c(1L,3L),B=c(2L,4L))) unlink(f) -test(1808.4, fread("A,B\r1,3\r\r\r2,4\r"), data.table(A=TRUE, B=3L), warning="last consistent line") -test(1808.5, fread("A,B\r4,3\r\r \r2,4\r"), data.table(A=4L, B=3L), warning="afterwards.*discarded.*<<2,4>>") +test(1808.4, fread("A,B\r1,3\r\r\r2,4\r"), data.table(A=TRUE, B=3L), warning="Discarded footer: <<2,4>>") +test(1808.5, fread("A,B\r4,3\r\r \r2,4\r"), data.table(A=4L, B=3L), warning="Discarded footer: <<2,4>>") test(1808.6, fread("A,B\r1,3\r\r \r2,4\r", blank.lines.skip=TRUE), data.table(A=1:2, B=3:4)) test(1808.7, fread("A,B\r1,3\r\r \r2,4\r", fill=TRUE), data.table(A=c(1L,NA,NA,2L), B=c(3L,NA,NA,4L))) test(1808.8, fread("A,B\r1,3\r\r \r2,\r", blank.lines.skip=TRUE, fill=TRUE), data.table(A=1:2, B=c(3L,NA))) @@ -11015,10 +11016,10 @@ test(1818, fread(testDir("session_aborted_fatal_error.txt"))[c(1,.N),c(1,2,250,2 # expansion of uses of as.ITime.character, PR#1796 test(1819, as.ITime("2015-09-29 08:22:00"), structure(30120L, class = "ITime")) -# Issue 2287: the % sign in the error message should not be interpreted as a format string! -test(1820.1, fread("name,id\nfoo,1\nbar%\n"), error="Line 3 has too few.*Expecting 2 fields but found 1.*<>") -test(1820.2, fread("name,id\nfoo,1\nbar%d"), error="Line 3 has too few.*Expecting 2 fields but found 1.*<>") -test(1820.3, fread("name,id\nfoo,1\nbar%s"), error="Line 3 has too few.*Expecting 2 fields but found 1.*<>") +# Issue 2287: the % sign in the error/warning message should not be interpreted as a format string! 
+test(1820.1, fread("name,id\nfoo,2\nbar%\n"), data.table(name="foo", id=2L), warning="Discarded footer: <>") +test(1820.2, fread("name,id\nfoo,2\nbar%d"), data.table(name="foo", id=2L), warning="Discarded footer: <>") +test(1820.3, fread("name,id\nfoo,2\nbar%s"), data.table(name="foo", id=2L), warning="Discarded footer: <>") # new argument for print.data.table: col.names # issue #1482 / PR #1483 @@ -11369,7 +11370,7 @@ test(1856.2, fread("A,B\n\n"), ans) test(1856.3, fread("A,B\n\n\n"), ans) test(1856.4, fread("A,B\n3,4\n\n\n"), data.table(A=3L, B=4L)) test(1856.5, fread("A,B\n3,4\n,\n\n\n"), data.table(A=c(3L,NA), B=c(4L,NA))) -test(1856.6, fread("A,B\n3,4\n\n5,6\n"), data.table(A=3L, B=4L), warning="text exists afterwards") +test(1856.6, fread("A,B\n3,4\n\n5,6\n"), data.table(A=3L, B=4L), warning="Discarded footer: <<5,6>>") DTs = list( # passed fread(fwrite(DT))==DT before fix? data.table(A=logical(0)), # yes data.table(A=NA), # no @@ -11510,7 +11511,7 @@ test(1869.1, fread("A\r1\r\r\r2\r"), data.table(A=c(1L,NA,NA,2L))) test(1869.2, fread("A\r1\r\r\r2\r\r"), data.table(A=c(1L,NA,NA,2L,NA))) test(1869.3, fread("A\r1\r\r\r2\r\r\r"), data.table(A=c(1L,NA,NA,2L,NA,NA))) test(1869.4, fread("A,B\r2,3\r,\r,\r4,5\r\r"), data.table(A=c(2L,NA,NA,4L), B=c(3L,NA,NA,5L))) -test(1869.5, fread("A,B\r2,3\r\r,\r2,4\r\r"), data.table(A=2L, B=3L), warning="consistent line") +test(1869.5, fread("A,B\r2,3\r\r,\r2,4\r\r"), error="Line 3 has too few fields. Expecting 2 fields but found 0.") # two line footer because of the comma. Only 1 line footers are auto discarded. 
test(1869.6, fread(testDir("colnames4096.csv")), error="very unusual.*one single line without any.*r.*n at the end.*and.*multiple of 4096") test(1869.7, fread(testDir("onecol4096.csv")), error="very unusual.*single column.*multiple of 4096.*ends with 2 or more end-of-line") diff --git a/src/fread.c b/src/fread.c index ccb9780bf4..d64881801b 100644 --- a/src/fread.c +++ b/src/fread.c @@ -181,14 +181,17 @@ static inline size_t clamp_szt(size_t x, size_t lower, size_t upper) { * Parameter `limit` cannot exceed 500. */ static const char* strlim(const char *ch, size_t limit) { - static char buf[1002]; + static char buf[1004]; static int flip = 0; - char *ptr = buf + 501 * flip; + char *ptr = buf + 502 * flip; flip = 1 - flip; char *ch2 = ptr; if (limit>500) limit=500; size_t width = 0; - while ((*ch>'\r' || (*ch!='\0' && *ch!='\r' && *ch!='\n')) && width++'\r' || (*ch!='\0' && *ch!='\r' && *ch!='\n')) && width++=eof) break; // The 9th jump could reach the end in the same situation and that's ok. As long as the end is sampled is what we want. if (jump>0 && !nextGoodLine(&ch, ncol)) { // skip this jump for sampling. Very unusual and in such unusual cases, we don't mind a slightly worse guess. 
+ //lastSampleJumpOk = false; continue; } + //lastSampleJumpOk = true; bool bumped = false; // did this jump find any different types; to reduce verbose output to relevant lines bool skipThisJump = false; int jumpLine = 0; // line from this jump point start @@ -1603,22 +1608,25 @@ int freadMain(freadMainArgs _args) { if (thisLineLen>maxLen) maxLen=thisLineLen; } if (skipThisJump) continue; - if (jump==nJumps-1) lastSampleJumpOk = true; + // if (jump==nJumps-1) lastSampleJumpOk = true; if (bumped) memcpy(type, tmpType, (size_t)ncol); if (verbose && (bumped || jump==0 || jump==nJumps-1)) { DTPRINT(" Type codes (jump %03d) : %s Quote rule %d\n", jump, typesAsString(ncol), quoteRule); } } - if (lastSampleJumpOk) { - while (ch>", strlim(ch,200)); - } else { - // nextGoodLine() was false for the last (extra) jump to check the end - // must set lastRowEnd to eof accordingly otherwise it'll be left wherever the last good jump finished - lastRowEnd = eof; + } else { + // nextGoodLine() was false for the last (extra) jump to check the end + // must set lastRowEnd to eof accordingly otherwise it'll be left wherever the last good jump finished + lastRowEnd = eof; + } } - +*/ ch = pos; if (args.header==NA_BOOL8) { for (int j=0; j=0) { - stopTeam = true; - if (myWrongNumberFields>", - (llu)ctx.DTi+myNrow+row1line, ncol, myWrongNumberFields, strlim(tlineStart, 500)); - } else { - snprintf(stopErr, stopErrSize, - "Line %llu has more than the expected %d fields. Stopped on <<%s>> at character %d. " - "Consider setting 'comment.char=' if there is a trailing comment to be ignored. 
First 500 characters of line: <<%s>>", - (llu)ctx.DTi+myNrow+row1line, ncol, strlim(tch+1,10), (int)(tch-tlineStart+2), strlim(tlineStart,500)); + if (jump==nJumps-1) { // the last jump; we should be at the end of the file or at the start of the footer + const char *tt = tlineStart; + while (tt=0) { + stopTeam = true; + if (myWrongNumberFields>", + (llu)ctx.DTi+myNrow+row1line, ncol, myWrongNumberFields, strlim(tlineStart, 500)); + } else { + snprintf(stopErr, stopErrSize, + "Line %llu has more than the expected %d fields. Stopped on <<%s>> at character %d. " + "Consider setting 'comment.char=' if there is a trailing comment to be ignored. First 500 characters of line: <<%s>>", + (llu)ctx.DTi+myNrow+row1line, ncol, strlim(tch+1,10), (int)(tch-tlineStart+2), strlim(tlineStart,500)); + } } } // tell next thread (she not me) 2 things : @@ -2344,6 +2374,9 @@ int freadMain(freadMainArgs _args) { } } setFinalNrow(DTi); + if (skippedFooter) { + DTWARN("Discarded footer: <<%s>>", strlim(skippedFooter,500)); + } if (verbose) { DTPRINT("=============================\n"); From 67b39b39074d96c2a3cab7ab48dda338d1dd43ca Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Fri, 9 Feb 2018 22:03:48 -0800 Subject: [PATCH 06/14] Interim --- inst/tests/tests.Rraw | 24 +++++++++++++++--------- src/fread.c | 21 ++++++++++++++++++++- 2 files changed, 35 insertions(+), 10 deletions(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index c97bd80f89..e6521467a5 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -7423,9 +7423,9 @@ test(1557.5, names(fread(str, col.names=1:2)), error="Passed a vector of type") # Fix for #773 f = testDir("issue_773_fread.txt") -ans = data.table(AAA=as.character(c(4,7,rep(1,17),31,21)), - BBB=as.character(c(5,8,rep(2,17),32,22)), - CCC=as.integer(c(6,9,rep(3,17),33,23))) +ans = data.table(AAA=INT(c(4,7,rep(1,17),31,21)), + BBB=INT(c(5,8,rep(2,17),32,22)), + CCC=INT(c(6,9,rep(3,17),33,23))) test(1558.1, fread(f), error="Line 23 has too few 
fields.*Expecting 3 fields but found 2.*<>") test(1558.2, fread(f, nrow=21L), ans) test(1558.3, fread(f, nrow=21L, fill=TRUE), ans) @@ -11522,12 +11522,18 @@ test(1870.3, fread("A,B,\n,,\n,500,3.4"), data.table(A=NA, B=c(NA,500L), V3=c(NA # nrows= now ignores errors after those nrows as expected and skip= determines first row for sure, #1267 txt = "V1, V2, V3\n2,3,4\nV4, V5, V6, V7\n4,5,6,7\n8,9,10,11\n" -test(1871.1, fread(txt), data.table(V4=INT(4,8), V5=INT(5,9), V6=INT(6,10), V7=INT(7,11))) -test(1871.2, fread(txt, nrows=1), data.table(V4=4L, V5=5L, V6=6L, V7=7L)) -test(1871.3, fread(txt, skip=0), ans<-data.table(V1=2L, V2=3L, V3=4L), warning="discarded line V4, V5") -test(1871.4, fread(txt, skip=0, nrows=1), ans) -test(1871.5, fread(txt, skip=0, nrows=1, header=TRUE), ans) -test(1871.6, fread(txt, skip=0, nrows=1, header=FALSE), data.table(V1=c("V1","2"), V2=c("V2","3"), V3=c("V3","4"))) +test(1871.1, fread(txt), ans <- data.table(V4=INT(4,8), V5=INT(5,9), V6=INT(6,10), V7=INT(7,11))) +test(1871.2, fread(txt, skip=2), ans) +test(1871.3, fread(txt, skip=2, nrow=1), ans[1,]) +test(1871.4, fread(txt, skip=2, nrow=3), ans) +test(1871.5, fread(txt, skip=3), ans <- data.table(V1=INT(4,8), V2=INT(5,9), V3=INT(6,10), V4=INT(7,11))) +test(1871.6, fread(txt, skip=3, nrow=1), ans[1,]) +test(1871.7, fread(txt, nrows=1), data.table(V1=2L, V2=3L, V3=4L)) +test(1871.8, fread(txt, skip=0), error="Line 3 has more than the expected 3 fields.*<>") +test(1871.9, fread(txt, skip=0, nrows=1), ans<-data.table(V1=2L, V2=3L, V3=4L)) +test(1871.11, fread(txt, skip=0, nrows=1, header=TRUE), ans) +test(1871.12, fread(txt, skip=0, nrows=1, header=FALSE), data.table(V1="V1", V2="V2", V3="V3")) +test(1871.13, fread(txt, skip=0, nrows=2, header=FALSE), data.table(V1=c("V1","2"), V2=c("V2","3"), V3=c("V3","4"))) # for ( i in 100:1) { # lines <- paste0(paste(rep("1,2,3", i), collapse='\n'), "\n1,2") # fread(lines, nrows=i) diff --git a/src/fread.c b/src/fread.c index d64881801b..4c849189e5 
100644 --- a/src/fread.c +++ b/src/fread.c @@ -1997,7 +1997,7 @@ int freadMain(freadMainArgs _args) { .anchor = thisJumpStart, }; - while (tch1 || DTi+myNrow>", strlim(skippedFooter,500)); } + else if (prevJumpEnd>", strlim(skippedFooter,500)); + } + else { + STOP("More than one line: <<%s>>", strlim(skippedFooter,500)); + } + } + } if (verbose) { DTPRINT("=============================\n"); From 5ebf28bc9e881367fcda8b22ea0048e1ee281e21 Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Mon, 12 Feb 2018 03:22:14 -0800 Subject: [PATCH 07/14] Interim --- inst/tests/tests.Rraw | 81 ++++++++++++----------- src/fread.c | 145 ++++++++++++++++++------------------------ 2 files changed, 105 insertions(+), 121 deletions(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index e6521467a5..f8af6e4fd3 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -2746,9 +2746,9 @@ test(966, fread(input, colClasses=list(character=2:4)), error="Column number 4 ( test(967, nrow(fread( paste( rep('a\tb\n', 10000), collapse=''), header=FALSE)), 10000L) # Test fread warns about removal of any footer (and autostart skips up over it) -test(968, fread("A,B\n1,3\n2,4\n\nRowcount: 2\n"), data.table(A=1:2,B=3:4), warning="Discarded footer.*Rowcount: 2") -test(969, fread("A,B\n1,3\n2,4\n\n\nRowcount: 2"), data.table(A=1:2,B=3:4), warning="Discarded footer.*Rowcount: 2") -test(970, fread("A,B\n1,3\n2,4\n\n\nRowcount: 2\n\n"), data.table(A=1:2,B=3:4), warning="Discarded footer.*Rowcount: 2") +test(968, fread("A,B\n1,3\n2,4\n\nRowcount: 2\n"), data.table(A=1:2,B=3:4), warning="Discarded single-line footer.*Rowcount: 2") +test(969, fread("A,B\n1,3\n2,4\n\n\nRowcount: 2"), data.table(A=1:2,B=3:4), warning="Discarded single-line footer.*Rowcount: 2") +test(970, fread("A,B\n1,3\n2,4\n\n\nRowcount: 2\n\n"), data.table(A=1:2,B=3:4), warning="Discarded single-line footer.*Rowcount: 2") # fread skip override input = "some,bad,data\nA,B,C\n1,3,5\n2,4,6\n" @@ -2758,9 +2758,9 @@ test(973, 
fread(input, skip=2), data.table(V1=1:2,V2=3:4,V3=5:6)) test(974, fread(input, skip=2, header=TRUE), data.table("1"=2L,"3"=4L,"5"=6L)) test(975, fread(input, skip="B"), data.table(A=1:2,B=3:4,C=5:6)) input = "\n\nA,B\n1,3\n2,4\n\nC,D\n5,7\n6,8\n\nE,F\n9,11\n10,12\n" # 3 tables in one file -test(976, fread(input), error="Line 6 has too few fields. Expecting 2 fields but found 0") -test(977, fread(input, skip="C"), error="Line 10 has too few fields. Expecting 2 fields but found 0") -test(978.1, fread(input, skip="D"), error="Line 10") +test(976, fread(input), data.table(A=1:2, B=3:4), warning="Stopped early on line 6.*First discarded non-empty line: <>") +test(977, fread(input, skip="C"), ans<-data.table(C=5:6, D=7:8), warning="Stopped early on line 10.*First discarded non-empty line: <>") +test(978.1, fread(input, skip="D"), ans, warning="Stopped.*line 10.*<>") test(978.2, fread(input, skip=",F"), data.table(E=9:10, F=11:12)) test(978.3, fread(input, skip=9), data.table(E=9:10, F=11:12)) @@ -2887,17 +2887,19 @@ DT = data.table(a=c(NA,NA,FALSE,FALSE), b=c(1,1,2,2)) test(1009, DT[,list(mean(a), sum(a)),by=b], data.table(b=c(1,2),V1=c(NA,0),V2=c(NA_integer_,0L))) # sum(logical()) should be integer, not real # an fread error shouldn't hold a lock on the file on Windows -f = tempfile() -cat('A,B\n1,2\n3\n5,6\n', file=f) -test(1010.1, fread(f), error="Line 3 has too few fields.*Expecting 2 fields but found 1.*fill.*TRUE") +cat('A,B\n1,2\n3\n5,6\n', file=(f<-tempfile())) +test(1010.1, fread(f), ans<-data.table(A=TRUE, B=2L), warning=(txt<-"Stopped early on line 3.*Expected 2 fields but found 1.*fill.*TRUE.*<<3>>")) +oldw = options(warn=2) # !!TODO!!: this doesn't seem sufficient in test framework to turn the warning into error. 
+test(1010.2, fread(f), ans, warning=txt) cat('7\n8,9',file=f,append=TRUE) # that append works after error -test(1010.2, fread(f,fill=TRUE), data.table(A=INT(1,3,5,7,8), B=INT(2,NA,6,NA,9))) -test(1010.3, fread(f), error="Line 3 has too few fields.*Expecting 2 fields but found 1.*fill.*TRUE") +test(1010.3, fread(f,fill=TRUE), data.table(A=INT(1,3,5,7,8), B=INT(2,NA,6,NA,9))) +test(1010.4, fread(f), ans, warning=txt) cat('A,B\n1,2\n3\n5,6\n', file=f) # that overwrite works after error -test(1010.4, fread(f,fill=TRUE), data.table(A=INT(1,3,5), B=INT(2,NA,6))) -test(1010.5, fread(f), error="Line 3 has too few fields.*Expecting 2 fields but found 1.*fill.*TRUE") +test(1010.5, fread(f,fill=TRUE), data.table(A=INT(1,3,5), B=INT(2,NA,6))) +test(1010.6, fread(f), ans, warning=txt) unlink(f) # that file can be removed after error -test(1010.6, !file.exists(f)) +test(1010.7, !file.exists(f)) +options(oldw) # detection of unescaped quotes, quote rule 3 test(1011, fread('A,B\n"aa",1\n"bb,2\n"cc",3\n'), data.table(A=c('aa', '"bb', 'cc'), B=1:3)) @@ -6189,7 +6191,8 @@ test(1451.8, shallow(DT, character(0)), null.data.table()) # length-0 input wor test(1452, fread("notexist.csv"), error="File 'notexist.csv' does not exist; getwd()==") # Test for #802 -test(1453, fread(testDir("fread_line_error.csv")), error="Line 12 has more than.*24 fields.*Stopped on <<,M,B.Y,Q.B>> at character 61.*<<31,3-0-7 4:1:7.5 HVV,") +test(1453, fread(testDir("fread_line_error.csv")), fread(testDir("fread_line_error.csv"), nrow=11), + warning="Stopped.*line 12. Expected 24 fields but found 47.*First discarded non-empty line: <<31,3-0-7 4:1:7.5 HVV,") # TODO: add comment=="#". Ensure only after last field is observed. 
# no-sep-found => sep="\n", use case for this in #738 @@ -7409,9 +7412,9 @@ str2="YYYY MM DD HH mm 19490 40790 test(1555.14, fread(str1), fread(str2)) # fix for #1330 -test(1556.1, fread(testDir("issue_1330_fread.txt"), nrow=2), data.table(a=1:2, b=1:2)) -test(1556.2, fread(testDir("issue_1330_fread.txt"), nrow=3), error="Line 4 has too few fields") -test(1556.3, fread(testDir("issue_1330_fread.txt"), nrow=4), error="Line 4 has too few fields") +test(1556.1, fread(testDir("issue_1330_fread.txt"), nrow=2), ans<-data.table(a=1:2, b=1:2)) +test(1556.2, fread(testDir("issue_1330_fread.txt"), nrow=3), ans, warning=w<-"Stopped early on line 4. Expected 2.*found 0.*First discarded non-empty line: <<3.*3>>") +test(1556.3, fread(testDir("issue_1330_fread.txt"), nrow=4), ans, warning=w) # FR #768 str="1,2\n3,4\n" @@ -7426,10 +7429,10 @@ f = testDir("issue_773_fread.txt") ans = data.table(AAA=INT(c(4,7,rep(1,17),31,21)), BBB=INT(c(5,8,rep(2,17),32,22)), CCC=INT(c(6,9,rep(3,17),33,23))) -test(1558.1, fread(f), error="Line 23 has too few fields.*Expecting 3 fields but found 2.*<>") +test(1558.1, fread(f), ans, warning=w<-"Stopped early on line 23. 
Expected 3 fields but found 2[.].*First discarded non-empty line: <>") test(1558.2, fread(f, nrow=21L), ans) test(1558.3, fread(f, nrow=21L, fill=TRUE), ans) -test(1558.4, fread(f, nrow=22L), error="Line 23 has too few fields.*Expecting 3 fields but found 2.*<>") +test(1558.4, fread(f, nrow=22L), ans, warning=w) test(1558.5, fread(f, nrow=22L, fill=TRUE), rbind(ans, list("ZZZ","YYY",NA))) # FR # 1338 -- check.names argument of setDT @@ -7600,7 +7603,7 @@ test(1577.3, levels(X$b), character(0)) input = "Header not 2 columns\n\n1,3\n2,4" test(1578.0, fread(input), data.table(V1=1:2, V2=3:4)) input = "a,b\n\n1,3\n2,4" -test(1578.1, fread(input), data.table(a=logical(), b=logical()), warning="Found.*discarded.*<<1,3>>") +test(1578.1, fread(input), data.table(a=logical(), b=logical()), warning="Stopped early on line 2[.].*First discarded.*<<1,3>>") test(1578.2, fread(input, blank.lines.skip=TRUE), data.table( a=1:2, b=3:4)) input = "a,b\n\n\n1,3\n2,4" test(1578.3, fread(input, blank.lines.skip=TRUE), data.table( a=1:2, b=3:4)) @@ -7608,9 +7611,10 @@ input = "a,b\n\n\n1,3\n\n2,4\n\n" test(1578.4, fread(input, blank.lines.skip=TRUE), data.table( a=1:2, b=3:4)) f = testDir("530_fread.txt") -test(1578.5, fread(f, skip=47L), data.table(a=logical(), b=logical()), warning="Found.*discarded.*<<1,3>>") +test(1578.5, fread(f, skip=47L), data.table(a=logical(), b=logical()), warning="Stopped early.*discarded.*<<1,3>>") test(1578.6, fread(f, skip=49L), data.table(V1=1:2, V2=3:4)) test(1578.7, fread(f, skip=47L, blank.lines.skip=TRUE), data.table(a=1:2, b=3:4)) +test(1578.8, fread(f, skip=48L), data.table(V1=1:2, V2=3:4)) # start on blank line 49 and skip="auto" to first data row on line 50 # gforce optimisations dt = data.table(x = sample(letters, 300, TRUE), @@ -10528,8 +10532,8 @@ if ("package:nanotime" %in% search()) { # check too many fields error from ,\n line ending highlighted in #2044 test(1753.1, fread("X,Y\n1,2\n3,4\n5,6"), data.table(X=INT(1,3,5),Y=INT(2,4,6))) 
-test(1753.2, fread("X,Y\n1,2\n3,4,\n5,6"), error="Line 3 has more than.*2 fields. Stopped on <<,>> at character 4.*<<3,4,>>") -test(1753.3, fread("X,Y\n1,2\n3,4,7\n5,6"), error="Line 3 has more than.*2 fields. Stopped on <<,7>> at character 4.*<<3,4,7>>") +test(1753.2, fread("X,Y\n1,2\n3,4,\n5,6"), ans<-data.table(X=TRUE,Y=2L), warning="Stopped.*line 3. Expected 2 fields but found 3.*discarded.*<<3,4,>>") +test(1753.3, fread("X,Y\n1,2\n3,4,7\n5,6"), ans, warning="Stopped.*line 3. Expected 2 fields but found 3.*discarded.*<<3,4,7>>") # issue 2051 where a quoted field contains ", New quote rule detection handles it. test(1753.4, fread(testDir("issue_2051.csv"))[2,grep("^Our.*tool$",COLUMN50)], 1L) @@ -10979,8 +10983,8 @@ test(1808.2, fread("A,B\r1,2\r3,4\r"), data.table(A=c(1L,3L),B=c(2L,4L))) cat("A,B\r1,2\r3,4",file=f<-tempfile()) test(1808.3, fread(f), data.table(A=c(1L,3L),B=c(2L,4L))) unlink(f) -test(1808.4, fread("A,B\r1,3\r\r\r2,4\r"), data.table(A=TRUE, B=3L), warning="Discarded footer: <<2,4>>") -test(1808.5, fread("A,B\r4,3\r\r \r2,4\r"), data.table(A=4L, B=3L), warning="Discarded footer: <<2,4>>") +test(1808.4, fread("A,B\r1,3\r\r\r2,4\r"), data.table(A=TRUE, B=3L), warning="Discarded single-line footer: <<2,4>>") +test(1808.5, fread("A,B\r4,3\r\r \r2,4\r"), data.table(A=4L, B=3L), warning="Discarded single-line footer: <<2,4>>") test(1808.6, fread("A,B\r1,3\r\r \r2,4\r", blank.lines.skip=TRUE), data.table(A=1:2, B=3:4)) test(1808.7, fread("A,B\r1,3\r\r \r2,4\r", fill=TRUE), data.table(A=c(1L,NA,NA,2L), B=c(3L,NA,NA,4L))) test(1808.8, fread("A,B\r1,3\r\r \r2,\r", blank.lines.skip=TRUE, fill=TRUE), data.table(A=1:2, B=c(3L,NA))) @@ -11017,9 +11021,9 @@ test(1818, fread(testDir("session_aborted_fatal_error.txt"))[c(1,.N),c(1,2,250,2 test(1819, as.ITime("2015-09-29 08:22:00"), structure(30120L, class = "ITime")) # Issue 2287: the % sign in the error/warning message should not be interpreted as a format string! 
-test(1820.1, fread("name,id\nfoo,2\nbar%\n"), data.table(name="foo", id=2L), warning="Discarded footer: <>") -test(1820.2, fread("name,id\nfoo,2\nbar%d"), data.table(name="foo", id=2L), warning="Discarded footer: <>") -test(1820.3, fread("name,id\nfoo,2\nbar%s"), data.table(name="foo", id=2L), warning="Discarded footer: <>") +test(1820.1, fread("name,id\nfoo,2\nbar%\n"), data.table(name="foo", id=2L), warning="Discarded single-line footer: <>") +test(1820.2, fread("name,id\nfoo,2\nbar%d"), data.table(name="foo", id=2L), warning="Discarded single-line footer: <>") +test(1820.3, fread("name,id\nfoo,2\nbar%s"), data.table(name="foo", id=2L), warning="Discarded single-line footer: <>") # new argument for print.data.table: col.names # issue #1482 / PR #1483 @@ -11046,7 +11050,8 @@ src = paste(c("A,B", paste(rep("3,4", 10000), collapse="\n"), ""), collapse="\n") -test(1822, fread(src), error="Line 102 has too few.*Expecting 2 fields but found 1.*<<999>>") +test(1822, fread(src), data.table(A=rep(1L,100L), B=2L), warning="Stopped early on line 102. Expected 2 fields but found 1.*discarded.*<<999>>") +# NB: The first sample jump uses the first 100 rows and just misses the 999. Since the data is large enough, the other jumps capture the type bump from 1 (bool) to 3 (int). # Issue 2326: .SD mistakenly includes column being set when get() appears in j DT <- data.table(x = seq(1, 10), y = seq(10, 1)) @@ -11174,8 +11179,8 @@ for (i in 0:1000) { if (i==502) write("-999,Bad,Line,0.0,0.0,extra\n", f, append=TRUE) } test(1835, fread(f, verbose=TRUE), - output = "Not using sample from jump 50.*could not establish the next true line start.*jumps=[0..2)", - error = "Line 42253 has more than the expected 5 fields.*<<-999,Bad,Line,0.0,0.0,extra>>") + output = "A line with too-few or too-many.*jump 50.*Type bumps.*ignored", + warning = "Stopped.*line 42253. 
Expected 5 fields but found 6.*discarded.*<<-999,Bad,Line,0.0,0.0,extra>>") unlink(f) test(1836, fread('1,2,"3,a"\n4,5,"6,b"'), data.table(V1=c(1L,4L), V2=c(2L,5L), V3=c("3,a","6,b"))) # 2196 @@ -11209,7 +11214,7 @@ test(1839.6, fread(txt, sep=""), data.table("DECLARATION OF INDEPENDENCE"=lines[ txt = 'a,b\n ab,cd,ce\n abcdef\n hjkli \n' # now auto detected as ncol 1 anyway test(1840.1, fread(txt), data.table("a,b" = c("ab,cd,ce","abcdef","hjkli"))) write('a,b\n ab,cd,ce\nabc,def \n hj,kli ', f<-tempfile()) # write to file to generate \r\n line ending on Windows, test 1840.6 below -test(1840.2, fread(f), error="more than the expected") +test(1840.2, fread(f), data.table(a=logical(), b=logical()), warning="Stopped early on line 2.*discarded.*<>") test(1840.3, fread(f, sep=NA), error="!is.na(sep) is not TRUE") test(1840.4, fread(f, sep=NA_character_), error="!is.na(sep) is not TRUE") test(1840.5, fread(f, sep=""), ans<-data.table("a,b"=c("ab,cd,ce","abc,def","hj,kli"))) @@ -11370,7 +11375,7 @@ test(1856.2, fread("A,B\n\n"), ans) test(1856.3, fread("A,B\n\n\n"), ans) test(1856.4, fread("A,B\n3,4\n\n\n"), data.table(A=3L, B=4L)) test(1856.5, fread("A,B\n3,4\n,\n\n\n"), data.table(A=c(3L,NA), B=c(4L,NA))) -test(1856.6, fread("A,B\n3,4\n\n5,6\n"), data.table(A=3L, B=4L), warning="Discarded footer: <<5,6>>") +test(1856.6, fread("A,B\n3,4\n\n5,6\n"), data.table(A=3L, B=4L), warning="Discarded single-line footer: <<5,6>>") DTs = list( # passed fread(fwrite(DT))==DT before fix? 
data.table(A=logical(0)), # yes data.table(A=NA), # no @@ -11430,7 +11435,9 @@ test(1864.2, DT[J("\u516c\u5141\u4ef7\u503c\u53d8\u52a8\u635f\u76ca"), z], 1L) data = rep("a,b,c,d,e,f,g", 2100) data[111] = "a,b,c,d,e,f,g," cat(data, file=(f<-tempfile()), sep="\n") -test(1865, fread(f, header=FALSE), error="Line 111.*more than.*7 fields.*Stopped on <<,>> at character 14.*<>") +test(1865, fread(f, header=FALSE), + data.table(V1=rep("a",110),V2="b",V3="c",V4="d",V5="e",V6="f",V7="g"), + warning="Stopped early on line 111. Expected 7.*found 8.*discarded.*<>") unlink(f) # "Natural" provision of value.name in measure.vars list, #1547 and #2551 @@ -11511,7 +11518,7 @@ test(1869.1, fread("A\r1\r\r\r2\r"), data.table(A=c(1L,NA,NA,2L))) test(1869.2, fread("A\r1\r\r\r2\r\r"), data.table(A=c(1L,NA,NA,2L,NA))) test(1869.3, fread("A\r1\r\r\r2\r\r\r"), data.table(A=c(1L,NA,NA,2L,NA,NA))) test(1869.4, fread("A,B\r2,3\r,\r,\r4,5\r\r"), data.table(A=c(2L,NA,NA,4L), B=c(3L,NA,NA,5L))) -test(1869.5, fread("A,B\r2,3\r\r,\r2,4\r\r"), error="Line 3 has too few fields. Expecting 2 fields but found 0.") # two line footer because of the comma. Only 1 line footers are auto discarded. +test(1869.5, fread("A,B\r2,3\r\r,\r2,4\r\r"), data.table(A=2L, B=3L), warning="Stopped.*line 3. 
Expected 2 fields but found 0.*First discarded non-empty line: <<,>>") # two line footer because of the comma test(1869.6, fread(testDir("colnames4096.csv")), error="very unusual.*one single line without any.*r.*n at the end.*and.*multiple of 4096") test(1869.7, fread(testDir("onecol4096.csv")), error="very unusual.*single column.*multiple of 4096.*ends with 2 or more end-of-line") @@ -11529,7 +11536,7 @@ test(1871.4, fread(txt, skip=2, nrow=3), ans) test(1871.5, fread(txt, skip=3), ans <- data.table(V1=INT(4,8), V2=INT(5,9), V3=INT(6,10), V4=INT(7,11))) test(1871.6, fread(txt, skip=3, nrow=1), ans[1,]) test(1871.7, fread(txt, nrows=1), data.table(V1=2L, V2=3L, V3=4L)) -test(1871.8, fread(txt, skip=0), error="Line 3 has more than the expected 3 fields.*<>") +test(1871.8, fread(txt, skip=0), data.table(V1=2L, V2=3L, V3=4L), warning="Stopped early.*line 3. Expected 3 fields but found 4.*discarded.*<>") test(1871.9, fread(txt, skip=0, nrows=1), ans<-data.table(V1=2L, V2=3L, V3=4L)) test(1871.11, fread(txt, skip=0, nrows=1, header=TRUE), ans) test(1871.12, fread(txt, skip=0, nrows=1, header=FALSE), data.table(V1="V1", V2="V2", V3="V3")) diff --git a/src/fread.c b/src/fread.c index 4c849189e5..cc1b22e7cf 100644 --- a/src/fread.c +++ b/src/fread.c @@ -939,11 +939,10 @@ static reader_fun_t fun[NUMTYPE] = { static int disabled_parsers[NUMTYPE] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; -static bool detect_types( const char **pch, int8_t type[], int ncol) { +static int detect_types( const char **pch, int8_t type[], int ncol, bool *bumped) { // used in sampling column types and whether column names are present // test at most ncol fields. If there are fewer fields, the data read step later // will error (if fill==false) when the line number is known, so we don't need to handle that here. - bool bumped=false; const char *ch = *pch; double trash; // double so that this throw-away storage is aligned. char trash[8] would not be aligned. 
void *targets[9] = {NULL, &trash, NULL, NULL, &trash, NULL, NULL, NULL, &trash}; @@ -985,16 +984,16 @@ static bool detect_types( const char **pch, int8_t type[], int ncol) { // quoteRule, quoteRule+1, field+1, jline, j, strlim(fieldStart,200)); quoteRule++; } - bumped = true; + *bumped = true; ch = fieldStart; } field++; - if (*ch!=sep) break; + if (*ch!=sep || field==ncol) break; // field==ncol is needed for 1753.2 where line ends with an extra comma but shouldn't, so shouldn't be moved over if (sep==' ') while (ch[1]==' ') ch++; ch++; } *pch = ch; - return bumped; + return field; // the number of fields so caller knows if ncol were read } @@ -1557,7 +1556,7 @@ int freadMain(freadMainArgs _args) { } //lastSampleJumpOk = true; bool bumped = false; // did this jump find any different types; to reduce verbose output to relevant lines - bool skipThisJump = false; + //bool skipThisJump = false; int jumpLine = 0; // line from this jump point start while(ch0) DTPRINT(" A line with too-few or too-many fields was found in sample from jump %d. Type bumps from this jump will be ignored.\n", jump); + bumped = false; + break; + } if (ch==eof && finalByte && tmpType[ncol-1]!=previousLastColType) { // revert bump due to e.g. ,NA in the last field of last row where finalByte=='A' and N caused bump to character (test 894.0221) if (verbose) DTPRINT(" Reverted bump of final column from %d to %d on final field due to finalByte='%c'." @@ -1585,19 +1586,6 @@ int freadMain(freadMainArgs _args) { previousLastColType, tmpType[ncol-1], finalByte); tmpType[ncol-1] = previousLastColType; } - if (!eol(&ch) && *ch!='\0') { - if (jump==0) { - STOP("Line %d has more than the expected %d fields. Stopped on <<%s>> at character %d. " - "Consider setting 'comment.char=' if there is a trailing comment to be ignored. 
First 500 characters of line: <<%s>>", - row1line+jumpLine-1, ncol, strlim(ch-1,10), (int)(ch-lineStart), strlim(lineStart,500)); - } - if (verbose) { - DTPRINT(" Not using sample from jump %d. Looks like a complicated file where nextGoodLine could not establish the next true line start.\n", jump); - // the nrow estimate will still include the (probably wrong) row widths so far from this sample, but that's ok as it's just an estimate - } - skipThisJump = true; - break; - } ch += (*ch=='\n' || *ch=='\r'); lastRowEnd = ch; int thisLineLen = (int)(ch-lineStart); // ch is now on start of next line so this includes line ending already @@ -1606,10 +1594,18 @@ int freadMain(freadMainArgs _args) { sumLenSq += thisLineLen*thisLineLen; if (thisLineLenmaxLen) maxLen=thisLineLen; + if (jump==0 && bumped) { + // apply bumps after each line in the first jump from the start in case invalid line stopped early on is in the first 100 lines. + // otherwise later jumps must complete fully before their bumps are applied. Invalid lines in those are more likely to be due to bad jump start. + memcpy(type, tmpType, (size_t)ncol); + bumped = false; // detect_types() only updates &bumped when it's true. So reset to false here. 
+ } + } + if (bumped) { + // when jump>0, apply the bumps (if any) at the end of the successfully completed jump sample + ASSERT(jump>0, "jump(%d)>0", jump); + memcpy(type, tmpType, (size_t)ncol); } - if (skipThisJump) continue; - // if (jump==nJumps-1) lastSampleJumpOk = true; - if (bumped) memcpy(type, tmpType, (size_t)ncol); if (verbose && (bumped || jump==0 || jump==nJumps-1)) { DTPRINT(" Type codes (jump %03d) : %s Quote rule %d\n", jump, typesAsString(ncol), quoteRule); } @@ -1630,7 +1626,8 @@ int freadMain(freadMainArgs _args) { ch = pos; if (args.header==NA_BOOL8) { for (int j=0; j0) for (int j=0; jtype0 && type[j]type0 can only happen if the column is not all blank @@ -1854,7 +1851,7 @@ int freadMain(freadMainArgs _args) { char stopErr[stopErrSize+1]=""; // must be compile time size: the message is generated and we can't free before STOP size_t DTi = 0; // the current row number in DT that we are writing to const char *prevJumpEnd = pos; // the position after the last line the last thread processed (for checking) - const char *skippedFooter = NULL; // if footer is skipped, this is its location to be printed. + // const char *skippedFooter = NULL; // if footer is skipped, this is its location to be printed. int buffGrown=0; // chunkBytes is the distance between each jump point; it decides the number of jumps // We may want each chunk to write to its own page of the final column, hence 1000*maxLen @@ -1912,7 +1909,7 @@ int freadMain(freadMainArgs _args) { const char *thisJumpStart=NULL; // The first good start-of-line after the jump point size_t myNrow = 0; // the number of rows in my chunk size_t myBuffRows = initialBuffRows; // Upon realloc, myBuffRows will increase to grown capacity - int myWrongNumberFields = -1; // -1 means false. 
If set, it's set to >=0 holding the (wrong) number of fields observed + bool myStoppingEarly = false; // true when an empty or too-short or too-long row is encountered when fill=false // Allocate thread-private row-major `myBuff`s ThreadLocalFreadParsingContext ctx = { @@ -1940,7 +1937,7 @@ int freadMain(freadMainArgs _args) { #pragma omp for ordered schedule(dynamic) reduction(+:thNextGoodLine,thRead,thPush) for (int jump = jump0; jump < nJumps; jump++) { - if (stopTeam) continue; // must continue and not break. We desire not to depend on (relatively new) omp cancel directive, yet + if (stopTeam && !myStoppingEarly) continue; // must continue and not break. We desire not to depend on (relatively new) omp cancel directive, yet double tLast = 0.0; // thread local wallclock time at last measuring point for verbose mode only. if (verbose) tLast = wallclock(); if (myNrow) { @@ -1969,10 +1966,11 @@ int freadMain(freadMainArgs _args) { progress((int)(100.0*jump/nJumps), ETA); } } + if (myStoppingEarly) continue; } const char *tch = pos + (size_t)jump*chunkBytes; - const char *tlineStart = tch; + const char *tLineStart = tch; const char *nextJump = jump= allocnrow) { // a previous thread has already reached the `allocnrow` limit stopTeam = true; myNrow = 0; - myWrongNumberFields = -1; // forget the error, as it occured after the nrow limit requested by user } else if (myNrow + ctx.DTi >= allocnrow) { // current thread's rows will fill all allocnrow if (allocnrow == nrowLimit) { - // allocnrow is the same as nrowLimit, no need to reallocate the DT, - // just truncate the rows in the current chunk - myNrow = nrowLimit - ctx.DTi; - myWrongNumberFields = -1; // e.g. 
test 1558.2 where the format error is after nrowLimit + // the loop above should have stopped when the nrowLimit was reached + ASSERT(myNrow == nrowLimit-ctx.DTi, "myNrow[%llu] == nrowLimit[%llu]-ctx.DTi[%llu]", myNrow, nrowLimit, ctx.DTi); + ASSERT(nth==1, "nth[%d]==1", nth); } else if (myNrow + ctx.DTi > allocnrow) { // We reached `allocnrow` limit, but there are more data to read // left. In this case we arrange to terminate all threads but @@ -2230,41 +2234,11 @@ int freadMain(freadMainArgs _args) { stopTeam = true; } } - if (myWrongNumberFields>=0) { - if (jump==nJumps-1) { // the last jump; we should be at the end of the file or at the start of the footer - const char *tt = tlineStart; - while (tt=0) { - stopTeam = true; - if (myWrongNumberFields>", - (llu)ctx.DTi+myNrow+row1line, ncol, myWrongNumberFields, strlim(tlineStart, 500)); - } else { - snprintf(stopErr, stopErrSize, - "Line %llu has more than the expected %d fields. Stopped on <<%s>> at character %d. " - "Consider setting 'comment.char=' if there is a trailing comment to be ignored. First 500 characters of line: <<%s>>", - (llu)ctx.DTi+myNrow+row1line, ncol, strlim(tch+1,10), (int)(tch-tlineStart+2), strlim(tlineStart,500)); - } - } + if (myStoppingEarly) { + if (stopTeam || myNrow==0) myStoppingEarly=false; + stopTeam=true; } - // tell next thread (she not me) 2 things : + // tell next thread 2 things : prevJumpEnd = tch; // i) the \n I finished on so she can check (above) she started exactly on that \n good line start DTi += myNrow; // ii) which row in the final result she should start writing to since now I know myNrow. 
ctx.nRows = myNrow; @@ -2375,24 +2349,27 @@ int freadMain(freadMainArgs _args) { } setFinalNrow(DTi); - if (skippedFooter) { + /*if (skippedFooter) { DTWARN("Discarded footer: <<%s>>", strlim(skippedFooter,500)); - } - else if (prevJumpEnd>", strlim(skippedFooter,500)); + DTWARN("Discarded single-line footer: <<%s>>", strlim(skippedFooter,500)); } else { - STOP("More than one line: <<%s>>", strlim(skippedFooter,500)); + ch = prevJumpEnd; + int tt = countfields(&ch); + DTWARN("Stopped early on line %llu. Expected %d fields but found %d. Consider fill=TRUE and comment.char=. First discarded non-empty line: <<%s>>", + DTi+row1line, ncol, tt, strlim(skippedFooter,500)); } } } From 469458cda454fde6fba3eaa979132a648690851e Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Mon, 12 Feb 2018 13:22:37 -0800 Subject: [PATCH 08/14] Passing tests locally --- cc.R | 4 ++-- man/fread.Rd | 8 ++++---- src/fread.c | 3 ++- 3 files changed, 8 insertions(+), 7 deletions(-) diff --git a/cc.R b/cc.R index d0014704b3..541e7b6da8 100644 --- a/cc.R +++ b/cc.R @@ -53,9 +53,9 @@ cc = function(test=TRUE, clean=FALSE, debug=FALSE, cc_dir=Sys.getenv("CC_DIR")) cat(getwd(),"\n") if (clean) system("rm *.o *.so") if (debug) { - ret = system("MAKEFLAGS='-j CC=gcc-7 PKG_CFLAGS=-fno-openmp CFLAGS=-std=c99\\ -Og\\ -ggdb\\ -pedantic' R CMD SHLIB -d -o data.table.so *.c") + ret = system("MAKEFLAGS='-j CC=gcc PKG_CFLAGS=-fno-openmp CFLAGS=-std=c99\\ -O0\\ -ggdb\\ -pedantic' R CMD SHLIB -d -o data.table.so *.c") } else { - ret = system("MAKEFLAGS='-j CC=gcc-7 CFLAGS=-fopenmp\\ -std=c99\\ -O3\\ -pipe\\ -Wall\\ -pedantic' R CMD SHLIB -o data.table.so *.c") + ret = system("MAKEFLAGS='-j CC=gcc CFLAGS=-fopenmp\\ -std=c99\\ -O3\\ -pipe\\ -Wall\\ -pedantic' R CMD SHLIB -o data.table.so *.c") # TODO add -Wextra too? 
} if (ret) return() diff --git a/man/fread.Rd b/man/fread.Rd index a300b97ffb..d6b00cca8e 100644 --- a/man/fread.Rd +++ b/man/fread.Rd @@ -11,15 +11,15 @@ \usage{ fread(input, file, sep="auto", sep2="auto", dec=".", quote="\"", nrows=Inf, header="auto", na.strings="NA", -stringsAsFactors=FALSE, verbose=getOption("datatable.verbose"), autostart=NA, -skip=0, select=NULL, drop=NULL, colClasses=NULL, +stringsAsFactors=FALSE, verbose=getOption("datatable.verbose"), +skip="__auto__", select=NULL, drop=NULL, colClasses=NULL, integer64=getOption("datatable.integer64"), # default: "integer64" col.names, check.names=FALSE, encoding="unknown", strip.white=TRUE, fill=FALSE, blank.lines.skip=FALSE, key=NULL, showProgress=interactive(), data.table=getOption("datatable.fread.datatable"), -nThread=getDTthreads(), logical01=TRUE +nThread=getDTthreads(), logical01=TRUE, autostart=NA ) } \arguments{ @@ -32,7 +32,6 @@ nThread=getDTthreads(), logical01=TRUE \item{file}{ File path, useful when we want to ensure that no shell commands will be executed. File path can also be provided to \code{input} argument. } \item{stringsAsFactors}{ Convert all character columns to factors? } \item{verbose}{ Be chatty and report timings? } - \item{autostart}{ Deprecated and ignored with warning. Please use \code{skip} instead. } \item{skip}{ If 0 (default) start on the first line and from there finds the first row with a consistent number of columns. This automatically avoids irregular header information before the column names row. \code{skip>0} means ignore the first \code{skip} rows manually. \code{skip="string"} searches for \code{"string"} in the file (e.g. a substring of the column names row) and starts on that line (inspired by read.xls in package gdata). } \item{select}{ Vector of column names or numbers to keep, drop the rest. } \item{drop}{ Vector of column names or numbers to drop, keep the rest. 
} @@ -51,6 +50,7 @@ nThread=getDTthreads(), logical01=TRUE \item{data.table}{ TRUE returns a \code{data.table}. FALSE returns a \code{data.frame}. } \item{nThread}{The number of threads to use. Experiment to see what works best for your data on your hardware.} \item{logical01}{If TRUE a column containing only 0s and 1s will be read as logical, otherwise as integer.} + \item{autostart}{ Deprecated and ignored with warning. Please use \code{skip} instead. } } \details{ diff --git a/src/fread.c b/src/fread.c index cc1b22e7cf..68f6b94efc 100644 --- a/src/fread.c +++ b/src/fread.c @@ -992,6 +992,7 @@ static int detect_types( const char **pch, int8_t type[], int ncol, bool *bumped if (sep==' ') while (ch[1]==' ') ch++; ch++; } + if (ch==eof && finalByte && finalByte==sep && sep!=' ') field++; // for test 1776.2 *pch = ch; return field; // the number of fields so caller knows if ncol were read } @@ -2110,7 +2111,7 @@ int freadMain(freadMainArgs _args) { } if (thisType != joldType // rare out-of-sample type exception. - && (!finalByte || finalSep)) { // don't bump the final field until we've replaced the finalByte (if any) test 894.0221 where final field is NA and finalByte=='A' + && (tch Date: Mon, 12 Feb 2018 15:26:13 -0800 Subject: [PATCH 09/14] Added another test from the issue --- NEWS.md | 1 + inst/tests/tests.Rraw | 8 ++++---- src/fread.c | 2 +- 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/NEWS.md b/NEWS.md index 5c299a2603..9b7c05bac5 100644 --- a/NEWS.md +++ b/NEWS.md @@ -27,6 +27,7 @@ * `sep=NULL` or `sep=""` (i.e., no column separator) can now be used to specify single column input reliably like `base::readLines`, [#1616](https://github.com/Rdatatable/data.table/issues/1616). `sep='\\n'` still works (even on Windows where line ending is actually `\\r\\n`) but `NULL` or `""` are now documented and recommended. Thanks to Dmitriy Selivanov for the pull request and many others for comments. 
As before, `sep=NA` is not valid; use the default `"auto"` for automatic separator detection. `sep='\\n'` may be deprecated in future. * Single-column input with blank lines is now valid and the blank lines are significant (meaning an NA in the single column). The blank lines are significant even at the very end, which may be surprising on first glance. The change is so that `fread(fwrite(DT))==DT` for single-column inputs containing NA which are written as blank. There is no change when `ncol>1` (i.e., input stops with detailed warning at the first blank line) because a blank line when `ncol>1` is invalid input due to no separators present instead of `ncol-1` separators. * Too few column names are now auto filled with default column names, with warning, [#1625](https://github.com/Rdatatable/data.table/issues/1625). If there is just one missing column name it is guessed to be for the first column (row names or an index), otherwise the column names are filled at the end. Similarly, too many column names now automatically sets `fill=TRUE`, with warning. + * `skip=` and `nrow=` are more reliable and no longer affected by invalid lines outside the range specified. Thanks to Ziyad Saeed and Kyle Chung for reporting, [#1267](https://github.com/Rdatatable/data.table/issues/1267). Tests added. 
* Many thanks to @yaakovfeldman, Guillermo Ponce, Arun Srinivasan, Hugh Parsonage, Mark Klik, Pasha Stetsenko, Mahyar K, Tom Crockett, @cnoelke, @qinjs, @etienne-s, Mark Danese, Avraham Adler, @franknarf1, @MichaelChirico, @tdhock, Luke Tierney for testing before release to CRAN: [#2070](https://github.com/Rdatatable/data.table/issues/2070), [#2073](https://github.com/Rdatatable/data.table/issues/2073), [#2087](https://github.com/Rdatatable/data.table/issues/2087), [#2091](https://github.com/Rdatatable/data.table/issues/2091), [#2107](https://github.com/Rdatatable/data.table/issues/2107), [fst#50](https://github.com/fstpackage/fst/issues/50#issuecomment-294287846), [#2118](https://github.com/Rdatatable/data.table/issues/2118), [#2092](https://github.com/Rdatatable/data.table/issues/2092), [#1888](https://github.com/Rdatatable/data.table/issues/1888), [#2123](https://github.com/Rdatatable/data.table/issues/2123), [#2167](https://github.com/Rdatatable/data.table/issues/2167), [#2194](https://github.com/Rdatatable/data.table/issues/2194), [#2238](https://github.com/Rdatatable/data.table/issues/2238), [#2228](https://github.com/Rdatatable/data.table/issues/2228), [#1464](https://github.com/Rdatatable/data.table/issues/1464), [#2201](https://github.com/Rdatatable/data.table/issues/2201), [#2287](https://github.com/Rdatatable/data.table/issues/2287), [#2299](https://github.com/Rdatatable/data.table/issues/2299), [#2285](https://github.com/Rdatatable/data.table/issues/2285), [#2251](https://github.com/Rdatatable/data.table/issues/2251), [#2347](https://github.com/Rdatatable/data.table/issues/2347), [#2222](https://github.com/Rdatatable/data.table/issues/2222), [#2352](https://github.com/Rdatatable/data.table/issues/2352), [#2246](https://github.com/Rdatatable/data.table/issues/2246), [#2370](https://github.com/Rdatatable/data.table/issues/2370), [#2371](https://github.com/Rdatatable/data.table/issues/2371), [#2404](https://github.com/Rdatatable/data.table/issues/2404), 
[#2196](https://github.com/Rdatatable/data.table/issues/2196), [#2322](https://github.com/Rdatatable/data.table/issues/2322), [#2453](https://github.com/Rdatatable/data.table/issues/2453), [#2446](https://github.com/Rdatatable/data.table/issues/2446), [#2464](https://github.com/Rdatatable/data.table/issues/2464), [#2457](https://github.com/Rdatatable/data.table/issues/2457), [#1895](https://github.com/Rdatatable/data.table/issues/1895), [#2481](https://github.com/Rdatatable/data.table/pull/2481), [#2499](https://github.com/Rdatatable/data.table/issues/2499), [#2516](https://github.com/Rdatatable/data.table/issues/2516), [#2520](https://github.com/Rdatatable/data.table/issues/2520), [#2512](https://github.com/Rdatatable/data.table/issues/2512), [#2523](https://github.com/Rdatatable/data.table/issues/2523), [#2542](https://github.com/Rdatatable/data.table/issues/2542), [#2526](https://github.com/Rdatatable/data.table/issues/2526) 2. `fwrite()`: diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 0a6097c7e7..984415f2df 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -11541,10 +11541,10 @@ test(1871.9, fread(txt, skip=0, nrows=1), ans<-data.table(V1=2L, V2=3L, V3=4L)) test(1871.11, fread(txt, skip=0, nrows=1, header=TRUE), ans) test(1871.12, fread(txt, skip=0, nrows=1, header=FALSE), data.table(V1="V1", V2="V2", V3="V3")) test(1871.13, fread(txt, skip=0, nrows=2, header=FALSE), data.table(V1=c("V1","2"), V2=c("V2","3"), V3=c("V3","4"))) -# for ( i in 100:1) { -# lines <- paste0(paste(rep("1,2,3", i), collapse='\n'), "\n1,2") -# fread(lines, nrows=i) -# } +for (i in 100:1) { + lines <- paste(c(rep("2,3,4",i), "2,3"), collapse='\n') + test(1871.2 + i/1000, fread(lines, nrows=i), data.table(V1=rep.int(2L,i), V2=3L, V3=4L)) +} # miscellaneous missing tests uncovered by CodeCov difference # in the process of PR #2573 diff --git a/src/fread.c b/src/fread.c index 68f6b94efc..8acd946acc 100644 --- a/src/fread.c +++ b/src/fread.c @@ -1707,7 
+1707,7 @@ int freadMain(freadMainArgs _args) { meanLineLen=0.0; // Average length (in bytes) of a single line in the input file bytesRead=0; // Bytes in the data section (i.e. excluding column names, header and footer, if any) - if (sampleLines < jumpLines) { + if (sampleLines <= jumpLines) { if (verbose) DTPRINT(" All rows were sampled since file is small so we know nrow=%llu exactly\n", (llu)sampleLines); estnrow = allocnrow = sampleLines; } else { From e1feed34c31b3c084279fa715826913419690569 Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Mon, 12 Feb 2018 15:52:21 -0800 Subject: [PATCH 10/14] Tidy --- inst/tests/tests.Rraw | 3 +-- src/fread.c | 25 ++----------------------- 2 files changed, 3 insertions(+), 25 deletions(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 984415f2df..27c3a548b0 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -2887,9 +2887,9 @@ DT = data.table(a=c(NA,NA,FALSE,FALSE), b=c(1,1,2,2)) test(1009, DT[,list(mean(a), sum(a)),by=b], data.table(b=c(1,2),V1=c(NA,0),V2=c(NA_integer_,0L))) # sum(logical()) should be integer, not real # an fread error shouldn't hold a lock on the file on Windows +# TODO: now that these are warnings and not errors, we need another way to trigger a STOP() inside fread.c. options(warn=2) isn't enough. cat('A,B\n1,2\n3\n5,6\n', file=(f<-tempfile())) test(1010.1, fread(f), ans<-data.table(A=TRUE, B=2L), warning=(txt<-"Stopped early on line 3.*Expected 2 fields but found 1.*fill.*TRUE.*<<3>>")) -oldw = options(warn=2) # !!TODO!!: this doesn't seem sufficient in test framework to turn the warning into error. 
test(1010.2, fread(f), ans, warning=txt) cat('7\n8,9',file=f,append=TRUE) # that append works after error test(1010.3, fread(f,fill=TRUE), data.table(A=INT(1,3,5,7,8), B=INT(2,NA,6,NA,9))) @@ -2899,7 +2899,6 @@ test(1010.5, fread(f,fill=TRUE), data.table(A=INT(1,3,5), B=INT(2,NA,6))) test(1010.6, fread(f), ans, warning=txt) unlink(f) # that file can be removed after error test(1010.7, !file.exists(f)) -options(oldw) # detection of unescaped quotes, quote rule 3 test(1011, fread('A,B\n"aa",1\n"bb,2\n"cc",3\n'), data.table(A=c('aa', '"bb', 'cc'), B=1:3)) diff --git a/src/fread.c b/src/fread.c index 8acd946acc..13850f8799 100644 --- a/src/fread.c +++ b/src/fread.c @@ -1303,8 +1303,8 @@ int freadMain(freadMainArgs _args) { ch = pos; skipAuto = false; } - // Skip the first `skipNrow` lines of input, including 0 to force the first line to be the start else if (args.skipNrow >= 0) { + // Skip the first `skipNrow` lines of input, including 0 to force the first line to be the start while (ch=eof) STOP("skip=%llu but the input only has %llu line%s", (llu)args.skipNrow, (llu)row1line, row1line>1?"s":""); pos = ch; @@ -1488,7 +1488,6 @@ int freadMain(freadMainArgs _args) { //********************************************************************************************* int nJumps; // How many jumps to use when pre-scanning the file size_t sampleLines; // How many lines were sampled during the initial pre-scan - //const char *lastRowEnd; // Pointer to the end of the data section bool autoFirstColName = false; // true when there's one less column name and then it's assumed that the first column is row names or index size_t estnrow=1; size_t allocnrow=0; // Number of rows in the allocated DataTable @@ -1535,7 +1534,6 @@ int freadMain(freadMainArgs _args) { int minLen=INT32_MAX, maxLen=-1; // int_max so the first if(thisLen=eof) break; // The 9th jump could reach the end in the same situation and that's ok. As long as the end is sampled is what we want. 
if (jump>0 && !nextGoodLine(&ch, ncol)) { // skip this jump for sampling. Very unusual and in such unusual cases, we don't mind a slightly worse guess. - //lastSampleJumpOk = false; continue; } - //lastSampleJumpOk = true; bool bumped = false; // did this jump find any different types; to reduce verbose output to relevant lines - //bool skipThisJump = false; int jumpLine = 0; // line from this jump point start while(ch>", strlim(ch,200)); - } else { - // nextGoodLine() was false for the last (extra) jump to check the end - // must set lastRowEnd to eof accordingly otherwise it'll be left wherever the last good jump finished - lastRowEnd = eof; - } - } -*/ + ch = pos; if (args.header==NA_BOOL8) { for (int j=0; j>", strlim(skippedFooter,500)); - }*/ if (prevJumpEnd Date: Mon, 12 Feb 2018 16:59:42 -0800 Subject: [PATCH 11/14] Added test from #2518 --- NEWS.md | 2 +- inst/tests/tests.Rraw | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/NEWS.md b/NEWS.md index 9b7c05bac5..67b06119ab 100644 --- a/NEWS.md +++ b/NEWS.md @@ -28,7 +28,7 @@ * Single-column input with blank lines is now valid and the blank lines are significant (meaning an NA in the single column). The blank lines are significant even at the very end, which may be surprising on first glance. The change is so that `fread(fwrite(DT))==DT` for single-column inputs containing NA which are written as blank. There is no change when `ncol>1` (i.e., input stops with detailed warning at the first blank line) because a blank line when `ncol>1` is invalid input due to no separators present instead of `ncol-1` separators. * Too few column names are now auto filled with default column names, with warning, [#1625](https://github.com/Rdatatable/data.table/issues/1625). If there is just one missing column name it is guessed to be for the first column (row names or an index), otherwise the column names are filled at the end. Similarly, too many column names now automatically sets `fill=TRUE`, with warning. 
* `skip=` and `nrow=` are more reliable and no longer affected by invalid lines outside the range specified. Thanks to Ziyad Saeed and Kyle Chung for reporting, [#1267](https://github.com/Rdatatable/data.table/issues/1267). Tests added. - * Many thanks to @yaakovfeldman, Guillermo Ponce, Arun Srinivasan, Hugh Parsonage, Mark Klik, Pasha Stetsenko, Mahyar K, Tom Crockett, @cnoelke, @qinjs, @etienne-s, Mark Danese, Avraham Adler, @franknarf1, @MichaelChirico, @tdhock, Luke Tierney for testing before release to CRAN: [#2070](https://github.com/Rdatatable/data.table/issues/2070), [#2073](https://github.com/Rdatatable/data.table/issues/2073), [#2087](https://github.com/Rdatatable/data.table/issues/2087), [#2091](https://github.com/Rdatatable/data.table/issues/2091), [#2107](https://github.com/Rdatatable/data.table/issues/2107), [fst#50](https://github.com/fstpackage/fst/issues/50#issuecomment-294287846), [#2118](https://github.com/Rdatatable/data.table/issues/2118), [#2092](https://github.com/Rdatatable/data.table/issues/2092), [#1888](https://github.com/Rdatatable/data.table/issues/1888), [#2123](https://github.com/Rdatatable/data.table/issues/2123), [#2167](https://github.com/Rdatatable/data.table/issues/2167), [#2194](https://github.com/Rdatatable/data.table/issues/2194), [#2238](https://github.com/Rdatatable/data.table/issues/2238), [#2228](https://github.com/Rdatatable/data.table/issues/2228), [#1464](https://github.com/Rdatatable/data.table/issues/1464), [#2201](https://github.com/Rdatatable/data.table/issues/2201), [#2287](https://github.com/Rdatatable/data.table/issues/2287), [#2299](https://github.com/Rdatatable/data.table/issues/2299), [#2285](https://github.com/Rdatatable/data.table/issues/2285), [#2251](https://github.com/Rdatatable/data.table/issues/2251), [#2347](https://github.com/Rdatatable/data.table/issues/2347), [#2222](https://github.com/Rdatatable/data.table/issues/2222), [#2352](https://github.com/Rdatatable/data.table/issues/2352), 
[#2246](https://github.com/Rdatatable/data.table/issues/2246), [#2370](https://github.com/Rdatatable/data.table/issues/2370), [#2371](https://github.com/Rdatatable/data.table/issues/2371), [#2404](https://github.com/Rdatatable/data.table/issues/2404), [#2196](https://github.com/Rdatatable/data.table/issues/2196), [#2322](https://github.com/Rdatatable/data.table/issues/2322), [#2453](https://github.com/Rdatatable/data.table/issues/2453), [#2446](https://github.com/Rdatatable/data.table/issues/2446), [#2464](https://github.com/Rdatatable/data.table/issues/2464), [#2457](https://github.com/Rdatatable/data.table/issues/2457), [#1895](https://github.com/Rdatatable/data.table/issues/1895), [#2481](https://github.com/Rdatatable/data.table/pull/2481), [#2499](https://github.com/Rdatatable/data.table/issues/2499), [#2516](https://github.com/Rdatatable/data.table/issues/2516), [#2520](https://github.com/Rdatatable/data.table/issues/2520), [#2512](https://github.com/Rdatatable/data.table/issues/2512), [#2523](https://github.com/Rdatatable/data.table/issues/2523), [#2542](https://github.com/Rdatatable/data.table/issues/2542), [#2526](https://github.com/Rdatatable/data.table/issues/2526) + * Many thanks to @yaakovfeldman, Guillermo Ponce, Arun Srinivasan, Hugh Parsonage, Mark Klik, Pasha Stetsenko, Mahyar K, Tom Crockett, @cnoelke, @qinjs, @etienne-s, Mark Danese, Avraham Adler, @franknarf1, @MichaelChirico, @tdhock, Luke Tierney for testing before release to CRAN: [#2070](https://github.com/Rdatatable/data.table/issues/2070), [#2073](https://github.com/Rdatatable/data.table/issues/2073), [#2087](https://github.com/Rdatatable/data.table/issues/2087), [#2091](https://github.com/Rdatatable/data.table/issues/2091), [#2107](https://github.com/Rdatatable/data.table/issues/2107), [fst#50](https://github.com/fstpackage/fst/issues/50#issuecomment-294287846), [#2118](https://github.com/Rdatatable/data.table/issues/2118), [#2092](https://github.com/Rdatatable/data.table/issues/2092), 
[#1888](https://github.com/Rdatatable/data.table/issues/1888), [#2123](https://github.com/Rdatatable/data.table/issues/2123), [#2167](https://github.com/Rdatatable/data.table/issues/2167), [#2194](https://github.com/Rdatatable/data.table/issues/2194), [#2238](https://github.com/Rdatatable/data.table/issues/2238), [#2228](https://github.com/Rdatatable/data.table/issues/2228), [#1464](https://github.com/Rdatatable/data.table/issues/1464), [#2201](https://github.com/Rdatatable/data.table/issues/2201), [#2287](https://github.com/Rdatatable/data.table/issues/2287), [#2299](https://github.com/Rdatatable/data.table/issues/2299), [#2285](https://github.com/Rdatatable/data.table/issues/2285), [#2251](https://github.com/Rdatatable/data.table/issues/2251), [#2347](https://github.com/Rdatatable/data.table/issues/2347), [#2222](https://github.com/Rdatatable/data.table/issues/2222), [#2352](https://github.com/Rdatatable/data.table/issues/2352), [#2246](https://github.com/Rdatatable/data.table/issues/2246), [#2370](https://github.com/Rdatatable/data.table/issues/2370), [#2371](https://github.com/Rdatatable/data.table/issues/2371), [#2404](https://github.com/Rdatatable/data.table/issues/2404), [#2196](https://github.com/Rdatatable/data.table/issues/2196), [#2322](https://github.com/Rdatatable/data.table/issues/2322), [#2453](https://github.com/Rdatatable/data.table/issues/2453), [#2446](https://github.com/Rdatatable/data.table/issues/2446), [#2464](https://github.com/Rdatatable/data.table/issues/2464), [#2457](https://github.com/Rdatatable/data.table/issues/2457), [#1895](https://github.com/Rdatatable/data.table/issues/1895), [#2481](https://github.com/Rdatatable/data.table/pull/2481), [#2499](https://github.com/Rdatatable/data.table/issues/2499), [#2516](https://github.com/Rdatatable/data.table/issues/2516), [#2520](https://github.com/Rdatatable/data.table/issues/2520), [#2512](https://github.com/Rdatatable/data.table/issues/2512), 
[#2523](https://github.com/Rdatatable/data.table/issues/2523), [#2542](https://github.com/Rdatatable/data.table/issues/2542), [#2526](https://github.com/Rdatatable/data.table/issues/2526), [#2518](https://github.com/Rdatatable/data.table/issues/2518) 2. `fwrite()`: * empty strings are now always quoted (`,"",`) to distinguish them from `NA` which by default is still empty (`,,`) but can be changed using `na=` as before. If `na=` is provided and `quote=` is the default `'auto'` then `quote=` is set to `TRUE` so that if the `na=` value occurs in the data, it can be distinguished from `NA`. Thanks to Ethan Welty for the request [#2214](https://github.com/Rdatatable/data.table/issues/2214) and Pasha for the code change and tests, [#2215](https://github.com/Rdatatable/data.table/issues/2215). diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 27c3a548b0..3bccd5fed7 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -11540,6 +11540,7 @@ test(1871.9, fread(txt, skip=0, nrows=1), ans<-data.table(V1=2L, V2=3L, V3=4L)) test(1871.11, fread(txt, skip=0, nrows=1, header=TRUE), ans) test(1871.12, fread(txt, skip=0, nrows=1, header=FALSE), data.table(V1="V1", V2="V2", V3="V3")) test(1871.13, fread(txt, skip=0, nrows=2, header=FALSE), data.table(V1=c("V1","2"), V2=c("V2","3"), V3=c("V3","4"))) +test(1871.14, fread("A\n100\n200", verbose=TRUE), data.table(A=c(100L,200L)), output="All rows were sampled since file is small so we know nrow=2 exactly") for (i in 100:1) { lines <- paste(c(rep("2,3,4",i), "2,3"), collapse='\n') test(1871.2 + i/1000, fread(lines, nrows=i), data.table(V1=rep.int(2L,i), V2=3L, V3=4L)) From d3caa5f6332cf07025140d898c784b7b0f1cf086 Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Mon, 12 Feb 2018 17:16:15 -0800 Subject: [PATCH 12/14] Added test from #2515 --- NEWS.md | 2 +- inst/tests/test0.txt | 1 + inst/tests/tests.Rraw | 1 + 3 files changed, 3 insertions(+), 1 deletion(-) create mode 100644 inst/tests/test0.txt diff --git 
a/NEWS.md b/NEWS.md index 67b06119ab..f989f8a963 100644 --- a/NEWS.md +++ b/NEWS.md @@ -28,7 +28,7 @@ * Single-column input with blank lines is now valid and the blank lines are significant (meaning an NA in the single column). The blank lines are significant even at the very end, which may be surprising on first glance. The change is so that `fread(fwrite(DT))==DT` for single-column inputs containing NA which are written as blank. There is no change when `ncol>1` (i.e., input stops with detailed warning at the first blank line) because a blank line when `ncol>1` is invalid input due to no separators present instead of `ncol-1` separators. * Too few column names are now auto filled with default column names, with warning, [#1625](https://github.com/Rdatatable/data.table/issues/1625). If there is just one missing column name it is guessed to be for the first column (row names or an index), otherwise the column names are filled at the end. Similarly, too many column names now automatically sets `fill=TRUE`, with warning. * `skip=` and `nrow=` are more reliable and no longer affected by invalid lines outside the range specified. Thanks to Ziyad Saeed and Kyle Chung for reporting, [#1267](https://github.com/Rdatatable/data.table/issues/1267). Tests added. 
- * Many thanks to @yaakovfeldman, Guillermo Ponce, Arun Srinivasan, Hugh Parsonage, Mark Klik, Pasha Stetsenko, Mahyar K, Tom Crockett, @cnoelke, @qinjs, @etienne-s, Mark Danese, Avraham Adler, @franknarf1, @MichaelChirico, @tdhock, Luke Tierney for testing before release to CRAN: [#2070](https://github.com/Rdatatable/data.table/issues/2070), [#2073](https://github.com/Rdatatable/data.table/issues/2073), [#2087](https://github.com/Rdatatable/data.table/issues/2087), [#2091](https://github.com/Rdatatable/data.table/issues/2091), [#2107](https://github.com/Rdatatable/data.table/issues/2107), [fst#50](https://github.com/fstpackage/fst/issues/50#issuecomment-294287846), [#2118](https://github.com/Rdatatable/data.table/issues/2118), [#2092](https://github.com/Rdatatable/data.table/issues/2092), [#1888](https://github.com/Rdatatable/data.table/issues/1888), [#2123](https://github.com/Rdatatable/data.table/issues/2123), [#2167](https://github.com/Rdatatable/data.table/issues/2167), [#2194](https://github.com/Rdatatable/data.table/issues/2194), [#2238](https://github.com/Rdatatable/data.table/issues/2238), [#2228](https://github.com/Rdatatable/data.table/issues/2228), [#1464](https://github.com/Rdatatable/data.table/issues/1464), [#2201](https://github.com/Rdatatable/data.table/issues/2201), [#2287](https://github.com/Rdatatable/data.table/issues/2287), [#2299](https://github.com/Rdatatable/data.table/issues/2299), [#2285](https://github.com/Rdatatable/data.table/issues/2285), [#2251](https://github.com/Rdatatable/data.table/issues/2251), [#2347](https://github.com/Rdatatable/data.table/issues/2347), [#2222](https://github.com/Rdatatable/data.table/issues/2222), [#2352](https://github.com/Rdatatable/data.table/issues/2352), [#2246](https://github.com/Rdatatable/data.table/issues/2246), [#2370](https://github.com/Rdatatable/data.table/issues/2370), [#2371](https://github.com/Rdatatable/data.table/issues/2371), [#2404](https://github.com/Rdatatable/data.table/issues/2404), 
[#2196](https://github.com/Rdatatable/data.table/issues/2196), [#2322](https://github.com/Rdatatable/data.table/issues/2322), [#2453](https://github.com/Rdatatable/data.table/issues/2453), [#2446](https://github.com/Rdatatable/data.table/issues/2446), [#2464](https://github.com/Rdatatable/data.table/issues/2464), [#2457](https://github.com/Rdatatable/data.table/issues/2457), [#1895](https://github.com/Rdatatable/data.table/issues/1895), [#2481](https://github.com/Rdatatable/data.table/pull/2481), [#2499](https://github.com/Rdatatable/data.table/issues/2499), [#2516](https://github.com/Rdatatable/data.table/issues/2516), [#2520](https://github.com/Rdatatable/data.table/issues/2520), [#2512](https://github.com/Rdatatable/data.table/issues/2512), [#2523](https://github.com/Rdatatable/data.table/issues/2523), [#2542](https://github.com/Rdatatable/data.table/issues/2542), [#2526](https://github.com/Rdatatable/data.table/issues/2526), [#2518](https://github.com/Rdatatable/data.table/issues/2518) + * Many thanks to @yaakovfeldman, Guillermo Ponce, Arun Srinivasan, Hugh Parsonage, Mark Klik, Pasha Stetsenko, Mahyar K, Tom Crockett, @cnoelke, @qinjs, @etienne-s, Mark Danese, Avraham Adler, @franknarf1, @MichaelChirico, @tdhock, Luke Tierney for testing before release to CRAN: [#2070](https://github.com/Rdatatable/data.table/issues/2070), [#2073](https://github.com/Rdatatable/data.table/issues/2073), [#2087](https://github.com/Rdatatable/data.table/issues/2087), [#2091](https://github.com/Rdatatable/data.table/issues/2091), [#2107](https://github.com/Rdatatable/data.table/issues/2107), [fst#50](https://github.com/fstpackage/fst/issues/50#issuecomment-294287846), [#2118](https://github.com/Rdatatable/data.table/issues/2118), [#2092](https://github.com/Rdatatable/data.table/issues/2092), [#1888](https://github.com/Rdatatable/data.table/issues/1888), [#2123](https://github.com/Rdatatable/data.table/issues/2123), [#2167](https://github.com/Rdatatable/data.table/issues/2167), 
[#2194](https://github.com/Rdatatable/data.table/issues/2194), [#2238](https://github.com/Rdatatable/data.table/issues/2238), [#2228](https://github.com/Rdatatable/data.table/issues/2228), [#1464](https://github.com/Rdatatable/data.table/issues/1464), [#2201](https://github.com/Rdatatable/data.table/issues/2201), [#2287](https://github.com/Rdatatable/data.table/issues/2287), [#2299](https://github.com/Rdatatable/data.table/issues/2299), [#2285](https://github.com/Rdatatable/data.table/issues/2285), [#2251](https://github.com/Rdatatable/data.table/issues/2251), [#2347](https://github.com/Rdatatable/data.table/issues/2347), [#2222](https://github.com/Rdatatable/data.table/issues/2222), [#2352](https://github.com/Rdatatable/data.table/issues/2352), [#2246](https://github.com/Rdatatable/data.table/issues/2246), [#2370](https://github.com/Rdatatable/data.table/issues/2370), [#2371](https://github.com/Rdatatable/data.table/issues/2371), [#2404](https://github.com/Rdatatable/data.table/issues/2404), [#2196](https://github.com/Rdatatable/data.table/issues/2196), [#2322](https://github.com/Rdatatable/data.table/issues/2322), [#2453](https://github.com/Rdatatable/data.table/issues/2453), [#2446](https://github.com/Rdatatable/data.table/issues/2446), [#2464](https://github.com/Rdatatable/data.table/issues/2464), [#2457](https://github.com/Rdatatable/data.table/issues/2457), [#1895](https://github.com/Rdatatable/data.table/issues/1895), [#2481](https://github.com/Rdatatable/data.table/pull/2481), [#2499](https://github.com/Rdatatable/data.table/issues/2499), [#2516](https://github.com/Rdatatable/data.table/issues/2516), [#2520](https://github.com/Rdatatable/data.table/issues/2520), [#2512](https://github.com/Rdatatable/data.table/issues/2512), [#2523](https://github.com/Rdatatable/data.table/issues/2523), [#2542](https://github.com/Rdatatable/data.table/issues/2542), [#2526](https://github.com/Rdatatable/data.table/issues/2526), 
[#2518](https://github.com/Rdatatable/data.table/issues/2518), [#2515](https://github.com/Rdatatable/data.table/issues/2515) 2. `fwrite()`: * empty strings are now always quoted (`,"",`) to distinguish them from `NA` which by default is still empty (`,,`) but can be changed using `na=` as before. If `na=` is provided and `quote=` is the default `'auto'` then `quote=` is set to `TRUE` so that if the `na=` value occurs in the data, it can be distinguished from `NA`. Thanks to Ethan Welty for the request [#2214](https://github.com/Rdatatable/data.table/issues/2214) and Pasha for the code change and tests, [#2215](https://github.com/Rdatatable/data.table/issues/2215). diff --git a/inst/tests/test0.txt b/inst/tests/test0.txt new file mode 100644 index 0000000000..343641777b --- /dev/null +++ b/inst/tests/test0.txt @@ -0,0 +1 @@ +x0 656609 701231 733130 603634 355257 598656 368172 154195 328918 541999 378803 322161 321092 791071 150389 419669 180705 188717 274702 800259 100932 113270 509670 423688 412085 767369 790109 351661 959688 142249 593856 430035 881849 883353 932253 727230 319403 970870 769759 557740 283302 121615 609275 458244 53145 645010 919631 286721 63482 397957 360723 102529 81232 930175 666775 227586 87954 931600 314102 819515 474886 307681 555198 256257 935567 584995 887227 631934 434291 895515 795983 76195 307116 482805 486231 705261 785018 288274 628695 484178 34233 400298 489154 493941 744274 575616 704744 830367 289528 94032 101765 224100 791138 90153 11788 977400 421275 298837 805904 969139 70402 640508 927283 148022 424566 270942 923478 555518 248364 286337 974164 248739 32678 546732 24499 813317 981579 545526 955469 972214 940743 644527 622021 785924 23817 870966 940461 590656 675131 823011 205458 190691 291547 414287 713461 613051 126750 930891 779069 761445 638559 410482 250950 534637 191131 749656 870264 639621 966972 73639 580494 190103 331573 115388 480143 240431 302584 880387 396715 466719 202512 762275 30761 289286 324542 962755 74055 596781 
557808 858438 167748 551506 313974 398181 700113 91957 814308 560534 894768 234128 275400 154705 451191 82213 846542 140500 801347 105546 881607 320445 434178 272078 918623 789413 221578 336189 893454 288401 738358 196732 269610 906540 199487 248174 151170 920877 281825 11999 229927 142709 865073 821730 733870 927047 382099 501732 209365 111025 629165 40013 781585 625607 873297 390798 950249 150346 546167 696964 774745 924600 562044 546639 820513 875737 4613 268007 538964 54024 147940 887098 259041 432766 712760 361465 541462 707152 217728 656004 749406 164417 178438 725605 930454 552495 778309 172520 585342 -6993 135061 388732 169809 62254 -3252 351291 442562 108310 810094 383020 191313 831278 966205 628916 396364 161747 52626 389767 643418 459668 73641 837384 449943 962037 101682 156039 215569 541052 556353 715341 863959 420742 187277 76339 55960 792854 169862 605135 196136 507606 874500 40006 589341 921790 174259 835893 789012 58689 999570 183003 12804 33132 16762 700847 328861 951624 152948 147249 824065 464931 119416 679783 260417 256339 365195 474795 951240 388833 597366 422250 481521 240251 94093 470069 267104 876396 941440 5060 59123 973339 407419 697781 70640 697698 890818 301616 186050 529539 430533 590844 317713 770167 908137 184259 7662 811278 667116 369633 704257 463585 770546 567979 768563 179296 823750 66862 774164 655603 603775 709287 848672 125761 222582 791991 153699 551297 379311 306024 511210 214245 437379 321291 504286 981317 617686 413759 236307 759616 437084 597432 248273 332715 304108 634669 377569 223991 659156 98602 569501 817043 -3139 34273 343602 62985 978268 501113 676726 172614 734484 459624 536897 577632 993373 468857 118235 285947 273862 557753 77861 208124 53740 813437 196687 780632 456756 737824 73209 193248 814739 82946 620531 937509 534977 750713 792793 154008 719548 63369 50103 789700 705314 598162 957370 939796 482935 136924 769343 401354 960856 372173 72628 562835 261494 443922 252954 808805 244167 993007 357420 319382 716637 
131148 491938 10387 225293 215295 601463 15300 99585 955999 899293 692080 430896 939463 130638 249144 732186 706193 298076 625366 211715 466801 539754 260766 939737 175824 386850 526747 97302 345468 360414 313365 193665 933930 223183 566652 485281 221511 559204 134626 447527 616350 963250 67709 561389 223309 481791 184012 192084 495185 176337 225176 211027 620818 795659 626412 693440 854162 21056 295476 667959 583901 288218 374506 333942 580931 91268 436843 798558 951687 388189 71434 445894 977016 293562 868269 104872 665782 402427 704246 555812 112731 705347 320555 381071 513045 961812 424932 750972 723433 797141 963732 583124 534123 528064 755494 765100 945616 319267 816319 544736 939945 858844 724481 127355 89824 809157 323435 796710 982505 55151 281952 266858 103704 278013 326863 625396 286453 37590 691841 180121 324560 242657 942644 313051 18052 273293 899965 478179 806404 894543 627957 468200 179024 832540 984899 688120 916036 882768 314820 980983 287823 316173 627346 334888 845949 534463 248454 403857 751145 447842 322956 549981 200715 488329 769497 90061 860605 785771 943363 998468 496620 77270 696221 413011 166754 151521 116447 831564 163226 848260 884366 166701 589100 824493 512837 555067 808754 928003 692445 960730 697769 841303 971950 860491 983013 653710 530517 219003 644211 837924 566243 37342 545639 839650 475874 682796 32342 245297 91611 219337 357445 537100 587478 692140 906003 842543 19578 167882 982194 698333 611028 407855 224180 391638 784633 143842 116728 880111 374703 749623 73736 457614 209872 610555 255371 367870 403419 735033 562208 895083 911063 551045 991328 726404 730649 586811 464660 999369 961951 46272 612162 896926 273792 253320 625856 432386 220498 805981 218562 339073 373035 48807 265383 36535 346042 239965 683477 976158 172863 530536 527694 420081 702072 112393 453258 861480 993586 945009 968521 933886 785678 313589 333287 655906 532286 25058 985104 393032 886681 490590 364607 561365 560067 204463 319445 656924 576824 978768 780295 
39908 946103 143274 771073 342473 762406 41341 309350 693254 127688 462478 793635 402445 511759 393560 858302 127338 444183 776580 985153 204304 586211 497882 412152 107519 129459 909550 726244 808115 338429 887961 962179 846652 706925 599625 848476 174466 915803 971757 717132 861039 650894 45015 60327 63774 173479 846497 813495 815334 544183 831596 617676 428664 655030 63478 201239 927460 760012 629036 204727 932502 212201 684350 878241 578776 158199 447942 655891 308078 709745 900637 548270 578200 54213 832238 181724 900876 613349 606173 523540 820104 869225 574145 172267 904974 238439 970837 927617 933646 339006 810331 208790 28266 4996 298034 797752 638315 341766 298888 924946 711613 490629 606538 126793 610571 239295 289220 785128 248223 647844 861089 406627 599125 410243 312553 107253 697336 35212 228437 541347 551765 576224 627122 751956 664415 417495 -2368 955199 \ No newline at end of file diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 3bccd5fed7..408399de4d 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -11375,6 +11375,7 @@ test(1856.3, fread("A,B\n\n\n"), ans) test(1856.4, fread("A,B\n3,4\n\n\n"), data.table(A=3L, B=4L)) test(1856.5, fread("A,B\n3,4\n,\n\n\n"), data.table(A=c(3L,NA), B=c(4L,NA))) test(1856.6, fread("A,B\n3,4\n\n5,6\n"), data.table(A=3L, B=4L), warning="Discarded single-line footer: <<5,6>>") +test(1856.7, fread(testDir("test0.txt"))[c(1,997,998,999)], data.table(x0=c(656609L, NA, -2368L, 955199L))) # issue 2515 DTs = list( # passed fread(fwrite(DT))==DT before fix? 
data.table(A=logical(0)), # yes data.table(A=NA), # no From 4ac7e26ec7381e3970ac33c5fce4c45265256cc0 Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Mon, 12 Feb 2018 17:36:06 -0800 Subject: [PATCH 13/14] Added test from #1671 --- NEWS.md | 2 +- inst/tests/tests.Rraw | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/NEWS.md b/NEWS.md index f989f8a963..8c88319078 100644 --- a/NEWS.md +++ b/NEWS.md @@ -28,7 +28,7 @@ * Single-column input with blank lines is now valid and the blank lines are significant (meaning an NA in the single column). The blank lines are significant even at the very end, which may be surprising on first glance. The change is so that `fread(fwrite(DT))==DT` for single-column inputs containing NA which are written as blank. There is no change when `ncol>1` (i.e., input stops with detailed warning at the first blank line) because a blank line when `ncol>1` is invalid input due to no separators present instead of `ncol-1` separators. * Too few column names are now auto filled with default column names, with warning, [#1625](https://github.com/Rdatatable/data.table/issues/1625). If there is just one missing column name it is guessed to be for the first column (row names or an index), otherwise the column names are filled at the end. Similarly, too many column names now automatically sets `fill=TRUE`, with warning. * `skip=` and `nrow=` are more reliable and no longer affected by invalid lines outside the range specified. Thanks to Ziyad Saeed and Kyle Chung for reporting, [#1267](https://github.com/Rdatatable/data.table/issues/1267). Tests added. 
- * Many thanks to @yaakovfeldman, Guillermo Ponce, Arun Srinivasan, Hugh Parsonage, Mark Klik, Pasha Stetsenko, Mahyar K, Tom Crockett, @cnoelke, @qinjs, @etienne-s, Mark Danese, Avraham Adler, @franknarf1, @MichaelChirico, @tdhock, Luke Tierney for testing before release to CRAN: [#2070](https://github.com/Rdatatable/data.table/issues/2070), [#2073](https://github.com/Rdatatable/data.table/issues/2073), [#2087](https://github.com/Rdatatable/data.table/issues/2087), [#2091](https://github.com/Rdatatable/data.table/issues/2091), [#2107](https://github.com/Rdatatable/data.table/issues/2107), [fst#50](https://github.com/fstpackage/fst/issues/50#issuecomment-294287846), [#2118](https://github.com/Rdatatable/data.table/issues/2118), [#2092](https://github.com/Rdatatable/data.table/issues/2092), [#1888](https://github.com/Rdatatable/data.table/issues/1888), [#2123](https://github.com/Rdatatable/data.table/issues/2123), [#2167](https://github.com/Rdatatable/data.table/issues/2167), [#2194](https://github.com/Rdatatable/data.table/issues/2194), [#2238](https://github.com/Rdatatable/data.table/issues/2238), [#2228](https://github.com/Rdatatable/data.table/issues/2228), [#1464](https://github.com/Rdatatable/data.table/issues/1464), [#2201](https://github.com/Rdatatable/data.table/issues/2201), [#2287](https://github.com/Rdatatable/data.table/issues/2287), [#2299](https://github.com/Rdatatable/data.table/issues/2299), [#2285](https://github.com/Rdatatable/data.table/issues/2285), [#2251](https://github.com/Rdatatable/data.table/issues/2251), [#2347](https://github.com/Rdatatable/data.table/issues/2347), [#2222](https://github.com/Rdatatable/data.table/issues/2222), [#2352](https://github.com/Rdatatable/data.table/issues/2352), [#2246](https://github.com/Rdatatable/data.table/issues/2246), [#2370](https://github.com/Rdatatable/data.table/issues/2370), [#2371](https://github.com/Rdatatable/data.table/issues/2371), [#2404](https://github.com/Rdatatable/data.table/issues/2404), 
[#2196](https://github.com/Rdatatable/data.table/issues/2196), [#2322](https://github.com/Rdatatable/data.table/issues/2322), [#2453](https://github.com/Rdatatable/data.table/issues/2453), [#2446](https://github.com/Rdatatable/data.table/issues/2446), [#2464](https://github.com/Rdatatable/data.table/issues/2464), [#2457](https://github.com/Rdatatable/data.table/issues/2457), [#1895](https://github.com/Rdatatable/data.table/issues/1895), [#2481](https://github.com/Rdatatable/data.table/pull/2481), [#2499](https://github.com/Rdatatable/data.table/issues/2499), [#2516](https://github.com/Rdatatable/data.table/issues/2516), [#2520](https://github.com/Rdatatable/data.table/issues/2520), [#2512](https://github.com/Rdatatable/data.table/issues/2512), [#2523](https://github.com/Rdatatable/data.table/issues/2523), [#2542](https://github.com/Rdatatable/data.table/issues/2542), [#2526](https://github.com/Rdatatable/data.table/issues/2526), [#2518](https://github.com/Rdatatable/data.table/issues/2518), [#2515](https://github.com/Rdatatable/data.table/issues/2515) + * Many thanks to @yaakovfeldman, Guillermo Ponce, Arun Srinivasan, Hugh Parsonage, Mark Klik, Pasha Stetsenko, Mahyar K, Tom Crockett, @cnoelke, @qinjs, @etienne-s, Mark Danese, Avraham Adler, @franknarf1, @MichaelChirico, @tdhock, Luke Tierney for testing before release to CRAN: [#2070](https://github.com/Rdatatable/data.table/issues/2070), [#2073](https://github.com/Rdatatable/data.table/issues/2073), [#2087](https://github.com/Rdatatable/data.table/issues/2087), [#2091](https://github.com/Rdatatable/data.table/issues/2091), [#2107](https://github.com/Rdatatable/data.table/issues/2107), [fst#50](https://github.com/fstpackage/fst/issues/50#issuecomment-294287846), [#2118](https://github.com/Rdatatable/data.table/issues/2118), [#2092](https://github.com/Rdatatable/data.table/issues/2092), [#1888](https://github.com/Rdatatable/data.table/issues/1888), [#2123](https://github.com/Rdatatable/data.table/issues/2123), 
[#2167](https://github.com/Rdatatable/data.table/issues/2167), [#2194](https://github.com/Rdatatable/data.table/issues/2194), [#2238](https://github.com/Rdatatable/data.table/issues/2238), [#2228](https://github.com/Rdatatable/data.table/issues/2228), [#1464](https://github.com/Rdatatable/data.table/issues/1464), [#2201](https://github.com/Rdatatable/data.table/issues/2201), [#2287](https://github.com/Rdatatable/data.table/issues/2287), [#2299](https://github.com/Rdatatable/data.table/issues/2299), [#2285](https://github.com/Rdatatable/data.table/issues/2285), [#2251](https://github.com/Rdatatable/data.table/issues/2251), [#2347](https://github.com/Rdatatable/data.table/issues/2347), [#2222](https://github.com/Rdatatable/data.table/issues/2222), [#2352](https://github.com/Rdatatable/data.table/issues/2352), [#2246](https://github.com/Rdatatable/data.table/issues/2246), [#2370](https://github.com/Rdatatable/data.table/issues/2370), [#2371](https://github.com/Rdatatable/data.table/issues/2371), [#2404](https://github.com/Rdatatable/data.table/issues/2404), [#2196](https://github.com/Rdatatable/data.table/issues/2196), [#2322](https://github.com/Rdatatable/data.table/issues/2322), [#2453](https://github.com/Rdatatable/data.table/issues/2453), [#2446](https://github.com/Rdatatable/data.table/issues/2446), [#2464](https://github.com/Rdatatable/data.table/issues/2464), [#2457](https://github.com/Rdatatable/data.table/issues/2457), [#1895](https://github.com/Rdatatable/data.table/issues/1895), [#2481](https://github.com/Rdatatable/data.table/pull/2481), [#2499](https://github.com/Rdatatable/data.table/issues/2499), [#2516](https://github.com/Rdatatable/data.table/issues/2516), [#2520](https://github.com/Rdatatable/data.table/issues/2520), [#2512](https://github.com/Rdatatable/data.table/issues/2512), [#2523](https://github.com/Rdatatable/data.table/issues/2523), [#2542](https://github.com/Rdatatable/data.table/issues/2542), 
[#2526](https://github.com/Rdatatable/data.table/issues/2526), [#2518](https://github.com/Rdatatable/data.table/issues/2518), [#2515](https://github.com/Rdatatable/data.table/issues/2515), [#1671](https://github.com/Rdatatable/data.table/issues/1671) 2. `fwrite()`: * empty strings are now always quoted (`,"",`) to distinguish them from `NA` which by default is still empty (`,,`) but can be changed using `na=` as before. If `na=` is provided and `quote=` is the default `'auto'` then `quote=` is set to `TRUE` so that if the `na=` value occurs in the data, it can be distinguished from `NA`. Thanks to Ethan Welty for the request [#2214](https://github.com/Rdatatable/data.table/issues/2214) and Pasha for the code change and tests, [#2215](https://github.com/Rdatatable/data.table/issues/2215). diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 408399de4d..aec8c35a79 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -11542,6 +11542,7 @@ test(1871.11, fread(txt, skip=0, nrows=1, header=TRUE), ans) test(1871.12, fread(txt, skip=0, nrows=1, header=FALSE), data.table(V1="V1", V2="V2", V3="V3")) test(1871.13, fread(txt, skip=0, nrows=2, header=FALSE), data.table(V1=c("V1","2"), V2=c("V2","3"), V3=c("V3","4"))) test(1871.14, fread("A\n100\n200", verbose=TRUE), data.table(A=c(100L,200L)), output="All rows were sampled since file is small so we know nrow=2 exactly") +test(1871.15, fread("col1, col2, col3\n1, 2, 3\n3, 5, 6\n7, 8, 9\n\nsome text to ignore", nrows = 3L), data.table(col1=INT(1,3,7), col2=INT(2,5,8), col3=INT(3,6,9))) # from #1671 (no warning expected) for (i in 100:1) { lines <- paste(c(rep("2,3,4",i), "2,3"), collapse='\n') test(1871.2 + i/1000, fread(lines, nrows=i), data.table(V1=rep.int(2L,i), V2=3L, V3=4L)) From 7f48c74989ca0144a5002b1f478b265970229f49 Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Mon, 12 Feb 2018 18:09:11 -0800 Subject: [PATCH 14/14] Pencilled in test from #2267. Added 'nocov' in C code to see if that works. 
--- inst/tests/tests.Rraw | 4 ++++ src/fread.c | 10 +++++----- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index aec8c35a79..d98afeef0f 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -11613,6 +11613,10 @@ fwrite(DT,f<-tempfile()) test(1873, fread(f), DT) unlink(f) +# no good jump start, #2267 +# At 35MB, the bad_fill.csv file size exceeds CRAN limit. Need to reduce its size. +# test(1874.1, fread(testDir("bad_fill.csv")), error="No good line could be found from jump point") +# test(1874.2, fread(testDir("bad_fill.csv"), fill=TRUE), error="No good line could be found from jump point") ########################## diff --git a/src/fread.c b/src/fread.c index 13850f8799..dd19b37bd9 100644 --- a/src/fread.c +++ b/src/fread.c @@ -139,10 +139,10 @@ bool freadCleanup(void) // may call freadCleanup(), thus resulting in an infinite loop. #ifdef WIN32 if (!UnmapViewOfFile(mmp)) - DTPRINT("System error %d unmapping view of file\n", GetLastError()); + DTPRINT("System error %d unmapping view of file\n", GetLastError()); // nocov #else if (munmap(mmp, fileSize)) - DTPRINT("System errno %d unmapping file: %s\n", errno, strerror(errno)); + DTPRINT("System errno %d unmapping file: %s\n", errno, strerror(errno)); // nocov #endif mmp = NULL; } @@ -206,9 +206,9 @@ static char *typesAsString(int ncol) { if (ncol<=100) { for (; i