diff --git a/NEWS.md b/NEWS.md index b905811c7c..67764c472a 100644 --- a/NEWS.md +++ b/NEWS.md @@ -56,6 +56,8 @@ 4. `fwrite()` could crash when writing very long strings such as 30 million characters, [#2974](https://github.com/Rdatatable/data.table/issues/2974), and could be unstable in memory constrained environments, [#2612](https://github.com/Rdatatable/data.table/issues/2612). Thanks to @logworthy and @zachokeeffe for reporting and Philippe Chataignon for fixing in PR [#3288](https://github.com/Rdatatable/data.table/pull/3288). +5. `fread()` could crash if `quote=""` (i.e. ignore quotes), the last line is too short, and `fill=TRUE`, [#3524](https://github.com/Rdatatable/data.table/pull/3524). Thanks to Jiucang Hao for the report and reproducible example. + #### NOTES 1. `rbindlist`'s `use.names="check"` now emits its message for automatic column names (`"V[0-9]+"`) too, [#3484](https://github.com/Rdatatable/data.table/pull/3484). See news item 5 of v1.12.2 below. diff --git a/inst/tests/noquote.csv.gz b/inst/tests/noquote.csv.gz new file mode 100644 index 0000000000..f65f56dbae Binary files /dev/null and b/inst/tests/noquote.csv.gz differ diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 8f8f3e71ec..12f354eb05 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -14072,6 +14072,16 @@ test(2027.1, DT[, list(1, ), by=a], error = 'Item 2 of the .() or list() passed test(2027.2, DT[, list(1,2,)], error = 'Item 3 of the .() or list() passed to j is missing') test(2027.3, DT[, .(1,,3,), by=a], error = 'Item 2 of the .() or list() passed to j is missing') +# fread quote="" when last line too short and filled with fill=TRUE (provided via email), crash in v1.12.2 and before +test(2028.1, fread(testDir("noquote.csv.gz"), fill=TRUE, quote="")[c(1,.N), c(1,2,9,10)], + data.table(H=c("D","T"), "Locate Reply"=c("BCS","Locate Reply"), V9=c("A",""), V10=c("4/23/2019 7:11:11 AM",""))) +test(2028.2, fread(testDir("noquote.csv.gz"), fill=TRUE, quote="", header=FALSE)[c(1,.N), c(1,2,3,8,9,10)], + data.table(V1=c("H","T"), V2="Locate Reply", V3=c("GORLTR","2093"), V8=INT(NA,NA), V9="", V10="")) +txt = "A,B,C\n1,4,7\n2,5,8\n3,6\n" +test(2029.1, fread(txt), data.table(A=1:2, B=4:5, C=7:8), warning="Discarded single-line footer: <<3,6>>") +test(2029.2, fread(txt, quote=""), data.table(A=1:2, B=4:5, C=7:8), warning="Discarded single-line footer: <<3,6>>") +test(2029.3, fread(txt, quote="", fill=TRUE), data.table(A=1:3, B=4:6, C=c(7:8,NA))) + ################################### # Add new tests above this line # diff --git a/src/fread.c b/src/fread.c index bcbda32a59..970562393c 100644 --- a/src/fread.c +++ b/src/fread.c @@ -213,11 +213,11 @@ static char *typesAsString(int ncol) { static char str[101]; int i=0; if (ncol<=100) { - for (; ich) = ch; @@ -562,7 +562,7 @@ static void Field(FieldParseContext *ctx) } target->len = (int32_t)(ch - fieldStart); target->off = (int32_t)(fieldStart - ctx->anchor); - if (*ch==quote) { + if (*ch==quote) { // quote=='\0' (user set quote="") would have returned earlier above in the same branch as quoteRule 3 ch++; skip_white(&ch); *(ctx->ch) = ch; @@ -798,7 +798,7 @@ static void parse_double_extended(FieldParseContext *ctx) double *target = (double*) ctx->targets[sizeof(double)]; bool neg, quoted; init(); - ch += (quoted = (*ch=='"')); + ch += (quoted = (*ch==quote && quote)); ch += (neg = (*ch=='-')) + (*ch=='+'); if (ch[0]=='n' && ch[1]=='a' && ch[2]=='n' && (ch += 3)) goto return_nan; @@ -843,7 +843,7 @@ static void parse_double_extended(FieldParseContext *ctx) return_na: *target = NA_FLOAT64; ok: - if (quoted && *ch!='"') { + if (quoted && *ch!=quote) { *target = NA_FLOAT64; } else { *(ctx->ch) = ch + quoted; @@ -1048,7 +1048,7 @@ static int detect_types( const char **pch, int8_t type[], int ncol, bool *bumped // thread at headPos which has full lineage to sof may bump the quoteRule. break; // caller will detect this line hasn't finished properly } - if (*ch==quote) { + if (*ch==quote && quote) { // && quote to exclude quote='\0' (user passed quote="") ch++; fun[tmpType[field]](&fctx); if (*ch==quote) { @@ -1169,6 +1169,8 @@ int freadMain(freadMainArgs _args) { if (dec=='\0') STOP("dec='' not allowed. Should be '.' or ','"); if (args.sep == dec) STOP("sep == dec ('%c') is not allowed", dec); if (quote == dec) STOP("quote == dec ('%c') is not allowed", dec); + // since quote=='\0' when user passed quote="", the logic in this file uses '*ch==quote && quote' otherwise + // the ending \0 at eof could be treated as a quote (test xxx) // File parsing context: pointer to the start of file, and to the end of // the file. The `sof` pointer may be shifted in order to skip over @@ -1459,7 +1461,7 @@ int freadMain(freadMainArgs _args) { int topSkip=0; // how many rows to auto-skip const char *topStart=NULL; - for (quoteRule=0; quoteRule<4; quoteRule++) { + for (quoteRule=quote?0:3; quoteRule<4; quoteRule++) { // quote rule in order of preference. // when top is tied the first wins, so do all seps for the first quoteRule, then all seps for the second quoteRule, etc for (int s=0; s1) continue; // turn off self-healing quote rules when filling + if (quoteRule>1 && quote) continue; // turn off self-healing quote rule when filling int firstRowNcol = countfields(&ch); int thisncol=0, maxncol=firstRowNcol, thisRow=0; while (ch1) { + if (quoteRule>1 && quote) { DTWARN("Found and resolved improper quoting in first %d rows. If the fields are not quoted (e.g. field separator does not appear within any field), try quote=\"\" to avoid this warning.", jumpLines); // TODO: include line number and text in warning. Could loop again with the standard quote rule to find the line that fails. } @@ -2172,10 +2174,10 @@ int freadMain(freadMainArgs _args) { tch = end_NA_string(tch); skip_white(&tch); if (!end_of_field(tch)) tch = afterSpace; // else it is the field_end, we're on closing sep|eol and we'll let processor write appropriate NA as if field was empty - if (*tch==quote) { quoted=true; tch++; } + if (*tch==quote && quote) { quoted=true; tch++; } } // else Field() handles NA inside it unlike other processors e.g. ,, is interpretted as "" or NA depending on option read inside Field() fun[abs(thisType)](&fctx); - if (quoted) { + if (quoted) { // quoted was only set to true with '&& quote' above (=> quote!='\0' now) if (*tch==quote) tch++; else goto typebump; } @@ -2362,6 +2364,7 @@ int freadMain(freadMainArgs _args) { for (int i=0; i