Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,8 @@

4. `fwrite()` could crash when writing very long strings such as 30 million characters, [#2974](https://github.com/Rdatatable/data.table/issues/2974), and could be unstable in memory constrained environments, [#2612](https://github.com/Rdatatable/data.table/issues/2612). Thanks to @logworthy and @zachokeeffe for reporting and Philippe Chataignon for fixing in PR [#3288](https://github.com/Rdatatable/data.table/pull/3288).

5. `fread()` could crash if `quote=""` (i.e. ignore quotes), the last line is too short, and `fill=TRUE`, [#3524](https://github.com/Rdatatable/data.table/pull/3524). Thanks to Jiucang Hao for the report and reproducible example.

#### NOTES

1. `rbindlist`'s `use.names="check"` now emits its message for automatic column names (`"V[0-9]+"`) too, [#3484](https://github.com/Rdatatable/data.table/pull/3484). See news item 5 of v1.12.2 below.
Expand Down
Binary file added inst/tests/noquote.csv.gz
Binary file not shown.
10 changes: 10 additions & 0 deletions inst/tests/tests.Rraw
Original file line number Diff line number Diff line change
Expand Up @@ -14072,6 +14072,16 @@ test(2027.1, DT[, list(1, ), by=a], error = 'Item 2 of the .() or list() passed
test(2027.2, DT[, list(1,2,)], error = 'Item 3 of the .() or list() passed to j is missing')
test(2027.3, DT[, .(1,,3,), by=a], error = 'Item 2 of the .() or list() passed to j is missing')

# fread quote="" when last line too short and filled with fill=TRUE (provided via email), crash in v1.12.2 and before
test(2028.1, fread(testDir("noquote.csv.gz"), fill=TRUE, quote="")[c(1,.N), c(1,2,9,10)],
data.table(H=c("D","T"), "Locate Reply"=c("BCS","Locate Reply"), V9=c("A",""), V10=c("4/23/2019 7:11:11 AM","")))
test(2028.2, fread(testDir("noquote.csv.gz"), fill=TRUE, quote="", header=FALSE)[c(1,.N), c(1,2,3,8,9,10)],
data.table(V1=c("H","T"), V2="Locate Reply", V3=c("GORLTR","2093"), V8=INT(NA,NA), V9="", V10=""))
txt = "A,B,C\n1,4,7\n2,5,8\n3,6\n"
test(2029.1, fread(txt), data.table(A=1:2, B=4:5, C=7:8), warning="Discarded single-line footer: <<3,6>>")
test(2029.2, fread(txt, quote=""), data.table(A=1:2, B=4:5, C=7:8), warning="Discarded single-line footer: <<3,6>>")
test(2029.3, fread(txt, quote="", fill=TRUE), data.table(A=1:3, B=4:6, C=c(7:8,NA)))


###################################
# Add new tests above this line #
Expand Down
29 changes: 16 additions & 13 deletions src/fread.c
Original file line number Diff line number Diff line change
Expand Up @@ -213,11 +213,11 @@ static char *typesAsString(int ncol) {
static char str[101];
int i=0;
if (ncol<=100) {
for (; i<ncol; i++) str[i] = typeLetter[type[i]];
for (; i<ncol; i++) str[i] = typeLetter[abs(type[i])]; // abs for out-of-sample type bumps (negative)
} else {
for (; i<80; i++) str[i] = typeLetter[type[i]];
for (; i<80; i++) str[i] = typeLetter[abs(type[i])];
str[i++]='.'; str[i++]='.'; str[i++]='.';
for (int j=ncol-10; j<ncol; j++) str[i++] = typeLetter[type[j]];
for (int j=ncol-10; j<ncol; j++) str[i++] = typeLetter[abs(type[j])];
}
str[i] = '\0';
return str;
Expand Down Expand Up @@ -495,7 +495,7 @@ static void Field(FieldParseContext *ctx)
if ((*ch==' ' && stripWhite) || (*ch=='\0' && ch<eof))
while(*++ch==' ' || (*ch=='\0' && ch<eof)); // if sep==' ' the space would have been skipped already and we wouldn't be on space now.
const char *fieldStart=ch;
if (*ch!=quote || quoteRule==3) {
if (*ch!=quote || quoteRule==3 || quote=='\0') {
// Most common case. Unambiguously not quoted. Simply search for sep|eol. If field contains sep|eol then it should have been quoted and we do not try to heal that.
while(!end_of_field(ch)) ch++; // sep, \r, \n or eof will end
*(ctx->ch) = ch;
Expand Down Expand Up @@ -562,7 +562,7 @@ static void Field(FieldParseContext *ctx)
}
target->len = (int32_t)(ch - fieldStart);
target->off = (int32_t)(fieldStart - ctx->anchor);
if (*ch==quote) {
if (*ch==quote) { // quote=='\0' (user set quote="") would have returned earlier above in the same branch as quoteRule 3
ch++;
skip_white(&ch);
*(ctx->ch) = ch;
Expand Down Expand Up @@ -798,7 +798,7 @@ static void parse_double_extended(FieldParseContext *ctx)
double *target = (double*) ctx->targets[sizeof(double)];
bool neg, quoted;
init();
ch += (quoted = (*ch=='"'));
ch += (quoted = (*ch==quote && quote));
ch += (neg = (*ch=='-')) + (*ch=='+');

if (ch[0]=='n' && ch[1]=='a' && ch[2]=='n' && (ch += 3)) goto return_nan;
Expand Down Expand Up @@ -843,7 +843,7 @@ static void parse_double_extended(FieldParseContext *ctx)
return_na:
*target = NA_FLOAT64;
ok:
if (quoted && *ch!='"') {
if (quoted && *ch!=quote) {
*target = NA_FLOAT64;
} else {
*(ctx->ch) = ch + quoted;
Expand Down Expand Up @@ -1048,7 +1048,7 @@ static int detect_types( const char **pch, int8_t type[], int ncol, bool *bumped
// thread at headPos which has full lineage to sof may bump the quoteRule.
break; // caller will detect this line hasn't finished properly
}
if (*ch==quote) {
if (*ch==quote && quote) { // && quote to exclude quote='\0' (user passed quote="")
ch++;
fun[tmpType[field]](&fctx);
if (*ch==quote) {
Expand Down Expand Up @@ -1169,6 +1169,8 @@ int freadMain(freadMainArgs _args) {
if (dec=='\0') STOP("dec='' not allowed. Should be '.' or ','");
if (args.sep == dec) STOP("sep == dec ('%c') is not allowed", dec);
if (quote == dec) STOP("quote == dec ('%c') is not allowed", dec);
// since quote=='\0' when user passed quote="", the logic in this file uses '*ch==quote && quote' otherwise
// the ending \0 at eof could be treated as a quote (test xxx)

// File parsing context: pointer to the start of file, and to the end of
// the file. The `sof` pointer may be shifted in order to skip over
Expand Down Expand Up @@ -1459,7 +1461,7 @@ int freadMain(freadMainArgs _args) {
int topSkip=0; // how many rows to auto-skip
const char *topStart=NULL;

for (quoteRule=0; quoteRule<4; quoteRule++) {
for (quoteRule=quote?0:3; quoteRule<4; quoteRule++) {
// quote rule in order of preference.
// when top is tied the first wins, so do all seps for the first quoteRule, then all seps for the second quoteRule, etc
for (int s=0; s<nseps; s++) {
Expand All @@ -1469,7 +1471,7 @@ int freadMain(freadMainArgs _args) {
// if (verbose) DTPRINT(" Trying sep='%c' with quoteRule %d ...\n", sep, quoteRule);

if (fill) {
if (quoteRule>1) continue; // turn off self-healing quote rules when filling
if (quoteRule>1 && quote) continue; // turn off self-healing quote rule when filling
int firstRowNcol = countfields(&ch);
int thisncol=0, maxncol=firstRowNcol, thisRow=0;
while (ch<eof && ++thisRow<jumpLines) { // TODO: rename 'jumpLines' to 'jumpRows'
Expand Down Expand Up @@ -1562,7 +1564,7 @@ int freadMain(freadMainArgs _args) {
}

quoteRule = topQuoteRule;
if (quoteRule>1) {
if (quoteRule>1 && quote) {
DTWARN("Found and resolved improper quoting in first %d rows. If the fields are not quoted (e.g. field separator does not appear within any field), try quote=\"\" to avoid this warning.", jumpLines);
// TODO: include line number and text in warning. Could loop again with the standard quote rule to find the line that fails.
}
Expand Down Expand Up @@ -2172,10 +2174,10 @@ int freadMain(freadMainArgs _args) {
tch = end_NA_string(tch);
skip_white(&tch);
if (!end_of_field(tch)) tch = afterSpace; // else it is the field_end, we're on closing sep|eol and we'll let processor write appropriate NA as if field was empty
if (*tch==quote) { quoted=true; tch++; }
if (*tch==quote && quote) { quoted=true; tch++; }
} // else Field() handles NA inside it unlike other processors e.g. ,, is interpretted as "" or NA depending on option read inside Field()
fun[abs(thisType)](&fctx);
if (quoted) {
if (quoted) { // quoted was only set to true with '&& quote' above (=> quote!='\0' now)
if (*tch==quote) tch++;
else goto typebump;
}
Expand Down Expand Up @@ -2362,6 +2364,7 @@ int freadMain(freadMainArgs _args) {
for (int i=0; i<ncol; i++) typeCounts[ abs(type[i]) ]++;

if (nTypeBump) {
if (verbose) DTPRINT(" %d out-of-sample type bumps: %s\n", nTypeBump, typesAsString(ncol));
rowSize1 = rowSize4 = rowSize8 = 0;
nStringCols = 0;
nNonStringCols = 0;
Expand Down