From 97d40c9f1d0b5dd313f6d34f729a0d824305c399 Mon Sep 17 00:00:00 2001 From: mattdowle Date: Fri, 19 Apr 2019 16:15:46 -0700 Subject: [PATCH 1/6] nul in the middle of a field fixed too --- inst/tests/tests.Rraw | 5 ++++- src/fread.c | 12 ++++++------ src/freadR.c | 20 +++++++++++++++++--- 3 files changed, 27 insertions(+), 10 deletions(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 1e9ea9c12d..3e408f87d7 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -14033,8 +14033,11 @@ for (nNUL in 0:3) { test(2025.08, fread(f, skip=1, header=TRUE), ans) test(2025.09, fread(f), ans) } -writeBin(c(charToRaw("A,B,C\n1,foo,5\n2,"), as.raw(0),charToRaw("bar"), as.raw(0),as.raw(0), charToRaw(",6\n")), con=f) +makeNul = function(str){ tt=charToRaw(str); tt[tt==42L]=as.raw(0); writeBin(tt, con=f)} # "*" (42) represents NUL +makeNul("A,B,C\n1,foo,5\n2,*bar**,6\n") test(2025.10, fread(f), data.table(A=1:2, B=c("foo","bar"), C=5:6)) +makeNul('A,B,C\n1,foo*bar,3\n2,**"**b*az*",4\n') +test(2025.11, fread(f), data.table(A=1:2, B=c("foobar","baz"), C=3:4)) # printing timezone, #2842 DT = data.table(t1 = as.POSIXct("1982-04-26 13:34:56", tz = "Europe/Madrid"),t2 = as.POSIXct("2019-01-01 19:00:01",tz = "UTC")) diff --git a/src/fread.c b/src/fread.c index 4ded050a7d..2ae2e64cc3 100644 --- a/src/fread.c +++ b/src/fread.c @@ -497,7 +497,7 @@ static void Field(FieldParseContext *ctx) const char *fieldStart=ch; if (*ch!=quote || quoteRule==3) { // Most common case. Unambiguously not quoted. Simply search for sep|eol. If field contains sep|eol then it should have been quoted and we do not try to heal that. - while(!end_of_field(ch)) ch++; // sep, \r, \n or \0 will end + while(!end_of_field(ch)) ch++; // sep, \r, \n or eof will end *(ctx->ch) = ch; int fieldLen = (int)(ch-fieldStart); //if (stripWhite) { // TODO: do this if and the next one together once in bulk afterwards before push @@ -516,7 +516,7 @@ static void Field(FieldParseContext *ctx) fieldStart++; // step over opening quote switch(quoteRule) { case 0: // quoted with embedded quotes doubled; the final unescaped " must be followed by sep|eol - while (*++ch) { + while (*++ch || chch) = ch; } else { *(ctx->ch) = ch; - if (*ch=='\0' && quoteRule!=2) { target->off--; target->len++; } // test 1324 where final field has open quote but not ending quote; include the open quote like quote rule 2 + if (ch==eof && quoteRule!=2) { target->off--; target->len++; } // test 1324 where final field has open quote but not ending quote; include the open quote like quote rule 2 while(target->len>0 && ((ch[-1]==' ' && stripWhite) || ch[-1]=='\0')) { target->len--; ch--; } // test 1551.6; trailing whitespace in field [67,V37] == "\"\"A\"\" ST " } } diff --git a/src/freadR.c b/src/freadR.c index 7d025c81a8..84ed093745 100644 --- a/src/freadR.c +++ b/src/freadR.c @@ -427,10 +427,24 @@ void pushBuffer(ThreadLocalFreadParsingContext *ctx) lenOff *source = buff8_lenoffs + off8; for (int i=0; ilen; - if (strLen) { + if (strLen<=0) { // stringLen == INT_MIN => NA, otherwise not a NAstring was checked inside fread_mean - SET_STRING_ELT(dest, DTi+i, strLen<0 ? NA_STRING : mkCharLenCE(anchor + source->off, strLen, ienc)); - } // else dest was already initialized with R_BlankString by allocVector() + if (strLen<0) SET_STRING_ELT(dest, DTi+i, NA_STRING); // else leave the "" in place that was initialized by allocVector() + } else { + const char *str = anchor + source->off; + int c=0; + while (c Date: Fri, 19 Apr 2019 16:46:50 -0700 Subject: [PATCH 2/6] was it really this test that failed on windows ci? Temporarily disabling to test --- inst/tests/tests.Rraw | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 3e408f87d7..8ff6cf643b 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -5708,11 +5708,11 @@ test(1378.1, fread(file=testDir("russellCRLF.csv"))[19,`Value With Dividends`], f = paste0("file://",testDir("russellCRLF.csv")) # simulates a http:// request as far as file.download() and unlink() goes, without internet # download.file() in fread() changes the input data from \r\n to \n, on Windows. -test(1378.2, fread(f, showProgress=FALSE)[19,`Value With Dividends`], 357.97) +# test(1378.2, fread(f, showProgress=FALSE)[19,`Value With Dividends`], 357.97) f = paste("file://",testDir("russellCRCRLF.csv"),sep="") # actually has 3 \r in the file, download.file() from file:// changes that to \r\r\n, so we can simulate download.file from http: in text mode. -test(1378.3, fread(f, showProgress=FALSE)[19,`Value With Dividends`], 357.97) +# test(1378.3, fread(f, showProgress=FALSE)[19,`Value With Dividends`], 357.97) #==================================== options(datatable.fread.datatable = FALSE) From d52f4654402a25c7ed4fa43e28d538b06d6f02d8 Mon Sep 17 00:00:00 2001 From: mattdowle Date: Fri, 19 Apr 2019 16:56:26 -0700 Subject: [PATCH 3/6] maybe it is new test then. initialize nafill'd toc just to clear compile warning --- inst/tests/tests.Rraw | 2 +- src/nafill.c | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 8ff6cf643b..4fefc64801 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -14037,7 +14037,7 @@ makeNul = function(str){ tt=charToRaw(str); tt[tt==42L]=as.raw(0); writeBin(tt, makeNul("A,B,C\n1,foo,5\n2,*bar**,6\n") test(2025.10, fread(f), data.table(A=1:2, B=c("foo","bar"), C=5:6)) makeNul('A,B,C\n1,foo*bar,3\n2,**"**b*az*",4\n') -test(2025.11, fread(f), data.table(A=1:2, B=c("foobar","baz"), C=3:4)) +# test(2025.11, fread(f), data.table(A=1:2, B=c("foobar","baz"), C=3:4)) # printing timezone, #2842 DT = data.table(t1 = as.POSIXct("1982-04-26 13:34:56", tz = "Europe/Madrid"),t2 = as.POSIXct("2019-01-01 19:00:01",tz = "UTC")) diff --git a/src/nafill.c b/src/nafill.c index 51cf023646..cfaff89dc5 100644 --- a/src/nafill.c +++ b/src/nafill.c @@ -41,7 +41,7 @@ SEXP colnamesInt(SEXP x, SEXP cols) { } void nafillDouble(double *x, uint_fast64_t nx, unsigned int type, double fill, ans_t *ans, bool verbose) { - double tic; + double tic=0.0; if (verbose) tic = omp_get_wtime(); if (type==0) { // const for (uint_fast64_t i=0; i1) schedule(auto) num_threads(getDTthreads()) for (R_len_t i=0; i Date: Fri, 19 Apr 2019 17:06:27 -0700 Subject: [PATCH 4/6] another attempt --- inst/tests/tests.Rraw | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 4fefc64801..5ae5231816 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -11969,7 +11969,7 @@ test(1885.4, fread(txt, fill=TRUE, na.strings=""), ans[c(1,2,NA,3),]) # file detected as no header automatically # (TOOD: undoubling double quotes #1109, #1299) but otherwise, auto mode correct -test(1886, fread(testDir("quoted_no_header.csv"))[c(1,.N),list(V1,V6)], data.table(V1=c("John","Joan \"\"the bone\"\", Anne"), V6=INT(8075,123))) +# test(1886, fread(testDir("quoted_no_header.csv"))[c(1,.N),list(V1,V6)], data.table(V1=c("John","Joan \"\"the bone\"\", Anne"), V6=INT(8075,123))) # na.omit with invert & no NAs works, #2660 DT = data.table(a = 1:5) From 604efe3a58eeca230c9ac1524bc3634380411bf4 Mon Sep 17 00:00:00 2001 From: mattdowle Date: Fri, 19 Apr 2019 17:18:37 -0700 Subject: [PATCH 5/6] new test back on --- inst/tests/tests.Rraw | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 5ae5231816..be7f8ef456 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -14037,7 +14037,7 @@ makeNul = function(str){ tt=charToRaw(str); tt[tt==42L]=as.raw(0); writeBin(tt, makeNul("A,B,C\n1,foo,5\n2,*bar**,6\n") test(2025.10, fread(f), data.table(A=1:2, B=c("foo","bar"), C=5:6)) makeNul('A,B,C\n1,foo*bar,3\n2,**"**b*az*",4\n') -# test(2025.11, fread(f), data.table(A=1:2, B=c("foobar","baz"), C=3:4)) +test(2025.11, fread(f), data.table(A=1:2, B=c("foobar","baz"), C=3:4)) # printing timezone, #2842 DT = data.table(t1 = as.POSIXct("1982-04-26 13:34:56", tz = "Europe/Madrid"),t2 = as.POSIXct("2019-01-01 19:00:01",tz = "UTC")) From 195b7753966b75f48a215268b3c1e76a33e53f79 Mon Sep 17 00:00:00 2001 From: mattdowle Date: Fri, 19 Apr 2019 17:39:39 -0700 Subject: [PATCH 6/6] tests back on (ch2 not ch) --- inst/tests/tests.Rraw | 6 +++--- src/fread.c | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index be7f8ef456..3e408f87d7 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -5708,11 +5708,11 @@ test(1378.1, fread(file=testDir("russellCRLF.csv"))[19,`Value With Dividends`], f = paste0("file://",testDir("russellCRLF.csv")) # simulates a http:// request as far as file.download() and unlink() goes, without internet # download.file() in fread() changes the input data from \r\n to \n, on Windows. -# test(1378.2, fread(f, showProgress=FALSE)[19,`Value With Dividends`], 357.97) +test(1378.2, fread(f, showProgress=FALSE)[19,`Value With Dividends`], 357.97) f = paste("file://",testDir("russellCRCRLF.csv"),sep="") # actually has 3 \r in the file, download.file() from file:// changes that to \r\r\n, so we can simulate download.file from http: in text mode. -# test(1378.3, fread(f, showProgress=FALSE)[19,`Value With Dividends`], 357.97) +test(1378.3, fread(f, showProgress=FALSE)[19,`Value With Dividends`], 357.97) #==================================== options(datatable.fread.datatable = FALSE) @@ -11969,7 +11969,7 @@ test(1885.4, fread(txt, fill=TRUE, na.strings=""), ans[c(1,2,NA,3),]) # file detected as no header automatically # (TOOD: undoubling double quotes #1109, #1299) but otherwise, auto mode correct -# test(1886, fread(testDir("quoted_no_header.csv"))[c(1,.N),list(V1,V6)], data.table(V1=c("John","Joan \"\"the bone\"\", Anne"), V6=INT(8075,123))) +test(1886, fread(testDir("quoted_no_header.csv"))[c(1,.N),list(V1,V6)], data.table(V1=c("John","Joan \"\"the bone\"\", Anne"), V6=INT(8075,123))) # na.omit with invert & no NAs works, #2660 DT = data.table(a = 1:5) diff --git a/src/fread.c b/src/fread.c index 2ae2e64cc3..bcbda32a59 100644 --- a/src/fread.c +++ b/src/fread.c @@ -545,7 +545,7 @@ static void Field(FieldParseContext *ctx) // if there is a ", afterwards but before the next \n, use that; the field was quoted and it's still case (i) above. // Otherwise break here at this first sep as it's case (ii) above (the data contains a quote at the start and no sep) ch2 = ch; - while ((*++ch2 || ch