From 2fb8aa2987d9ffb2cba26f999bb007994fae6e10 Mon Sep 17 00:00:00 2001 From: mattdowle Date: Fri, 12 Apr 2019 16:49:16 -0700 Subject: [PATCH 1/3] news item and added Roy to ctb list --- DESCRIPTION | 3 ++- NEWS.md | 5 ++++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 29f6b5f1d0..41be910570 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -38,7 +38,8 @@ Authors@R: c( person("Martin","Morgan", role="ctb"), person("Michael","Quinn", role="ctb"), person("@javrucebo","", role="ctb"), - person("@marc-outins","", role="ctb")) + person("@marc-outins","", role="ctb"), + person("Roy","Storey", role="ctb")) Depends: R (>= 3.1.0) Imports: methods Suggests: bit64, curl, R.utils, knitr, xts, nanotime, zoo diff --git a/NEWS.md b/NEWS.md index 11ed12ef94..fb088bdbeb 100644 --- a/NEWS.md +++ b/NEWS.md @@ -6,7 +6,10 @@ 1. New option `options(datatable.quiet = TRUE)` turns off the package startup message, [#3489](https://github.com/Rdatatable/data.table/issues/3489). `suppressPackageStartupMessages()` continues to work too. Thanks to @leobarlach for the suggestion inspired by `options(tidyverse.quiet = TRUE)`. We don't know of a way to make a package respect the `quietly=` option of `library()` and `require()` because the `quietly=` isn't passed through for use by the package's own `.onAttach`. If you can see how to do that, please submit a patch to R. -2. `rleid` functions now support long vectors (length > 2 billion). +2. `rleid()` functions now support long vectors (length > 2 billion). + +3. `fread()`: + * skips embeded NUL characters with warning, [#3400](https://github.com/Rdatatable/data.table/issues/3400) #### BUG FIXES From 05589d14ea1cd0e757296d45f0b29d0ca9d34fa6 Mon Sep 17 00:00:00 2001 From: mattdowle Date: Mon, 15 Apr 2019 14:11:05 -0700 Subject: [PATCH 2/3] interim; passing all tests inc extra tests from #3400 --- inst/tests/tests.Rraw | 18 +++++++++++++-- src/fread.c | 54 +++++++++++++++++++++---------------------- 2 files changed, 43 insertions(+), 29 deletions(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 2d0e186ec4..5365de26d4 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -14018,8 +14018,22 @@ setindex(DT, y) test(2024, DT[y==6, v:=10L, verbose=TRUE], output=c("Constructing irows for.*", "Reorder irows for.*")) # fread embedded NULL, #3400 -test(2025, fread(testDir("issue_3400_fread.txt"), skip=1, header=TRUE), data.table(A=INT(1,3), B=INT(2,2), C=INT(3,1))) - +test(2025.1, fread(testDir("issue_3400_fread.txt"), skip=1, header=TRUE), data.table(A=INT(1,3,4), B=INT(2,2,5), C=INT(3,1,6))) +f = tempfile() +for (nNUL in 0:3) { + writeBin(c(charToRaw("a=b\nA B C\n1 3 5\n"), rep(as.raw(0), nNUL), charToRaw("2 4 6\n")), con=f) + test(2025.2, fread(f, skip=1, header=TRUE), ans<-data.table(A=1:2, B=3:4, C=5:6)) + test(2025.3, fread(f), ans) # auto detect skip and header works too + writeBin(c(charToRaw("a=b\nA,B,C\n1,3,5\n"), rep(as.raw(0), nNUL), charToRaw("2,4,6\n")), con=f) + test(2025.4, fread(f, skip=1, header=TRUE), ans) + test(2025.5, fread(f), ans) + writeBin(c(charToRaw("a=b\n"), rep(as.raw(0), nNUL), charToRaw("A B C\n1 3 5\n2 4 6\n")), con=f) + test(2025.6, fread(f, skip=1, header=TRUE), ans) + test(2025.7, fread(f), ans) + writeBin(c(charToRaw("a=b\n"), rep(as.raw(0), nNUL), charToRaw("A,B,C\n1,3,5\n2,4,6\n")), con=f) + test(2025.8, fread(f, skip=1, header=TRUE), ans) + test(2025.9, fread(f), ans) +} ################################### # Add new tests above this line # diff --git a/src/fread.c b/src/fread.c index 3e05f97583..4ded050a7d 100644 --- a/src/fread.c +++ b/src/fread.c @@ -226,11 +226,12 @@ static char *typesAsString(int ncol) { static inline void skip_white(const char **pch) { // skip space so long as sep isn't space and skip tab so long as sep isn't tab + // always skip any \0 (NUL) that occur before end of file, #3400 const char *ch = *pch; - if (whiteChar == 0) { // whiteChar==0 means skip both ' ' and '\t'; sep is neither ' ' nor '\t'. - while (*ch == ' ' || *ch == '\t') ch++; + if (whiteChar==0) { // whiteChar==0 means skip both ' ' and '\t'; sep is neither ' ' nor '\t'. + while (*ch==' ' || *ch=='\t' || (*ch=='\0' && chch) = ch; int fieldLen = (int)(ch-fieldStart); - if (stripWhite) { // TODO: do this if and the next one together once in bulk afterwards before push - while(fieldLen>0 && ch[-1]==' ') { fieldLen--; ch--; } + //if (stripWhite) { // TODO: do this if and the next one together once in bulk afterwards before push + while(fieldLen>0 && ((ch[-1]==' ' && stripWhite) || ch[-1]=='\0')) { fieldLen--; ch--; } // this space can't be sep otherwise it would have stopped the field earlier inside end_of_field() - } + //} if ((fieldLen==0 && blank_is_a_NAstring) || (fieldLen && end_NA_string(fieldStart)==ch)) fieldLen=INT32_MIN; // TODO - speed up by avoiding end_NA_string when there are none target->off = (int32_t)(fieldStart - ctx->anchor); target->len = fieldLen; @@ -565,8 +568,8 @@ static void Field(FieldParseContext *ctx) *(ctx->ch) = ch; } else { *(ctx->ch) = ch; - if (*ch=='\0' && quoteRule!=2) { target->off--; target->len++; } // test 1324 where final field has open quote but not ending quote; include the open quote like quote rule 2 - if (stripWhite) while(target->len>0 && ch[-1]==' ') { target->len--; ch--; } // test 1551.6; trailing whitespace in field [67,V37] == "\"\"A\"\" ST " + if (*ch=='\0' && quoteRule!=2) { target->off--; target->len++; } // test 1324 where final field has open quote but not ending quote; include the open quote like quote rule 2 + while(target->len>0 && ((ch[-1]==' ' && stripWhite) || ch[-1]=='\0')) { target->len--; ch--; } // test 1551.6; trailing whitespace in field [67,V37] == "\"\"A\"\" ST " } } @@ -1061,7 +1064,7 @@ static int detect_types( const char **pch, int8_t type[], int ncol, bool *bumped field++; if (sep==' ' && *ch==sep) { while (ch[1]==' ') ch++; - if (ch[1]=='\0' || ch[1]=='\n' || ch[1]=='\r') ch++; // space at the end of line does not mean sep + if (ch[1]=='\n' || ch[1]=='\r' || (ch[1]=='\0' && ch+1=eof) STOP("Input is either empty, fully whitespace, or skip has been set after the last non-whitespace."); @@ -1683,7 +1686,7 @@ int freadMain(freadMainArgs _args) { continue; } if ( (thisNcol1 && !fill) || - (!eol(&ch) && *ch!='\0') ) { + (!eol(&ch) && ch!=eof) ) { if (verbose) DTPRINT(" A line with too-%s fields (%d/%d) was found on line %d of sample jump %d. %s\n", thisNcol0 ? "Most likely this jump landed awkwardly so type bumps here will be skipped." : ""); bumped = false; @@ -2119,7 +2122,7 @@ int freadMain(freadMainArgs _args) { } //*** END HOT. START TEPID ***// if (tch==tLineStart) { - skip_white(&tch); + skip_white(&tch); // skips \0 before eof if (*tch=='\0') break; // empty last line if (eol(&tch) && skipEmptyLines) { tch++; continue; } tch = tLineStart; // in case white space at the beginning may need to be including in field @@ -2148,11 +2151,8 @@ int freadMain(freadMainArgs _args) { if (sep==' ') { while (*tch==' ') tch++; // multiple sep=' ' at the tLineStart does not mean sep. We're at tLineStart because the fast branch above doesn't run when sep=' ' fieldStart = tch; - skip_white(&tch); - if (*tch=='\0') { - nrowLimit = myNrow; - continue; // empty last line - } + skip_white(&tch); // skips \0 before eof + if (*tch=='\0') continue; // tch==eof; empty last line if (eol(&tch) && skipEmptyLines) { tch++; continue; } tch = fieldStart; // in case tabs at the beginning of the first field need to be included } @@ -2183,7 +2183,7 @@ int freadMain(freadMainArgs _args) { if (end_of_field(tch)) { if (sep==' ' && *tch==' ') { while (tch[1]==' ') tch++; // multiple space considered one sep so move to last - if (tch[1]=='\r' || tch[1]=='\n' || tch[1]=='\0') tch++; + if (tch[1]=='\r' || tch[1]=='\n' || tch+1==eof) tch++; } break; } @@ -2233,17 +2233,17 @@ int freadMain(freadMainArgs _args) { ((char**) targets)[size[j]] += size[j]; j++; if (*tch==sep) { tch++; continue; } - if (fill && (*tch=='\n' || *tch=='\r' || *tch=='\0') && j Date: Mon, 15 Apr 2019 14:44:07 -0700 Subject: [PATCH 3/3] extra test of NUL around field --- NEWS.md | 4 ++-- inst/tests/tests.Rraw | 23 +++++++++++++---------- 2 files changed, 15 insertions(+), 12 deletions(-) diff --git a/NEWS.md b/NEWS.md index ba7ab7ee63..0fc093e43e 100644 --- a/NEWS.md +++ b/NEWS.md @@ -9,9 +9,9 @@ 2. `rleid()` functions now support long vectors (length > 2 billion). 3. `fread()`: - * skips embeded NUL characters with warning, [#3400](https://github.com/Rdatatable/data.table/issues/3400) + * now skips embedded `NUL` (`\0`), [#3400](https://github.com/Rdatatable/data.table/issues/3400). Thanks to Marcus Davy for reporting with examples, and Roy Storey for the initial PR. -3. Assigning to one item of a list column no longer requires the RHS to be wrapped with `list` or `.()`, [#950](https://github.com/Rdatatable/data.table/issues/950). +4. Assigning to one item of a list column no longer requires the RHS to be wrapped with `list` or `.()`, [#950](https://github.com/Rdatatable/data.table/issues/950). ```R > DT = data.table(A=1:3, B=list(1:2,"foo",3:5)) > DT diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 5365de26d4..e821bf2deb 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -14017,23 +14017,26 @@ DT = data.table(x=c("a","b","c","b","a","c"), y=c(1,3,6,1,6,3), v=1:6) setindex(DT, y) test(2024, DT[y==6, v:=10L, verbose=TRUE], output=c("Constructing irows for.*", "Reorder irows for.*")) -# fread embedded NULL, #3400 -test(2025.1, fread(testDir("issue_3400_fread.txt"), skip=1, header=TRUE), data.table(A=INT(1,3,4), B=INT(2,2,5), C=INT(3,1,6))) +# fread embedded '\0', #3400 +test(2025.01, fread(testDir("issue_3400_fread.txt"), skip=1, header=TRUE), data.table(A=INT(1,3,4), B=INT(2,2,5), C=INT(3,1,6))) f = tempfile() for (nNUL in 0:3) { writeBin(c(charToRaw("a=b\nA B C\n1 3 5\n"), rep(as.raw(0), nNUL), charToRaw("2 4 6\n")), con=f) - test(2025.2, fread(f, skip=1, header=TRUE), ans<-data.table(A=1:2, B=3:4, C=5:6)) - test(2025.3, fread(f), ans) # auto detect skip and header works too + test(2025.02, fread(f, skip=1, header=TRUE), ans<-data.table(A=1:2, B=3:4, C=5:6)) + test(2025.03, fread(f), ans) # auto detect skip and header works too writeBin(c(charToRaw("a=b\nA,B,C\n1,3,5\n"), rep(as.raw(0), nNUL), charToRaw("2,4,6\n")), con=f) - test(2025.4, fread(f, skip=1, header=TRUE), ans) - test(2025.5, fread(f), ans) + test(2025.04, fread(f, skip=1, header=TRUE), ans) + test(2025.05, fread(f), ans) writeBin(c(charToRaw("a=b\n"), rep(as.raw(0), nNUL), charToRaw("A B C\n1 3 5\n2 4 6\n")), con=f) - test(2025.6, fread(f, skip=1, header=TRUE), ans) - test(2025.7, fread(f), ans) + test(2025.06, fread(f, skip=1, header=TRUE), ans) + test(2025.07, fread(f), ans) writeBin(c(charToRaw("a=b\n"), rep(as.raw(0), nNUL), charToRaw("A,B,C\n1,3,5\n2,4,6\n")), con=f) - test(2025.8, fread(f, skip=1, header=TRUE), ans) - test(2025.9, fread(f), ans) + test(2025.08, fread(f, skip=1, header=TRUE), ans) + test(2025.09, fread(f), ans) } +writeBin(c(charToRaw("A,B,C\n1,foo,5\n2,"), as.raw(0),charToRaw("bar"), as.raw(0),as.raw(0), charToRaw(",6\n")), con=f) +test(2025.10, fread(f), data.table(A=1:2, B=c("foo","bar"), C=5:6)) + ################################### # Add new tests above this line #