Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,8 @@ Authors@R: c(
person("Martin","Morgan", role="ctb"),
person("Michael","Quinn", role="ctb"),
person("@javrucebo","", role="ctb"),
person("@marc-outins","", role="ctb"))
person("@marc-outins","", role="ctb"),
person("Roy","Storey", role="ctb"))
Depends: R (>= 3.1.0)
Imports: methods
Suggests: bit64, curl, R.utils, knitr, xts, nanotime, zoo
Expand Down
7 changes: 5 additions & 2 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,12 @@

1. New option `options(datatable.quiet = TRUE)` turns off the package startup message, [#3489](https://github.com/Rdatatable/data.table/issues/3489). `suppressPackageStartupMessages()` continues to work too. Thanks to @leobarlach for the suggestion inspired by `options(tidyverse.quiet = TRUE)`. We don't know of a way to make a package respect the `quietly=` option of `library()` and `require()` because the `quietly=` isn't passed through for use by the package's own `.onAttach`. If you can see how to do that, please submit a patch to R.

2. `rleid` functions now support long vectors (length > 2 billion).
2. `rleid()` functions now support long vectors (length > 2 billion).

3. Assigning to one item of a list column no longer requires the RHS to be wrapped with `list` or `.()`, [#950](https://github.com/Rdatatable/data.table/issues/950).
3. `fread()`:
* now skips embedded `NUL` (`\0`), [#3400](https://github.com/Rdatatable/data.table/issues/3400). Thanks to Marcus Davy for reporting with examples, and Roy Storey for the initial PR.

4. Assigning to one item of a list column no longer requires the RHS to be wrapped with `list` or `.()`, [#950](https://github.com/Rdatatable/data.table/issues/950).
```R
> DT = data.table(A=1:3, B=list(1:2,"foo",3:5))
> DT
Expand Down
21 changes: 19 additions & 2 deletions inst/tests/tests.Rraw
Original file line number Diff line number Diff line change
Expand Up @@ -14017,8 +14017,25 @@ DT = data.table(x=c("a","b","c","b","a","c"), y=c(1,3,6,1,6,3), v=1:6)
setindex(DT, y)
test(2024, DT[y==6, v:=10L, verbose=TRUE], output=c("Constructing irows for.*", "Reorder irows for.*"))

# fread embedded NULL, #3400
test(2025, fread(testDir("issue_3400_fread.txt"), skip=1, header=TRUE), data.table(A=INT(1,3), B=INT(2,2), C=INT(3,1)))
# fread embedded '\0', #3400
test(2025.01, fread(testDir("issue_3400_fread.txt"), skip=1, header=TRUE), data.table(A=INT(1,3,4), B=INT(2,2,5), C=INT(3,1,6)))
f = tempfile()
for (nNUL in 0:3) {
writeBin(c(charToRaw("a=b\nA B C\n1 3 5\n"), rep(as.raw(0), nNUL), charToRaw("2 4 6\n")), con=f)
test(2025.02, fread(f, skip=1, header=TRUE), ans<-data.table(A=1:2, B=3:4, C=5:6))
test(2025.03, fread(f), ans) # auto detect skip and header works too
writeBin(c(charToRaw("a=b\nA,B,C\n1,3,5\n"), rep(as.raw(0), nNUL), charToRaw("2,4,6\n")), con=f)
test(2025.04, fread(f, skip=1, header=TRUE), ans)
test(2025.05, fread(f), ans)
writeBin(c(charToRaw("a=b\n"), rep(as.raw(0), nNUL), charToRaw("A B C\n1 3 5\n2 4 6\n")), con=f)
test(2025.06, fread(f, skip=1, header=TRUE), ans)
test(2025.07, fread(f), ans)
writeBin(c(charToRaw("a=b\n"), rep(as.raw(0), nNUL), charToRaw("A,B,C\n1,3,5\n2,4,6\n")), con=f)
test(2025.08, fread(f, skip=1, header=TRUE), ans)
test(2025.09, fread(f), ans)
}
writeBin(c(charToRaw("A,B,C\n1,foo,5\n2,"), as.raw(0),charToRaw("bar"), as.raw(0),as.raw(0), charToRaw(",6\n")), con=f)
test(2025.10, fread(f), data.table(A=1:2, B=c("foo","bar"), C=5:6))


###################################
Expand Down
54 changes: 27 additions & 27 deletions src/fread.c
Original file line number Diff line number Diff line change
Expand Up @@ -226,11 +226,12 @@ static char *typesAsString(int ncol) {

static inline void skip_white(const char **pch) {
// skip space so long as sep isn't space and skip tab so long as sep isn't tab
// always skip any \0 (NUL) that occur before end of file, #3400
const char *ch = *pch;
if (whiteChar == 0) { // whiteChar==0 means skip both ' ' and '\t'; sep is neither ' ' nor '\t'.
while (*ch == ' ' || *ch == '\t') ch++;
if (whiteChar==0) { // whiteChar==0 means skip both ' ' and '\t'; sep is neither ' ' nor '\t'.
while (*ch==' ' || *ch=='\t' || (*ch=='\0' && ch<eof)) ch++;
} else {
while (*ch == whiteChar) ch++; // sep is ' ' or '\t' so just skip the other one.
while (*ch==whiteChar || (*ch=='\0' && ch<eof)) ch++; // sep is ' ' or '\t' so just skip the other one.
}
*pch = ch;
}
Expand Down Expand Up @@ -272,7 +273,8 @@ static inline bool end_of_field(const char *ch) {
// of \r, \n, \0. We cast to unsigned first because `char` type is signed by
// default, and therefore characters in the range 0x80-0xFF are negative.
// We use eol() because that looks at eol_one_r inside it w.r.t. \r
return *ch==sep || ((uint8_t)*ch<=13 && (*ch=='\0' || eol(&ch)));
// \0 (maybe more than one) before eof are part of field and do not end it; eol() returns false for \0 but the ch==eof will return true for the \0 at eof.
return *ch==sep || ((uint8_t)*ch<=13 && (ch==eof || eol(&ch)));
}


Expand Down Expand Up @@ -322,7 +324,7 @@ static inline int countfields(const char **pch)
// Field() leaves *ch resting on sep, \r, \n or *eof=='\0'
if (sep==' ' && *ch==sep) {
while (ch[1]==' ') ch++;
if (ch[1]=='\r' || ch[1]=='\n' || ch[1]=='\0') {
if (ch[1]=='\r' || ch[1]=='\n' || (ch[1]=='\0' && ch+1==eof)) {
// reached end of line. Ignore padding spaces at the end of line.
ch++; // Move onto end of line character
}
Expand All @@ -333,7 +335,7 @@ static inline int countfields(const char **pch)
continue;
}
if (eol(&ch)) { *pch=ch+1; return ncol; }
if (*ch!='\0') return -1; // -1 means this line not valid for this sep and quote rule
if (ch!=eof) return -1; // -1 means this line not valid for this sep and quote rule
break;
}
*pch = ch;
Expand All @@ -348,7 +350,7 @@ static inline const char *nextGoodLine(const char *ch, int ncol)
// If this doesn't return the true line start, no matter. The previous thread will run-on and
// resolve it. A good guess is all we need here. Being wrong will just be a bit slower.
// If there are no embedded newlines, all newlines are true, and this guess will never be wrong.
while (*ch!='\0' && *ch!='\n' && *ch!='\r') ch++;
while (*ch!='\n' && *ch!='\r' && (*ch!='\0' || ch<eof)) ch++;
if (ch==eof) return eof;
if (eol(&ch)) // move to last byte of the line ending sequence (e.g. \r\r\n would be +2).
ch++; // and then move to first byte of next line
Expand All @@ -359,7 +361,7 @@ static inline const char *nextGoodLine(const char *ch, int ncol)
while (attempts++<5 && ch<eof) {
const char *ch2 = ch;
if (countfields(&ch2)==ncol) return ch; // returns simpleNext here on first attempt, almost all the time
while (*ch!='\0' && *ch!='\n' && *ch!='\r') ch++;
while (*ch!='\n' && *ch!='\r' && (*ch!='\0' || ch<eof)) ch++;
if (eol(&ch)) ch++;
}
return simpleNext;
Expand Down Expand Up @@ -490,17 +492,18 @@ static void Field(FieldParseContext *ctx)

// need to skip_white first for the reason that a quoted field might have space before the
// quote; e.g. test 1609. We need to skip the space(s) to then switch on quote or not.
if (*ch==' ' && stripWhite) while(*++ch==' '); // if sep==' ' the space would have been skipped already and we wouldn't be on space now.
if ((*ch==' ' && stripWhite) || (*ch=='\0' && ch<eof))
while(*++ch==' ' || (*ch=='\0' && ch<eof)); // if sep==' ' the space would have been skipped already and we wouldn't be on space now.
const char *fieldStart=ch;
if (*ch!=quote || quoteRule==3) {
// Most common case. Unambiguously not quoted. Simply search for sep|eol. If field contains sep|eol then it should have been quoted and we do not try to heal that.
while(!end_of_field(ch)) ch++; // sep, \r, \n or \0 will end
*(ctx->ch) = ch;
int fieldLen = (int)(ch-fieldStart);
if (stripWhite) { // TODO: do this if and the next one together once in bulk afterwards before push
while(fieldLen>0 && ch[-1]==' ') { fieldLen--; ch--; }
//if (stripWhite) { // TODO: do this if and the next one together once in bulk afterwards before push
while(fieldLen>0 && ((ch[-1]==' ' && stripWhite) || ch[-1]=='\0')) { fieldLen--; ch--; }
// this space can't be sep otherwise it would have stopped the field earlier inside end_of_field()
}
//}
if ((fieldLen==0 && blank_is_a_NAstring) || (fieldLen && end_NA_string(fieldStart)==ch)) fieldLen=INT32_MIN; // TODO - speed up by avoiding end_NA_string when there are none
target->off = (int32_t)(fieldStart - ctx->anchor);
target->len = fieldLen;
Expand Down Expand Up @@ -565,8 +568,8 @@ static void Field(FieldParseContext *ctx)
*(ctx->ch) = ch;
} else {
*(ctx->ch) = ch;
if (*ch=='\0' && quoteRule!=2) { target->off--; target->len++; } // test 1324 where final field has open quote but not ending quote; include the open quote like quote rule 2
if (stripWhite) while(target->len>0 && ch[-1]==' ') { target->len--; ch--; } // test 1551.6; trailing whitespace in field [67,V37] == "\"\"A\"\" ST "
if (*ch=='\0' && quoteRule!=2) { target->off--; target->len++; } // test 1324 where final field has open quote but not ending quote; include the open quote like quote rule 2
while(target->len>0 && ((ch[-1]==' ' && stripWhite) || ch[-1]=='\0')) { target->len--; ch--; } // test 1551.6; trailing whitespace in field [67,V37] == "\"\"A\"\" ST "
}
}

Expand Down Expand Up @@ -1061,7 +1064,7 @@ static int detect_types( const char **pch, int8_t type[], int ncol, bool *bumped
field++;
if (sep==' ' && *ch==sep) {
while (ch[1]==' ') ch++;
if (ch[1]=='\0' || ch[1]=='\n' || ch[1]=='\r') ch++; // space at the end of line does not mean sep
if (ch[1]=='\n' || ch[1]=='\r' || (ch[1]=='\0' && ch+1<eof)) ch++; // space at the end of line does not mean sep
}
if (*ch!=sep || field==ncol) break; // field==ncol is needed for 1753.2 where line ends with an extra comma but shouldn't, so shouldn't be moved over
ch++;
Expand Down Expand Up @@ -1394,7 +1397,7 @@ int freadMain(freadMainArgs _args) {

// skip blank input at the start
const char *lineStart = ch;
while (ch<eof && isspace(*ch)) { // isspace matches ' ', \t, \n and \r
while (ch<eof && (isspace(*ch) || *ch=='\0')) { // isspace matches ' ', \t, \n and \r; \0 before eof should be skipped too
if (*ch=='\n') { ch++; lineStart=ch; row1line++; } else ch++;
}
if (ch>=eof) STOP("Input is either empty, fully whitespace, or skip has been set after the last non-whitespace.");
Expand Down Expand Up @@ -1683,7 +1686,7 @@ int freadMain(freadMainArgs _args) {
continue;
}
if ( (thisNcol<ncol && ncol>1 && !fill) ||
(!eol(&ch) && *ch!='\0') ) {
(!eol(&ch) && ch!=eof) ) {
if (verbose) DTPRINT(" A line with too-%s fields (%d/%d) was found on line %d of sample jump %d. %s\n",
thisNcol<ncol ? "few" : "many", thisNcol, ncol, jumpLine, jump, jump>0 ? "Most likely this jump landed awkwardly so type bumps here will be skipped." : "");
bumped = false;
Expand Down Expand Up @@ -2119,7 +2122,7 @@ int freadMain(freadMainArgs _args) {
}
//*** END HOT. START TEPID ***//
if (tch==tLineStart) {
skip_white(&tch);
skip_white(&tch); // skips \0 before eof
if (*tch=='\0') break; // empty last line
if (eol(&tch) && skipEmptyLines) { tch++; continue; }
tch = tLineStart; // in case white space at the beginning may need to be including in field
Expand Down Expand Up @@ -2148,11 +2151,8 @@ int freadMain(freadMainArgs _args) {
if (sep==' ') {
while (*tch==' ') tch++; // multiple sep=' ' at the tLineStart does not mean sep. We're at tLineStart because the fast branch above doesn't run when sep=' '
fieldStart = tch;
skip_white(&tch);
if (*tch=='\0') {
nrowLimit = myNrow;
continue; // empty last line
}
skip_white(&tch); // skips \0 before eof
if (*tch=='\0') continue; // tch==eof; empty last line
if (eol(&tch) && skipEmptyLines) { tch++; continue; }
tch = fieldStart; // in case tabs at the beginning of the first field need to be included
}
Expand Down Expand Up @@ -2183,7 +2183,7 @@ int freadMain(freadMainArgs _args) {
if (end_of_field(tch)) {
if (sep==' ' && *tch==' ') {
while (tch[1]==' ') tch++; // multiple space considered one sep so move to last
if (tch[1]=='\r' || tch[1]=='\n' || tch[1]=='\0') tch++;
if (tch[1]=='\r' || tch[1]=='\n' || tch+1==eof) tch++;
}
break;
}
Expand Down Expand Up @@ -2233,17 +2233,17 @@ int freadMain(freadMainArgs _args) {
((char**) targets)[size[j]] += size[j];
j++;
if (*tch==sep) { tch++; continue; }
if (fill && (*tch=='\n' || *tch=='\r' || *tch=='\0') && j<ncol) continue; // reuse processors to write appropriate NA to target; saves maintenance of a type switch down here
if (fill && (*tch=='\n' || *tch=='\r' || tch==eof) && j<ncol) continue; // reuse processors to write appropriate NA to target; saves maintenance of a type switch down here
break;
}
if (j<ncol || (!eol(&tch) && *tch!='\0')) {
if (j<ncol || (!eol(&tch) && tch!=eof)) {
// Too few or too many columns observed (but not empty lines when skipEmptyLines as they were found and skipped earlier above).
// If fill==true, fields should already have been filled above due to continue inside while(j<ncol).
myStopEarly = true;
tch = tLineStart;
break;
}
if (*tch!='\0') tch++;
if (tch!=eof) tch++;
myNrow++;
}
if (verbose) { double now = wallclock(); thRead += now-tLast; tLast = now; }
Expand Down