Rdatatable · mattdowle · Apr 15, 2019 · Apr 12, 2019 · Apr 15, 2019 · Apr 15, 2019
@@ -38,7 +38,8 @@ Authors@R: c(
   person("Martin","Morgan",        role="ctb"),
   person("Michael","Quinn",        role="ctb"),
   person("@javrucebo","",          role="ctb"),
-  person("@marc-outins","",        role="ctb"))
+  person("@marc-outins","",        role="ctb"),
+  person("Roy","Storey",           role="ctb"))
 Depends: R (>= 3.1.0)
 Imports: methods
 Suggests: bit64, curl, R.utils, knitr, xts, nanotime, zoo

@@ -6,9 +6,12 @@
 
 1. New option `options(datatable.quiet = TRUE)` turns off the package startup message, [#3489](https://github.com/Rdatatable/data.table/issues/3489). `suppressPackageStartupMessages()` continues to work too. Thanks to @leobarlach for the suggestion inspired by `options(tidyverse.quiet = TRUE)`. We don't know of a way to make a package respect the `quietly=` option of `library()` and `require()` because the `quietly=` isn't passed through for use by the package's own `.onAttach`. If you can see how to do that, please submit a patch to R.
 
-2. `rleid` functions now support long vectors (length > 2 billion).
+2. `rleid()` functions now support long vectors (length > 2 billion).
 
-3. Assigning to one item of a list column no longer requires the RHS to be wrapped with `list` or `.()`, [#950](https://github.com/Rdatatable/data.table/issues/950).
+3. `fread()`:
+    * now skips embedded `NUL` (`\0`), [#3400](https://github.com/Rdatatable/data.table/issues/3400). Thanks to Marcus Davy for reporting with examples, and Roy Storey for the initial PR.
+
+4. Assigning to one item of a list column no longer requires the RHS to be wrapped with `list` or `.()`, [#950](https://github.com/Rdatatable/data.table/issues/950).
     ```R
     > DT = data.table(A=1:3, B=list(1:2,"foo",3:5))
     > DT

@@ -14017,8 +14017,25 @@ DT = data.table(x=c("a","b","c","b","a","c"), y=c(1,3,6,1,6,3), v=1:6)
 setindex(DT, y)
 test(2024, DT[y==6, v:=10L, verbose=TRUE], output=c("Constructing irows for.*", "Reorder irows for.*"))
 
-# fread embedded NULL, #3400
-test(2025, fread(testDir("issue_3400_fread.txt"), skip=1, header=TRUE), data.table(A=INT(1,3), B=INT(2,2), C=INT(3,1)))
+# fread embedded '\0', #3400
+test(2025.01, fread(testDir("issue_3400_fread.txt"), skip=1, header=TRUE), data.table(A=INT(1,3,4), B=INT(2,2,5), C=INT(3,1,6)))
+f = tempfile()
+for (nNUL in 0:3) {
+  writeBin(c(charToRaw("a=b\nA  B  C\n1  3  5\n"), rep(as.raw(0), nNUL), charToRaw("2  4  6\n")), con=f)
+  test(2025.02, fread(f, skip=1, header=TRUE), ans<-data.table(A=1:2, B=3:4, C=5:6))
+  test(2025.03, fread(f), ans)  # auto detect skip and header works too
+  writeBin(c(charToRaw("a=b\nA,B,C\n1,3,5\n"), rep(as.raw(0), nNUL), charToRaw("2,4,6\n")), con=f)
+  test(2025.04, fread(f, skip=1, header=TRUE), ans)
+  test(2025.05, fread(f), ans)
+  writeBin(c(charToRaw("a=b\n"), rep(as.raw(0), nNUL), charToRaw("A  B  C\n1  3  5\n2  4  6\n")), con=f)
+  test(2025.06, fread(f, skip=1, header=TRUE), ans)
+  test(2025.07, fread(f), ans)
+  writeBin(c(charToRaw("a=b\n"), rep(as.raw(0), nNUL), charToRaw("A,B,C\n1,3,5\n2,4,6\n")), con=f)
+  test(2025.08, fread(f, skip=1, header=TRUE), ans)
+  test(2025.09, fread(f), ans)
+}
+writeBin(c(charToRaw("A,B,C\n1,foo,5\n2,"), as.raw(0),charToRaw("bar"), as.raw(0),as.raw(0), charToRaw(",6\n")), con=f)
+test(2025.10, fread(f), data.table(A=1:2, B=c("foo","bar"), C=5:6))
 
 
 ###################################

@@ -226,11 +226,12 @@ static char *typesAsString(int ncol) {
 
 static inline void skip_white(const char **pch) {
   // skip space so long as sep isn't space and skip tab so long as sep isn't tab
+  // always skip any \0 (NUL) that occur before end of file, #3400
   const char *ch = *pch;
-  if (whiteChar == 0) {   // whiteChar==0 means skip both ' ' and '\t';  sep is neither ' ' nor '\t'.
-    while (*ch == ' ' || *ch == '\t') ch++;
+  if (whiteChar==0) {   // whiteChar==0 means skip both ' ' and '\t';  sep is neither ' ' nor '\t'.
+    while (*ch==' ' || *ch=='\t' || (*ch=='\0' && ch<eof)) ch++;
   } else {
-    while (*ch == whiteChar) ch++;  // sep is ' ' or '\t' so just skip the other one.
+    while (*ch==whiteChar || (*ch=='\0' && ch<eof)) ch++;  // sep is ' ' or '\t' so just skip the other one.
   }
   *pch = ch;
 }
@@ -272,7 +273,8 @@ static inline bool end_of_field(const char *ch) {
   // of \r, \n, \0. We cast to unsigned first because `char` type is signed by
   // default, and therefore characters in the range 0x80-0xFF are negative.
   // We use eol() because that looks at eol_one_r inside it w.r.t. \r
-  return *ch==sep || ((uint8_t)*ch<=13 && (*ch=='\0' || eol(&ch)));
+  // \0 (maybe more than one) before eof are part of field and do not end it; eol() returns false for \0 but the ch==eof will return true for the \0 at eof.
+  return *ch==sep || ((uint8_t)*ch<=13 && (ch==eof || eol(&ch)));
 }
 
 
@@ -322,7 +324,7 @@ static inline int countfields(const char **pch)
     // Field() leaves *ch resting on sep, \r, \n or *eof=='\0'
     if (sep==' ' && *ch==sep) {
       while (ch[1]==' ') ch++;
-      if (ch[1]=='\r' || ch[1]=='\n' || ch[1]=='\0') {
+      if (ch[1]=='\r' || ch[1]=='\n' || (ch[1]=='\0' && ch+1==eof)) {
         // reached end of line. Ignore padding spaces at the end of line.
         ch++;  // Move onto end of line character
       }
@@ -333,7 +335,7 @@ static inline int countfields(const char **pch)
       continue;
     }
     if (eol(&ch)) { *pch=ch+1; return ncol; }
-    if (*ch!='\0') return -1;  // -1 means this line not valid for this sep and quote rule
+    if (ch!=eof) return -1;  // -1 means this line not valid for this sep and quote rule
     break;
   }
   *pch = ch;
@@ -348,7 +350,7 @@ static inline const char *nextGoodLine(const char *ch, int ncol)
   // If this doesn't return the true line start, no matter. The previous thread will run-on and
   // resolve it. A good guess is all we need here. Being wrong will just be a bit slower.
   // If there are no embedded newlines, all newlines are true, and this guess will never be wrong.
-  while (*ch!='\0' && *ch!='\n' && *ch!='\r') ch++;
+  while (*ch!='\n' && *ch!='\r' && (*ch!='\0' || ch<eof)) ch++;
   if (ch==eof) return eof;
   if (eol(&ch)) // move to last byte of the line ending sequence (e.g. \r\r\n would be +2).
     ch++;       // and then move to first byte of next line
@@ -359,7 +361,7 @@ static inline const char *nextGoodLine(const char *ch, int ncol)
   while (attempts++<5 && ch<eof) {
     const char *ch2 = ch;
     if (countfields(&ch2)==ncol) return ch;  // returns simpleNext here on first attempt, almost all the time
-    while (*ch!='\0' && *ch!='\n' && *ch!='\r') ch++;
+    while (*ch!='\n' && *ch!='\r' && (*ch!='\0' || ch<eof)) ch++;
     if (eol(&ch)) ch++;
   }
   return simpleNext;
@@ -490,17 +492,18 @@ static void Field(FieldParseContext *ctx)
 
   // need to skip_white first for the reason that a quoted field might have space before the
   // quote; e.g. test 1609. We need to skip the space(s) to then switch on quote or not.
-  if (*ch==' ' && stripWhite) while(*++ch==' ');  // if sep==' ' the space would have been skipped already and we wouldn't be on space now.
+  if ((*ch==' ' && stripWhite) || (*ch=='\0' && ch<eof))
+    while(*++ch==' ' || (*ch=='\0' && ch<eof));  // if sep==' ' the space would have been skipped already and we wouldn't be on space now.
   const char *fieldStart=ch;
   if (*ch!=quote || quoteRule==3) {
     // Most common case. Unambiguously not quoted. Simply search for sep|eol. If field contains sep|eol then it should have been quoted and we do not try to heal that.
     while(!end_of_field(ch)) ch++;  // sep, \r, \n or \0 will end
     *(ctx->ch) = ch;
     int fieldLen = (int)(ch-fieldStart);
-    if (stripWhite) {   // TODO:  do this if and the next one together once in bulk afterwards before push
-      while(fieldLen>0 && ch[-1]==' ') { fieldLen--; ch--; }
+    //if (stripWhite) {   // TODO:  do this if and the next one together once in bulk afterwards before push
+      while(fieldLen>0 && ((ch[-1]==' ' && stripWhite) || ch[-1]=='\0')) { fieldLen--; ch--; }
       // this space can't be sep otherwise it would have stopped the field earlier inside end_of_field()
-    }
+    //}
     if ((fieldLen==0 && blank_is_a_NAstring) || (fieldLen && end_NA_string(fieldStart)==ch)) fieldLen=INT32_MIN;  // TODO - speed up by avoiding end_NA_string when there are none
     target->off = (int32_t)(fieldStart - ctx->anchor);
     target->len = fieldLen;
@@ -565,8 +568,8 @@ static void Field(FieldParseContext *ctx)
     *(ctx->ch) = ch;
   } else {
     *(ctx->ch) = ch;
-    if (*ch=='\0' && quoteRule!=2) { target->off--; target->len++; }              // test 1324 where final field has open quote but not ending quote; include the open quote like quote rule 2
-    if (stripWhite) while(target->len>0 && ch[-1]==' ') { target->len--; ch--; }  // test 1551.6; trailing whitespace in field [67,V37] == "\"\"A\"\" ST       "
+    if (*ch=='\0' && quoteRule!=2) { target->off--; target->len++; }   // test 1324 where final field has open quote but not ending quote; include the open quote like quote rule 2
+    while(target->len>0 && ((ch[-1]==' ' && stripWhite) || ch[-1]=='\0')) { target->len--; ch--; }  // test 1551.6; trailing whitespace in field [67,V37] == "\"\"A\"\" ST       "
   }
 }
 
@@ -1061,7 +1064,7 @@ static int detect_types( const char **pch, int8_t type[], int ncol, bool *bumped
     field++;
     if (sep==' ' && *ch==sep) {
       while (ch[1]==' ') ch++;
-      if (ch[1]=='\0' || ch[1]=='\n' || ch[1]=='\r') ch++;  // space at the end of line does not mean sep
+      if (ch[1]=='\n' || ch[1]=='\r' || (ch[1]=='\0' && ch+1<eof)) ch++;  // space at the end of line does not mean sep
     }
     if (*ch!=sep || field==ncol) break;  // field==ncol is needed for 1753.2 where line ends with an extra comma but shouldn't, so shouldn't be moved over
     ch++;
@@ -1394,7 +1397,7 @@ int freadMain(freadMainArgs _args) {
 
   // skip blank input at the start
   const char *lineStart = ch;
-  while (ch<eof && isspace(*ch)) {   // isspace matches ' ', \t, \n and \r
+  while (ch<eof && (isspace(*ch) || *ch=='\0')) {   // isspace matches ' ', \t, \n and \r;  \0 before eof should be skipped too
     if (*ch=='\n') { ch++; lineStart=ch; row1line++; } else ch++;
   }
   if (ch>=eof) STOP("Input is either empty, fully whitespace, or skip has been set after the last non-whitespace.");
@@ -1683,7 +1686,7 @@ int freadMain(freadMainArgs _args) {
         continue;
       }
       if ( (thisNcol<ncol && ncol>1 && !fill) ||
-           (!eol(&ch) && *ch!='\0') ) {
+           (!eol(&ch) && ch!=eof) ) {
         if (verbose) DTPRINT("  A line with too-%s fields (%d/%d) was found on line %d of sample jump %d. %s\n",
                              thisNcol<ncol ? "few" : "many", thisNcol, ncol, jumpLine, jump, jump>0 ? "Most likely this jump landed awkwardly so type bumps here will be skipped." : "");
         bumped = false;
@@ -2119,7 +2122,7 @@ int freadMain(freadMainArgs _args) {
           }
           //*** END HOT. START TEPID ***//
           if (tch==tLineStart) {
-            skip_white(&tch);
+            skip_white(&tch);       // skips \0 before eof
             if (*tch=='\0') break;  // empty last line
             if (eol(&tch) && skipEmptyLines) { tch++; continue; }
             tch = tLineStart;  // in case white space at the beginning may need to be including in field
@@ -2148,11 +2151,8 @@ int freadMain(freadMainArgs _args) {
         if (sep==' ') {
           while (*tch==' ') tch++;  // multiple sep=' ' at the tLineStart does not mean sep. We're at tLineStart because the fast branch above doesn't run when sep=' '
           fieldStart = tch;
-          skip_white(&tch);
-          if (*tch=='\0') {
-            nrowLimit = myNrow;
-            continue; // empty last line
-          }
+          skip_white(&tch);          // skips \0 before eof
+          if (*tch=='\0') continue;  // tch==eof; empty last line
           if (eol(&tch) && skipEmptyLines) { tch++; continue; }
           tch = fieldStart;         // in case tabs at the beginning of the first field need to be included
         }
@@ -2183,7 +2183,7 @@ int freadMain(freadMainArgs _args) {
             if (end_of_field(tch)) {
               if (sep==' ' && *tch==' ') {
                 while (tch[1]==' ') tch++;  // multiple space considered one sep so move to last
-                if (tch[1]=='\r' || tch[1]=='\n' || tch[1]=='\0') tch++;
+                if (tch[1]=='\r' || tch[1]=='\n' || tch+1==eof) tch++;
               }
               break;
             }
@@ -2233,17 +2233,17 @@ int freadMain(freadMainArgs _args) {
           ((char**) targets)[size[j]] += size[j];
           j++;
           if (*tch==sep) { tch++; continue; }
-          if (fill && (*tch=='\n' || *tch=='\r' || *tch=='\0') && j<ncol) continue;  // reuse processors to write appropriate NA to target; saves maintenance of a type switch down here
+          if (fill && (*tch=='\n' || *tch=='\r' || tch==eof) && j<ncol) continue;  // reuse processors to write appropriate NA to target; saves maintenance of a type switch down here
           break;
         }
-        if (j<ncol || (!eol(&tch) && *tch!='\0'))  {
+        if (j<ncol || (!eol(&tch) && tch!=eof))  {
           // Too few or too many columns observed (but not empty lines when skipEmptyLines as they were found and skipped earlier above).
           // If fill==true, fields should already have been filled above due to continue inside while(j<ncol).
           myStopEarly = true;
           tch = tLineStart;
           break;
         }
-        if (*tch!='\0') tch++;
+        if (tch!=eof) tch++;
         myNrow++;
       }
       if (verbose) { double now = wallclock(); thRead += now-tLast; tLast = now; }