From 74bf37ef495b72ba5c5262ed0eac699a7088f428 Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger Date: Sat, 18 Oct 2025 01:36:32 +0200 Subject: [PATCH 01/32] implement comment.char argument for fread --- NEWS.md | 1 + R/fread.R | 8 +++-- inst/tests/tests.Rraw | 7 ++++ man/fread.Rd | 3 +- src/data.table.h | 2 +- src/fread.c | 79 ++++++++++++++++++++++++++++++++++++++++--- src/fread.h | 4 +++ src/freadR.c | 8 +++++ 8 files changed, 104 insertions(+), 8 deletions(-) diff --git a/NEWS.md b/NEWS.md index 1ef9396f71..c027ffd875 100644 --- a/NEWS.md +++ b/NEWS.md @@ -291,6 +291,7 @@ # user system elapsed # 0.028 0.000 0.005 ``` +20. `fread()` has now `comment.char` argument implemented to skip trailing comments or comment-only lines similar as in `read.table`, [#856](https://github.com/Rdatatable/data.table/issues/856). Thanks to @arunsrinivasan and many others for the suggestion and @ben-schwen for the implementation. ### BUG FIXES diff --git a/R/fread.R b/R/fread.R index 2ea6cb796a..33935bd9d5 100644 --- a/R/fread.R +++ b/R/fread.R @@ -2,7 +2,7 @@ fread = function( input="", file=NULL, text=NULL, cmd=NULL, sep="auto", sep2="auto", dec="auto", quote="\"", nrows=Inf, header="auto", na.strings=getOption("datatable.na.strings","NA"), stringsAsFactors=FALSE, verbose=getOption("datatable.verbose",FALSE), skip="__auto__", select=NULL, drop=NULL, colClasses=NULL, integer64=getOption("datatable.integer64","integer64"), -col.names, check.names=FALSE, encoding="unknown", strip.white=TRUE, fill=FALSE, blank.lines.skip=FALSE, key=NULL, index=NULL, +col.names, check.names=FALSE, encoding="unknown", strip.white=TRUE, fill=FALSE, blank.lines.skip=FALSE, comment.char="", key=NULL, index=NULL, showProgress=getOption("datatable.showProgress",interactive()), data.table=getOption("datatable.fread.datatable",TRUE), nThread=getDTthreads(verbose), logical01=getOption("datatable.logical01",FALSE), logicalYN=getOption("datatable.logicalYN", FALSE), @@ -30,6 +30,10 @@ yaml=FALSE, tmpdir=tempdir(), tz="UTC") isTRUEorFALSE(stringsAsFactors) || (is.double(stringsAsFactors) && length(stringsAsFactors)==1L && 0.0<=stringsAsFactors && stringsAsFactors<=1.0), is.numeric(nrows), length(nrows)==1L ) + if (is.null(comment.char)) comment.char = "" + if (!is.character(comment.char) || length(comment.char) != 1L || is.na(comment.char) || nchar(comment.char) > 1L) { + stopf("comment.char= must be a single non-NA character string.") + } fill = if(identical(fill, Inf)) .Machine$integer.max else as.integer(fill) nrows=as.double(nrows) #4686 if (is.na(nrows) || nrows<0L) nrows=Inf # accept -1 to mean Inf, as read.table does @@ -289,7 +293,7 @@ yaml=FALSE, tmpdir=tempdir(), tz="UTC") if (identical(tt,"") || is_utc(tt)) # empty TZ env variable ("") means UTC in C library, unlike R; _unset_ TZ means local tz="UTC" } - ans = .Call(CfreadR,input,identical(input,file),sep,dec,quote,header,nrows,skip,na.strings,strip.white,blank.lines.skip, + ans = .Call(CfreadR,input,identical(input,file),sep,dec,quote,header,nrows,skip,na.strings,strip.white,blank.lines.skip,comment.char, fill,showProgress,nThread,verbose,warnings2errors,logical01,logicalYN,select,drop,colClasses,integer64,encoding,keepLeadingZeros,tz=="UTC") if (!length(ans)) return(null.data.table()) # test 1743.308 drops all columns nr = length(ans[[1L]]) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index b2142520d8..6c7fc0b728 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -21685,3 +21685,10 @@ d3 = unserialize(serialize(d2, NULL)) test(2340.05, .selfref.ok(d3), FALSE) setDT(d3) test(2340.06, .selfref.ok(d3), TRUE) + +# implement comment.char argument in fread, #856 +test(2341.1, fread('a,b\n#a comment\n1,2\n#another comment\n3,4', comment.char='#'), data.table(a=c(1L,3L), b=c(2L,4L))) +test(2341.2, fread('a,b #line-trailing comment\n1,2', comment.char='#'), data.table(a=c(1L), b=c(2L))) +test(2341.3, fread('a,b#line-trailing comment and no whitespace\n1,2', comment.char='#'), data.table(a=c(1L), b=c(2L))) +test(2341.4, fread('a,b\n1,2 #trailing after numeric', comment.char='#'), data.table(a=c(1L), b=c(2L))) +test(2341.5, fread('a\n"#quotes#"\n', comment.char="#"), data.table(a=c("#quotes#"))) diff --git a/man/fread.Rd b/man/fread.Rd index 38352662f2..1560a2acff 100644 --- a/man/fread.Rd +++ b/man/fread.Rd @@ -17,7 +17,7 @@ skip="__auto__", select=NULL, drop=NULL, colClasses=NULL, integer64=getOption("datatable.integer64", "integer64"), col.names, check.names=FALSE, encoding="unknown", -strip.white=TRUE, fill=FALSE, blank.lines.skip=FALSE, +strip.white=TRUE, fill=FALSE, blank.lines.skip=FALSE, comment.char="", key=NULL, index=NULL, showProgress=getOption("datatable.showProgress", interactive()), data.table=getOption("datatable.fread.datatable", TRUE), @@ -56,6 +56,7 @@ yaml=FALSE, tmpdir=tempdir(), tz="UTC" \item{strip.white}{ Logical, default \code{TRUE}, in which case leading and trailing whitespace is stripped from unquoted \code{"character"} fields. \code{"numeric"} fields are always stripped of leading and trailing whitespace.} \item{fill}{logical or integer (default is \code{FALSE}). If \code{TRUE} then in case the rows have unequal length, number of columns is estimated and blank fields are implicitly filled. If an integer is provided it is used as an upper bound for the number of columns. If \code{fill=Inf} then the whole file is read for detecting the number of columns. } \item{blank.lines.skip}{\code{logical}, default is \code{FALSE}. If \code{TRUE} blank lines in the input are ignored.} + \item{comment.char}{Character vector of length one containing a single character of an empty string. Any text after the comment character in a line is ignored. Use \code{""} to turn off the interpretation of comments altogether.} \item{key}{Character vector of one or more column names which is passed to \code{\link{setkey}}. Only valid when argument \code{data.table=TRUE}. Where applicable, this should refer to column names given in \code{col.names}. } \item{index}{ Character vector or list of character vectors of one or more column names which is passed to \code{\link{setindexv}}. As with \code{key}, comma-separated notation like \code{index="x,y,z"} is accepted for convenience. Only valid when argument \code{data.table=TRUE}. Where applicable, this should refer to column names given in \code{col.names}. } \item{showProgress}{ \code{TRUE} displays progress on the console if the ETA is greater than 3 seconds. It is produced in fread's C code where the very nice (but R level) txtProgressBar and tkProgressBar are not easily available. } diff --git a/src/data.table.h b/src/data.table.h index 775d492aac..1333c9926a 100644 --- a/src/data.table.h +++ b/src/data.table.h @@ -351,7 +351,7 @@ SEXP setcharvec(SEXP, SEXP, SEXP); SEXP chmatch_R(SEXP, SEXP, SEXP); SEXP chmatchdup_R(SEXP, SEXP, SEXP); SEXP chin_R(SEXP, SEXP); -SEXP freadR(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP); +SEXP freadR(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP); SEXP fwriteR(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP); SEXP rbindlist(SEXP, SEXP, SEXP, SEXP, SEXP); SEXP setlistelt(SEXP, SEXP, SEXP); diff --git a/src/fread.c b/src/fread.c index 56390810a3..9fc06ae028 100644 --- a/src/fread.c +++ b/src/fread.c @@ -31,6 +31,7 @@ static const char *sof, *eof; static char sep; static char whiteChar; // what to consider as whitespace to skip: ' ', '\t' or 0 means both (when sep!=' ' && sep!='\t') static char quote, dec; +static char commentChar; static int linesForDecDot; // when dec='auto', track the balance of fields in favor of dec='.' vs dec=',', ties go to '.' static bool eol_one_r; // only true very rarely for \r-only files @@ -188,7 +189,7 @@ bool freadCleanup(void) } free(mmp_copy); mmp_copy = NULL; fileSize = 0; - sep = whiteChar = quote = dec = '\0'; + sep = whiteChar = quote = dec = commentChar = '\0'; quoteRule = -1; any_number_like_NAstrings = false; blank_is_a_NAstring = false; @@ -304,7 +305,8 @@ static inline bool end_of_field(const char *ch) // default, and therefore characters in the range 0x80-0xFF are negative. // We use eol() because that looks at eol_one_r inside it w.r.t. \r // \0 (maybe more than one) before eof are part of field and do not end it; eol() returns false for \0 but the ch==eof will return true for the \0 at eof. - return *ch == sep || ((uint8_t)*ch <= 13 && (ch == eof || eol(&ch))); + // Comment characters terminate a field immediately and take precedence over separators. + return *ch == sep || ((uint8_t)*ch <= 13 && (ch == eof || eol(&ch))) || (commentChar && *ch == commentChar); } static inline const char *end_NA_string(const char *start) @@ -336,8 +338,24 @@ static inline int countfields(const char **pch) static void *targets[9]; targets[8] = (void*) &trash; const char *ch = *pch; - if (sep == ' ') while (*ch == ' ') ch++; // multiple sep==' ' at the start does not mean sep - skip_white(&ch); + for (;;) { + if (ch >= eof) { *pch = ch; return 0; } + if (sep == ' ') while (*ch == ' ') ch++; // multiple sep==' ' at the start does not mean sep + skip_white(&ch); + if (commentChar && *ch == commentChar) { + while (ch < eof && *ch != '\n' && *ch != '\r') ch++; + if (ch < eof) { + if (*ch == '\r' || *ch == '\n') { + eol(&ch); + if (ch < eof) ch++; + } + continue; // rescan next line + } + *pch = ch; + return 0; + } + break; + } if (eol(&ch) || ch == eof) { *pch = ch + 1; return 0; @@ -350,6 +368,17 @@ static inline int countfields(const char **pch) }; while (ch < eof) { Field(&ctx); + if (commentChar && *ch == commentChar) { + while (ch < eof && *ch != '\n' && *ch != '\r') ch++; + if (ch < eof) { + if (*ch == '\r' || *ch == '\n') { + eol(&ch); + if (ch < eof) ch++; + } + } + *pch = ch; + return ncol; + } // Field() leaves *ch resting on sep, \r, \n or *eof=='\0' if (sep == ' ' && *ch == sep) { while (ch[1] == ' ') ch++; @@ -1422,6 +1451,7 @@ int freadMain(freadMainArgs _args) fill = args.fill; dec = args.dec; quote = args.quote; + commentChar = args.comment; if (args.sep == quote && quote!='\0') STOP(_("sep == quote ('%c') is not allowed"), quote); if (args.sep == dec && dec != '\0') STOP(_("sep == dec ('%c') is not allowed"), dec); if (quote == dec && dec != '\0') STOP(_("quote == dec ('%c') is not allowed"), dec); @@ -2206,12 +2236,31 @@ int freadMain(freadMainArgs _args) ch++; Field(&fctx); // stores the string length and offset as in colNames[i] ((lenOff**) fctx.targets)[8]++; + if (commentChar) { + // skip leading whitespace to detect inline comment marker in header row + const char *commentPos = ch; + while (commentPos < eof && (*commentPos == ' ' || *commentPos == '\t' || *commentPos == '\0')) commentPos++; + if (commentPos < eof && *commentPos == commentChar) { + ch = commentPos; + while (ch < eof && *ch != '\n' && *ch != '\r') ch++; + break; // stop header parsing after comment + } + } if (*ch != sep) break; if (sep == ' ') { while (ch[1] == ' ') ch++; if (ch[1] == '\r' || ch[1] == '\n' || ch[1] == '\0') { ch++; break; } } } + if (commentChar) { + // fast-trim trailing comment text after the header names + const char *commentPos = ch; + while (commentPos < eof && (*commentPos == ' ' || *commentPos == '\t' || *commentPos == '\0')) commentPos++; + if (commentPos < eof && *commentPos == commentChar) { + ch = commentPos; + while (ch < eof && *ch != '\n' && *ch != '\r') ch++; + } + } if (eol(&ch)) pos = ++ch; else if (*ch == '\0') pos = ch; else INTERNAL_STOP("reading colnames ending on '%c'", *ch); // # nocov @@ -2452,6 +2501,19 @@ int freadMain(freadMainArgs _args) tLineStart = tch; // for error message const char *fieldStart = tch; int j = 0; + + if (commentChar) { + // treat lines whose first non-space character is the comment marker as empty + const char *afterWhite = tLineStart; + while (afterWhite < eof && (*afterWhite == ' ' || *afterWhite == '\t' || *afterWhite == '\0')) afterWhite++; + if (afterWhite < eof && *afterWhite == commentChar) { + const char *skip = afterWhite; + while (skip < eof && *skip != '\n' && *skip != '\r') skip++; + if (skip < eof && eol(&skip)) skip++; + tch = skip; + continue; + } + } //*** START HOT ***// if (sep != ' ' && !any_number_like_NAstrings) { // TODO: can this 'if' be dropped somehow? Can numeric NAstrings be dealt with afterwards in one go as numeric comparison? @@ -2596,6 +2658,15 @@ int freadMain(freadMainArgs _args) int8_t thisSize = size[j]; if (thisSize) ((char**) targets)[size[j]] += size[j]; // 'if' to avoid undefined NULL+=0 when rereading j++; + if (commentChar) { + const char *commentPtr = tch; + while (commentPtr < eof && (*commentPtr == ' ' || *commentPtr == '\t' || *commentPtr == '\0')) commentPtr++; + if (commentPtr < eof && *commentPtr == commentChar) { + tch = commentPtr; + while (tch < eof && *tch != '\n' && *tch != '\r') tch++; + break; + } + } if (*tch == sep) { tch++; continue; } if (fill && (*tch == '\n' || *tch == '\r' || tch == eof) && j < ncol) continue; // reuse processors to write appropriate NA to target; saves maintenance of a type switch down here break; diff --git a/src/fread.h b/src/fread.h index da4837222f..645a2a47d5 100644 --- a/src/fread.h +++ b/src/fread.h @@ -123,6 +123,10 @@ typedef struct freadMainArgs // non-ASCII, or different open/closing quotation marks are not supported. char quote; + // Character that marks the beginning of a comment. When '\0', comment + // parsing is disabled. + char comment; + // Is there a header at the beginning of the file? // 0 = no, 1 = yes, -128 = autodetect int8_t header; diff --git a/src/freadR.c b/src/freadR.c index 11bf027598..7e1af2b22b 100644 --- a/src/freadR.c +++ b/src/freadR.c @@ -61,6 +61,7 @@ SEXP freadR( SEXP NAstringsArg, SEXP stripWhiteArg, SEXP skipEmptyLinesArg, + SEXP commentCharArg, SEXP fillArg, SEXP showProgressArg, SEXP nThreadArg, @@ -158,6 +159,13 @@ SEXP freadR( // here we use bool and rely on fread at R level to check these do not contain NA_LOGICAL args.stripWhite = LOGICAL(stripWhiteArg)[0]; args.skipEmptyLines = LOGICAL(skipEmptyLinesArg)[0]; + if (!isString(commentCharArg) || LENGTH(commentCharArg) != 1) + error(_("comment.char must be a single character vector of length 1")); // # notranslate + const char *commentStr = CHAR(STRING_ELT(commentCharArg, 0)); + size_t commentLen = strlen(commentStr); + if (commentLen > 1) + error(_("comment.char must be a single character or \"\"")); // # notranslate + args.comment = commentLen == 0 ? '\0' : commentStr[0]; args.fill = INTEGER(fillArg)[0]; args.showProgress = LOGICAL(showProgressArg)[0]; if (INTEGER(nThreadArg)[0] < 1) From 6f34ff2455bba297a68258f6a08e8c3434573ed7 Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger Date: Sat, 18 Oct 2025 17:52:16 +0200 Subject: [PATCH 02/32] remove handling of comment.char=NULL --- R/fread.R | 1 - 1 file changed, 1 deletion(-) diff --git a/R/fread.R b/R/fread.R index 33935bd9d5..583fdb5163 100644 --- a/R/fread.R +++ b/R/fread.R @@ -30,7 +30,6 @@ yaml=FALSE, tmpdir=tempdir(), tz="UTC") isTRUEorFALSE(stringsAsFactors) || (is.double(stringsAsFactors) && length(stringsAsFactors)==1L && 0.0<=stringsAsFactors && stringsAsFactors<=1.0), is.numeric(nrows), length(nrows)==1L ) - if (is.null(comment.char)) comment.char = "" if (!is.character(comment.char) || length(comment.char) != 1L || is.na(comment.char) || nchar(comment.char) > 1L) { stopf("comment.char= must be a single non-NA character string.") } From 2d4ae58986dea9845b4b5c3a06a65cf31124add5 Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger Date: Sat, 18 Oct 2025 18:02:48 +0200 Subject: [PATCH 03/32] update NEWS --- NEWS.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/NEWS.md b/NEWS.md index c027ffd875..9460b3449b 100644 --- a/NEWS.md +++ b/NEWS.md @@ -291,7 +291,7 @@ # user system elapsed # 0.028 0.000 0.005 ``` -20. `fread()` has now `comment.char` argument implemented to skip trailing comments or comment-only lines similar as in `read.table`, [#856](https://github.com/Rdatatable/data.table/issues/856). Thanks to @arunsrinivasan and many others for the suggestion and @ben-schwen for the implementation. + 20. `fread()` now supports the `comment.char` argument to skip trailing comments or comment-only lines, consistent with `read.table()`, [#856](https://github.com/Rdatatable/data.table/issues/856). The default remains `comment.char = ""` (no comment parsing) for backward compatibility and performance, in contrast to `read.table(comment.char = "#")`. Thanks to @arunsrinivasan and many others for the suggestion and @ben-schwen for the implementation. ### BUG FIXES From 1d2f11a21c1e72f9b6cbf89a22e347fcf6c99b78 Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger Date: Sat, 18 Oct 2025 18:05:55 +0200 Subject: [PATCH 04/32] update tests --- inst/tests/tests.Rraw | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 6c7fc0b728..3a18017edd 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -21687,8 +21687,16 @@ setDT(d3) test(2340.06, .selfref.ok(d3), TRUE) # implement comment.char argument in fread, #856 -test(2341.1, fread('a,b\n#a comment\n1,2\n#another comment\n3,4', comment.char='#'), data.table(a=c(1L,3L), b=c(2L,4L))) -test(2341.2, fread('a,b #line-trailing comment\n1,2', comment.char='#'), data.table(a=c(1L), b=c(2L))) -test(2341.3, fread('a,b#line-trailing comment and no whitespace\n1,2', comment.char='#'), data.table(a=c(1L), b=c(2L))) -test(2341.4, fread('a,b\n1,2 #trailing after numeric', comment.char='#'), data.table(a=c(1L), b=c(2L))) -test(2341.5, fread('a\n"#quotes#"\n', comment.char="#"), data.table(a=c("#quotes#"))) +test(2341.1, fread('a,b +#a comment +1,2 +#another comment +3,4', comment.char='#'), data.table(a=c(1L,3L), b=c(2L,4L))) +test(2341.2, fread('a,b #line-trailing comment +1,2', comment.char='#'), data.table(a=1L, b=2L)) +test(2341.3, fread('a,b#line-trailing comment and no whitespace +1,2', comment.char='#'), data.table(a=1L, b=2L)) +test(2341.4, fread('a,b +1,2 #trailing after numeric', comment.char='#'), data.table(a=1L, b=2L)) +test(2341.5, fread('a +"#quotes#"', comment.char="#"), data.table(a="#quotes#")) From a9851ef36b7bb440e693d5fb93f2c2a2460d5d05 Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger Date: Sat, 18 Oct 2025 18:57:59 +0200 Subject: [PATCH 05/32] change wording for error --- R/fread.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/fread.R b/R/fread.R index 583fdb5163..16a72ed24d 100644 --- a/R/fread.R +++ b/R/fread.R @@ -31,7 +31,7 @@ yaml=FALSE, tmpdir=tempdir(), tz="UTC") is.numeric(nrows), length(nrows)==1L ) if (!is.character(comment.char) || length(comment.char) != 1L || is.na(comment.char) || nchar(comment.char) > 1L) { - stopf("comment.char= must be a single non-NA character string.") + stopf("comment.char= must be a single non-NA character.") } fill = if(identical(fill, Inf)) .Machine$integer.max else as.integer(fill) nrows=as.double(nrows) #4686 From 9b0575917eff059a53e0d7a565a11e675c809864 Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger Date: Sat, 18 Oct 2025 18:59:55 +0200 Subject: [PATCH 06/32] remove unreachable code --- src/freadR.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/src/freadR.c b/src/freadR.c index 7e1af2b22b..d586aaed0a 100644 --- a/src/freadR.c +++ b/src/freadR.c @@ -159,13 +159,8 @@ SEXP freadR( // here we use bool and rely on fread at R level to check these do not contain NA_LOGICAL args.stripWhite = LOGICAL(stripWhiteArg)[0]; args.skipEmptyLines = LOGICAL(skipEmptyLinesArg)[0]; - if (!isString(commentCharArg) || LENGTH(commentCharArg) != 1) - error(_("comment.char must be a single character vector of length 1")); // # notranslate const char *commentStr = CHAR(STRING_ELT(commentCharArg, 0)); - size_t commentLen = strlen(commentStr); - if (commentLen > 1) - error(_("comment.char must be a single character or \"\"")); // # notranslate - args.comment = commentLen == 0 ? '\0' : commentStr[0]; + args.comment = strlen(commentStr) == 0 ? '\0' : commentStr[0]; args.fill = INTEGER(fillArg)[0]; args.showProgress = LOGICAL(showProgressArg)[0]; if (INTEGER(nThreadArg)[0] < 1) From 86b3b1487eabaab243bdd257e15f24d035f8bebe Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger Date: Sat, 18 Oct 2025 19:00:24 +0200 Subject: [PATCH 07/32] add helper function --- src/fread.c | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/src/fread.c b/src/fread.c index 9fc06ae028..ef0d7e81b7 100644 --- a/src/fread.c +++ b/src/fread.c @@ -267,6 +267,15 @@ static inline void skip_white(const char **pch) *pch = ch; } +static inline const char *skip_to_comment_or_nonwhite(const char *ch) +{ + while (ch < eof && (*ch == ' ' || *ch == '\t' || *ch == '\0')) { + if (commentChar && *ch == commentChar) break; + ch++; + } + return ch; +} + /** * eol() accepts a position and, if any of the following line endings, moves to the end of that sequence * and returns true. Repeated \\r around \n are considered one. At most one \\n will be moved over, though. @@ -2238,8 +2247,7 @@ int freadMain(freadMainArgs _args) ((lenOff**) fctx.targets)[8]++; if (commentChar) { // skip leading whitespace to detect inline comment marker in header row - const char *commentPos = ch; - while (commentPos < eof && (*commentPos == ' ' || *commentPos == '\t' || *commentPos == '\0')) commentPos++; + const char *commentPos = skip_to_comment_or_nonwhite(ch); if (commentPos < eof && *commentPos == commentChar) { ch = commentPos; while (ch < eof && *ch != '\n' && *ch != '\r') ch++; @@ -2254,8 +2262,7 @@ int freadMain(freadMainArgs _args) } if (commentChar) { // fast-trim trailing comment text after the header names - const char *commentPos = ch; - while (commentPos < eof && (*commentPos == ' ' || *commentPos == '\t' || *commentPos == '\0')) commentPos++; + const char *commentPos = skip_to_comment_or_nonwhite(ch); if (commentPos < eof && *commentPos == commentChar) { ch = commentPos; while (ch < eof && *ch != '\n' && *ch != '\r') ch++; @@ -2504,8 +2511,7 @@ int freadMain(freadMainArgs _args) if (commentChar) { // treat lines whose first non-space character is the comment marker as empty - const char *afterWhite = tLineStart; - while (afterWhite < eof && (*afterWhite == ' ' || *afterWhite == '\t' || *afterWhite == '\0')) afterWhite++; + const char *afterWhite = skip_to_comment_or_nonwhite(tLineStart); if (afterWhite < eof && *afterWhite == commentChar) { const char *skip = afterWhite; while (skip < eof && *skip != '\n' && *skip != '\r') skip++; @@ -2659,8 +2665,7 @@ int freadMain(freadMainArgs _args) if (thisSize) ((char**) targets)[size[j]] += size[j]; // 'if' to avoid undefined NULL+=0 when rereading j++; if (commentChar) { - const char *commentPtr = tch; - while (commentPtr < eof && (*commentPtr == ' ' || *commentPtr == '\t' || *commentPtr == '\0')) commentPtr++; + const char *commentPtr = skip_to_comment_or_nonwhite(tch); if (commentPtr < eof && *commentPtr == commentChar) { tch = commentPtr; while (tch < eof && *tch != '\n' && *tch != '\r') tch++; From 5024949ca93a4e56d6fc9512d10de49ac4703052 Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger Date: Sat, 18 Oct 2025 19:05:51 +0200 Subject: [PATCH 08/32] extend tests --- inst/tests/tests.Rraw | 39 ++++++++++++++++++++++++++++++++++----- 1 file changed, 34 insertions(+), 5 deletions(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 3a18017edd..bf6f64d0b5 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -21687,16 +21687,45 @@ setDT(d3) test(2340.06, .selfref.ok(d3), TRUE) # implement comment.char argument in fread, #856 -test(2341.1, fread('a,b +test(2341.01, fread('a,b #a comment 1,2 #another comment 3,4', comment.char='#'), data.table(a=c(1L,3L), b=c(2L,4L))) -test(2341.2, fread('a,b #line-trailing comment +test(2341.02, fread('a,b #line-trailing comment 1,2', comment.char='#'), data.table(a=1L, b=2L)) -test(2341.3, fread('a,b#line-trailing comment and no whitespace +test(2341.03, fread('a,b#line-trailing comment and no whitespace 1,2', comment.char='#'), data.table(a=1L, b=2L)) -test(2341.4, fread('a,b +test(2341.04, fread('a,b 1,2 #trailing after numeric', comment.char='#'), data.table(a=1L, b=2L)) -test(2341.5, fread('a +test(2341.05, fread('a "#quotes#"', comment.char="#"), data.table(a="#quotes#")) +test(2341.06, fread('# multi line +# comment +1,2 +# multi line +# comment +3,4 +# trailing comment', comment.char='#'), data.table(V1=c(1L,3L), V2=c(2L,4L))) +test(2341.07, fread('id;value +1;2,5! trailing comment +2;NA +!final comment', sep=';', dec=',', na.strings='NA', comment.char='!'), data.table(id=1:2, value=c(2.5, NA_real_))) +test(2341.08, fread('meta line +DATA STARTS +x,y +# skip this +1,2', header=TRUE, skip="DATA", comment.char='#'), data.table(x=1L, y=2L)) +test(2341.09, fread('value + inline comment +5', comment.char=' '), data.table(value=5L)) +test(2341.10, fread('a,b +1,2" trailing" +"comment line" +3,4', comment.char='"', quote=""), data.table(a=c(1L,3L), b=c(2L,4L))) +test(2341.11, fread('a,b +## multichar commentchar +1,2', comment.char = '##'), error = "comment.char= must be a single non-NA character") +test(2341.12, fread('a,b +NA,NA +1,2', comment.char = NA), error = "comment.char= must be a single non-NA character") From 9d18827edd8fa50204df3f85091aa2eb7185026d Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger Date: Sat, 18 Oct 2025 20:49:37 +0200 Subject: [PATCH 09/32] update tests --- inst/tests/tests.Rraw | 35 ++++++++++++++++++++++++++++++++--- 1 file changed, 32 insertions(+), 3 deletions(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index bf6f64d0b5..c48b87b1b8 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -21698,8 +21698,10 @@ test(2341.03, fread('a,b#line-trailing comment and no whitespace 1,2', comment.char='#'), data.table(a=1L, b=2L)) test(2341.04, fread('a,b 1,2 #trailing after numeric', comment.char='#'), data.table(a=1L, b=2L)) +# comment char inside quotes test(2341.05, fread('a "#quotes#"', comment.char="#"), data.table(a="#quotes#")) +# multi line comments test(2341.06, fread('# multi line # comment 1,2 @@ -21711,21 +21713,48 @@ test(2341.07, fread('id;value 1;2,5! trailing comment 2;NA !final comment', sep=';', dec=',', na.strings='NA', comment.char='!'), data.table(id=1:2, value=c(2.5, NA_real_))) +# skip test(2341.08, fread('meta line DATA STARTS x,y # skip this -1,2', header=TRUE, skip="DATA", comment.char='#'), data.table(x=1L, y=2L)) -test(2341.09, fread('value +1,2', skip="DATA", header=TRUE, comment.char='#'), data.table(x=1L, y=2L)) +# weird comment chars like space or quote +test(2341.09, fread('a inline comment -5', comment.char=' '), data.table(value=5L)) +1', comment.char=' '), data.table(a=1L)) test(2341.10, fread('a,b 1,2" trailing" "comment line" 3,4', comment.char='"', quote=""), data.table(a=c(1L,3L), b=c(2L,4L))) +# invalid comment chars test(2341.11, fread('a,b ## multichar commentchar 1,2', comment.char = '##'), error = "comment.char= must be a single non-NA character") test(2341.12, fread('a,b NA,NA 1,2', comment.char = NA), error = "comment.char= must be a single non-NA character") +# CLRF +test(2341.13, fread('a,b\r\n# cmt\r\n1,2\r\n3,4\r\n', comment.char='#'), data.table(a=c(1L,3L), b=c(2L,4L))) +# header comment +test(2341.14, fread('# hdr cmt +x,y +1,2', header=TRUE, comment.char='#'), data.table(x=1L, y=2L)) +# nrow not counting comments +test(2341.15, fread('a,b +1,2 +# cmt +3,4 +5,6', nrows=2, comment.char='#'), data.table(a=c(1L,3L), b=c(2L,4L))) +# sep and comment char same +test(2341.19, fread('a#b +1#2 +# only comment', sep="#", comment.char="#"), data.table(a=1L)) +# na.strings +test(2341.20, fread('v +#NA +1 +# comment', na.strings="#NA", comment.char='#'), data.table(v=1L)) +test(2341.29, fread('a,b +"p#q",2 # tail +"r#s",3', comment.char='#'), data.table(a=c("p#q","r#s"), b=c(2L,3L))) From efd87973869bb8821ff2475f5ec54982294d9933 Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger Date: Sat, 18 Oct 2025 20:49:58 +0200 Subject: [PATCH 10/32] use skip_line helper --- src/fread.c | 87 ++++++++++++++++++++++++++++++++++++----------------- 1 file changed, 59 insertions(+), 28 deletions(-) diff --git a/src/fread.c b/src/fread.c index ef0d7e81b7..19b2a793c0 100644 --- a/src/fread.c +++ b/src/fread.c @@ -301,6 +301,15 @@ static inline bool eol(const char **pch) return eol_one_r && **pch == '\r'; } + +static inline const char *skip_line(const char *ch, const char *eof) { + while (ch < eof && *ch != '\n' && *ch != '\r') + ch++; + if (ch < eof && eol(&ch)) ch++; + return ch; +} + + /** * Return True iff `ch` is a valid field terminator character: either a field * separator or a newline. @@ -352,15 +361,12 @@ static inline int countfields(const char **pch) if (sep == ' ') while (*ch == ' ') ch++; // multiple sep==' ' at the start does not mean sep skip_white(&ch); if (commentChar && *ch == commentChar) { - while (ch < eof && *ch != '\n' && *ch != '\r') ch++; - if (ch < eof) { - if (*ch == '\r' || *ch == '\n') { - eol(&ch); - if (ch < eof) ch++; - } + const char *next = skip_line(ch, eof); + if (next < eof) { + ch = next; continue; // rescan next line } - *pch = ch; + *pch = next; return 0; } break; @@ -378,13 +384,7 @@ static inline int countfields(const char **pch) while (ch < eof) { Field(&ctx); if (commentChar && *ch == commentChar) { - while (ch < eof && *ch != '\n' && *ch != '\r') ch++; - if (ch < eof) { - if (*ch == '\r' || *ch == '\n') { - eol(&ch); - if (ch < eof) ch++; - } - } + ch = skip_line(ch, eof); *pch = ch; return ncol; } @@ -416,10 +416,8 @@ static inline const char *nextGoodLine(const char *ch, int ncol) // If this doesn't return the true line start, no matter. The previous thread will run-on and // resolve it. A good guess is all we need here. Being wrong will just be a bit slower. // If there are no embedded newlines, all newlines are true, and this guess will never be wrong. - while (*ch != '\n' && *ch != '\r' && (*ch != '\0' || ch < eof)) ch++; + ch = skip_line(ch, eof); if (ch == eof) return eof; - if (eol(&ch)) // move to last byte of the line ending sequence (e.g. \r\r\n would be +2). - ch++; // and then move to first byte of next line const char *simpleNext = ch; // simply the first newline after the jump // if a better one can't be found, return this one (simpleNext). This will be the case when // fill=TRUE and the jump lands before 5 too-short lines, for example. @@ -427,8 +425,7 @@ static inline const char *nextGoodLine(const char *ch, int ncol) for (int attempts = 0; attempts < 5 && ch < eof; attempts++) { const char *ch2 = ch; if (countfields(&ch2) == ncol) return ch; // returns simpleNext here on first attempt, almost all the time - while (*ch != '\n' && *ch != '\r' && (*ch != '\0' || ch < eof)) ch++; - if (eol(&ch)) ch++; + ch = skip_line(ch, eof); } return simpleNext; } @@ -2021,6 +2018,21 @@ int freadMain(freadMainArgs _args) for (int jump = 0; jump < nJumps; jump++) { if (jump == 0) { ch = pos; + // Skip leading comment lines before processing header + if (commentChar) { + while (ch < eof) { + const char *lineStart = ch; + ch = skip_to_comment_or_nonwhite(ch); + if (ch < eof && *ch == commentChar) { + ch = skip_line(ch, eof); + row1line++; + continue; + } + ch = lineStart; + break; + } + pos = ch; + } if (args.header != false) { countfields(&ch); // skip first row for type guessing as it's probably column names row1line++; @@ -2225,7 +2237,17 @@ int freadMain(freadMainArgs _args) if (verbose) DTPRINT(_("[08] Assign column names\n")); ch = pos; // back to start of first row (column names if header==true) - + // Skip leading comment lines before parsing header + if (args.header != false && commentChar) { + while (ch < eof) { + ch = skip_to_comment_or_nonwhite(ch); + if (ch < eof && *ch == commentChar) { + ch = skip_line(ch, eof); + } else break; + } + pos = ch; + colNamesAnchor = pos; + } if (args.header == false) { colNames = NULL; // userOverride will assign V1, V2, etc } else { @@ -2249,8 +2271,7 @@ int freadMain(freadMainArgs _args) // skip leading whitespace to detect inline comment marker in header row const char *commentPos = skip_to_comment_or_nonwhite(ch); if (commentPos < eof && *commentPos == commentChar) { - ch = commentPos; - while (ch < eof && *ch != '\n' && *ch != '\r') ch++; + ch = skip_line(commentPos, eof); break; // stop header parsing after comment } } @@ -2264,13 +2285,23 @@ int freadMain(freadMainArgs _args) // fast-trim trailing comment text after the header names const char *commentPos = skip_to_comment_or_nonwhite(ch); if (commentPos < eof && *commentPos == commentChar) { - ch = commentPos; - while (ch < eof && *ch != '\n' && *ch != '\r') ch++; + ch = skip_line(commentPos, eof); + } + } + if (ch == eof || *ch == '\0') { + pos = ch; + } else if (*ch == '\n' || *ch == '\r') { + if (eol(&ch)) { + if (ch < eof) ch++; + pos = ch; + } else { + INTERNAL_STOP("reading colnames ending on '%c'", *ch); // # nocov } + } else if (ch > sof && (ch[-1] == '\n' || ch[-1] == '\r')) { + pos = ch; + } else { + INTERNAL_STOP("reading colnames ending on '%c'", *ch); // # nocov } - if (eol(&ch)) pos = ++ch; - else if (*ch == '\0') pos = ch; - else INTERNAL_STOP("reading colnames ending on '%c'", *ch); // # nocov // now on first data row (row after column names) // when fill=TRUE and column names shorter (test 1635.2), leave calloc initialized lenOff.len==0 } @@ -2882,7 +2913,7 @@ int freadMain(freadMainArgs _args) } else { const char *skippedFooter = ENC2NATIVE(ch); // detect if it's a single line footer. Commonly the row count from SQL queries. - while (ch < eof && *ch != '\n' && *ch != '\r') ch++; + ch = skip_line(ch, eof); while (ch < eof && isspace(*ch)) ch++; if (ch == eof) { DTWARN(_("Discarded single-line footer: <<%s>>"), strlim(skippedFooter, (char[500]) {0}, 500)); From 87ba1511fab1f2d9048ba80b8a5092feaa654ce6 Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger Date: Sat, 18 Oct 2025 20:59:24 +0200 Subject: [PATCH 11/32] add comments for helpers --- src/fread.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/src/fread.c b/src/fread.c index 19b2a793c0..0560de7510 100644 --- a/src/fread.c +++ b/src/fread.c @@ -267,10 +267,14 @@ static inline void skip_white(const char **pch) *pch = ch; } +/** + * Advance `ch` past spaces/NULs until we hit a comment marker or the first + * non-whitespace character. Leaves `ch` unchanged if already on content. + */ static inline const char *skip_to_comment_or_nonwhite(const char *ch) { while (ch < eof && (*ch == ' ' || *ch == '\t' || *ch == '\0')) { - if (commentChar && *ch == commentChar) break; + if (commentChar && *ch == commentChar) break; // comment char might be space or tab ch++; } return ch; @@ -301,7 +305,10 @@ static inline bool eol(const char **pch) return eol_one_r && **pch == '\r'; } - +/** + * Walk to the start of the next line (or `eof` if none) by skipping the + * current line's contents and its newline sequence. + */ static inline const char *skip_line(const char *ch, const char *eof) { while (ch < eof && *ch != '\n' && *ch != '\r') ch++; @@ -309,7 +316,6 @@ static inline const char *skip_line(const char *ch, const char *eof) { return ch; } - /** * Return True iff `ch` is a valid field terminator character: either a field * separator or a newline. From b457e4a4d92727728cf43a11ff7423116ff04034 Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger Date: Sat, 18 Oct 2025 21:02:45 +0200 Subject: [PATCH 12/32] fix test numbering --- inst/tests/tests.Rraw | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index c48b87b1b8..39015694ed 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -21747,14 +21747,14 @@ test(2341.15, fread('a,b 3,4 5,6', nrows=2, comment.char='#'), data.table(a=c(1L,3L), b=c(2L,4L))) # sep and comment char same -test(2341.19, fread('a#b +test(2341.16, fread('a#b 1#2 # only comment', sep="#", comment.char="#"), data.table(a=1L)) # na.strings -test(2341.20, fread('v +test(2341.17, fread('v #NA 1 # comment', na.strings="#NA", comment.char='#'), data.table(v=1L)) -test(2341.29, fread('a,b +test(2341.18, fread('a,b "p#q",2 # tail "r#s",3', comment.char='#'), data.table(a=c("p#q","r#s"), b=c(2L,3L))) From 09b493289adae13b04ebeba81df96537ea813068 Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger Date: Sat, 18 Oct 2025 22:34:22 +0200 Subject: [PATCH 13/32] add more tests --- inst/tests/tests.Rraw | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 39015694ed..4d49addb71 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -21758,3 +21758,10 @@ test(2341.17, fread('v test(2341.18, fread('a,b "p#q",2 # tail "r#s",3', comment.char='#'), data.table(a=c("p#q","r#s"), b=c(2L,3L))) +test(2341.19, fread(' # lead comment with padding +\t# and tab +a,b +1,2', comment.char='#'), data.table(a=1L, b=2L)) +test(2341.20, fread('a,b # header comment with padding +1,2 +3,4', comment.char='#'), data.table(a=c(1L,3L), b=c(2L,4L))) From a0a9525cd4b862d08d4eba9942086a2fbc2a101b Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger Date: Sat, 18 Oct 2025 22:56:13 +0200 Subject: [PATCH 14/32] simplify read --- src/fread.c | 35 ++++++++++++++++------------------- 1 file changed, 16 insertions(+), 19 deletions(-) diff --git a/src/fread.c b/src/fread.c index 0560de7510..7ecddb67b0 100644 --- a/src/fread.c +++ b/src/fread.c @@ -2287,26 +2287,27 @@ int freadMain(freadMainArgs _args) if (ch[1] == '\r' || ch[1] == '\n' || ch[1] == '\0') { ch++; break; } } } + const char *lineEnd = ch; + const char *nextLine = NULL; if (commentChar) { // fast-trim trailing comment text after the header names - const char *commentPos = skip_to_comment_or_nonwhite(ch); + const char *commentPos = skip_to_comment_or_nonwhite(lineEnd); if (commentPos < eof && *commentPos == commentChar) { - ch = skip_line(commentPos, eof); + lineEnd = commentPos; + nextLine = skip_line(commentPos, eof); } } - if (ch == eof || *ch == '\0') { - pos = ch; - } else if (*ch == '\n' || *ch == '\r') { - if (eol(&ch)) { - if (ch < eof) ch++; - pos = ch; + if (nextLine) { + pos = nextLine; + } else if (lineEnd == eof || *lineEnd == '\0') { + pos = lineEnd; + } else { + const char *tmp = lineEnd; + if (eol(&tmp)) { + pos = (tmp < eof) ? tmp + 1 : tmp; } else { - INTERNAL_STOP("reading colnames ending on '%c'", *ch); // # nocov + INTERNAL_STOP("reading colnames ending on '%c'", *lineEnd); // # nocov } - } else if (ch > sof && (ch[-1] == '\n' || ch[-1] == '\r')) { - pos = ch; - } else { - INTERNAL_STOP("reading colnames ending on '%c'", *ch); // # nocov } // now on first data row (row after column names) // when fill=TRUE and column names shorter (test 1635.2), leave calloc initialized lenOff.len==0 @@ -2550,10 +2551,7 @@ int freadMain(freadMainArgs _args) // treat lines whose first non-space character is the comment marker as empty const char *afterWhite = skip_to_comment_or_nonwhite(tLineStart); if (afterWhite < eof && *afterWhite == commentChar) { - const char *skip = afterWhite; - while (skip < eof && *skip != '\n' && *skip != '\r') skip++; - if (skip < eof && eol(&skip)) skip++; - tch = skip; + tch = skip_line(afterWhite, eof); continue; } } @@ -2704,8 +2702,7 @@ int freadMain(freadMainArgs _args) if (commentChar) { const char *commentPtr = skip_to_comment_or_nonwhite(tch); if (commentPtr < eof && *commentPtr == commentChar) { - tch = commentPtr; - while (tch < eof && *tch != '\n' && *tch != '\r') tch++; + tch = skip_line(commentPtr, eof); break; } } From 79508a3a0b890a57efec0f87ba669ec744c10a95 Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger Date: Sat, 18 Oct 2025 23:16:31 +0200 Subject: [PATCH 15/32] Revert "simplify read" This reverts commit a0a9525cd4b862d08d4eba9942086a2fbc2a101b. --- src/fread.c | 35 +++++++++++++++++++---------------- 1 file changed, 19 insertions(+), 16 deletions(-) diff --git a/src/fread.c b/src/fread.c index 7ecddb67b0..0560de7510 100644 --- a/src/fread.c +++ b/src/fread.c @@ -2287,27 +2287,26 @@ int freadMain(freadMainArgs _args) if (ch[1] == '\r' || ch[1] == '\n' || ch[1] == '\0') { ch++; break; } } } - const char *lineEnd = ch; - const char *nextLine = NULL; if (commentChar) { // fast-trim trailing comment text after the header names - const char *commentPos = skip_to_comment_or_nonwhite(lineEnd); + const char *commentPos = skip_to_comment_or_nonwhite(ch); if (commentPos < eof && *commentPos == commentChar) { - lineEnd = commentPos; - nextLine = skip_line(commentPos, eof); + ch = skip_line(commentPos, eof); } } - if (nextLine) { - pos = nextLine; - } else if (lineEnd == eof || *lineEnd == '\0') { - pos = lineEnd; - } else { - const char *tmp = lineEnd; - if (eol(&tmp)) { - pos = (tmp < eof) ? tmp + 1 : tmp; + if (ch == eof || *ch == '\0') { + pos = ch; + } else if (*ch == '\n' || *ch == '\r') { + if (eol(&ch)) { + if (ch < eof) ch++; + pos = ch; } else { - INTERNAL_STOP("reading colnames ending on '%c'", *lineEnd); // # nocov + INTERNAL_STOP("reading colnames ending on '%c'", *ch); // # nocov } + } else if (ch > sof && (ch[-1] == '\n' || ch[-1] == '\r')) { + pos = ch; + } else { + INTERNAL_STOP("reading colnames ending on '%c'", *ch); // # nocov } // now on first data row (row after column names) // when fill=TRUE and column names shorter (test 1635.2), leave calloc initialized lenOff.len==0 @@ -2551,7 +2550,10 @@ int freadMain(freadMainArgs _args) // treat lines whose first non-space character is the comment marker as empty const char *afterWhite = skip_to_comment_or_nonwhite(tLineStart); if (afterWhite < eof && *afterWhite == commentChar) { - tch = skip_line(afterWhite, eof); + const char *skip = afterWhite; + while (skip < eof && *skip != '\n' && *skip != '\r') skip++; + if (skip < eof && eol(&skip)) skip++; + tch = skip; continue; } } @@ -2702,7 +2704,8 @@ int freadMain(freadMainArgs _args) if (commentChar) { const char *commentPtr = skip_to_comment_or_nonwhite(tch); if (commentPtr < eof && *commentPtr == commentChar) { - tch = skip_line(commentPtr, eof); + tch = commentPtr; + while (tch < eof && *tch != '\n' && *tch != '\r') tch++; break; } } From e5bbe96f5f47f76ffae0de792ec2f8f760f9b4cd Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger Date: Sat, 18 Oct 2025 23:51:49 +0200 Subject: [PATCH 16/32] separate helpers --- src/fread.c | 49 ++++++++++++++++++++++++++++++------------------- 1 file changed, 30 insertions(+), 19 deletions(-) diff --git a/src/fread.c b/src/fread.c index 0560de7510..0ad8b9f48a 100644 --- a/src/fread.c +++ b/src/fread.c @@ -306,16 +306,31 @@ static inline bool eol(const char **pch) } /** - * Walk to the start of the next line (or `eof` if none) by skipping the - * current line's contents and its newline sequence. + * Walk to the newline sequence terminating the current line and return the pointer to its final + * character (or `eof` if none exists). */ -static inline const char *skip_line(const char *ch, const char *eof) { +static inline const char *skip_to_eol(const char *ch, const char *eof) +{ while (ch < eof && *ch != '\n' && *ch != '\r') ch++; - if (ch < eof && eol(&ch)) ch++; + if (ch < eof) { + const char *tmp = ch; + if (eol(&tmp)) ch = tmp; + } return ch; } +/** + * Skip past the current line (including its newline sequence) and return the first character of the + * next line, or `eof` if none exists. + */ +static inline const char *skip_to_nextline(const char *ch, const char *eof) +{ + const char *lineEnd = skip_to_eol(ch, eof); + if (lineEnd < eof) lineEnd++; + return lineEnd; +} + /** * Return True iff `ch` is a valid field terminator character: either a field * separator or a newline. @@ -367,7 +382,7 @@ static inline int countfields(const char **pch) if (sep == ' ') while (*ch == ' ') ch++; // multiple sep==' ' at the start does not mean sep skip_white(&ch); if (commentChar && *ch == commentChar) { - const char *next = skip_line(ch, eof); + const char *next = skip_to_nextline(ch, eof); if (next < eof) { ch = next; continue; // rescan next line @@ -390,7 +405,7 @@ static inline int countfields(const char **pch) while (ch < eof) { Field(&ctx); if (commentChar && *ch == commentChar) { - ch = skip_line(ch, eof); + ch = skip_to_nextline(ch, eof); *pch = ch; return ncol; } @@ -422,7 +437,7 @@ static inline const char *nextGoodLine(const char *ch, int ncol) // If this doesn't return the true line start, no matter. The previous thread will run-on and // resolve it. A good guess is all we need here. Being wrong will just be a bit slower. // If there are no embedded newlines, all newlines are true, and this guess will never be wrong. - ch = skip_line(ch, eof); + ch = skip_to_nextline(ch, eof); if (ch == eof) return eof; const char *simpleNext = ch; // simply the first newline after the jump // if a better one can't be found, return this one (simpleNext). This will be the case when @@ -431,7 +446,7 @@ static inline const char *nextGoodLine(const char *ch, int ncol) for (int attempts = 0; attempts < 5 && ch < eof; attempts++) { const char *ch2 = ch; if (countfields(&ch2) == ncol) return ch; // returns simpleNext here on first attempt, almost all the time - ch = skip_line(ch, eof); + ch = skip_to_nextline(ch, eof); } return simpleNext; } @@ -2030,7 +2045,7 @@ int freadMain(freadMainArgs _args) const char *lineStart = ch; ch = skip_to_comment_or_nonwhite(ch); if (ch < eof && *ch == commentChar) { - ch = skip_line(ch, eof); + ch = skip_to_nextline(ch, eof); row1line++; continue; } @@ -2248,7 +2263,7 @@ int freadMain(freadMainArgs _args) while (ch < eof) { ch = skip_to_comment_or_nonwhite(ch); if (ch < eof && *ch == commentChar) { - ch = skip_line(ch, eof); + ch = skip_to_nextline(ch, eof); } else break; } pos = ch; @@ -2277,7 +2292,7 @@ int freadMain(freadMainArgs _args) // skip leading whitespace to detect inline comment marker in header row const char *commentPos = skip_to_comment_or_nonwhite(ch); if (commentPos < eof && *commentPos == commentChar) { - ch = skip_line(commentPos, eof); + ch = skip_to_eol(commentPos, eof); break; // stop header parsing after comment } } @@ -2291,7 +2306,7 @@ int freadMain(freadMainArgs _args) // fast-trim trailing comment text after the header names const char *commentPos = skip_to_comment_or_nonwhite(ch); if (commentPos < eof && *commentPos == commentChar) { - ch = skip_line(commentPos, eof); + ch = skip_to_eol(commentPos, eof); } } if (ch == eof || *ch == '\0') { @@ -2550,10 +2565,7 @@ int freadMain(freadMainArgs _args) // treat lines whose first non-space character is the comment marker as empty const char *afterWhite = skip_to_comment_or_nonwhite(tLineStart); if (afterWhite < eof && *afterWhite == commentChar) { - const char *skip = afterWhite; - while (skip < eof && *skip != '\n' && *skip != '\r') skip++; - if (skip < eof && eol(&skip)) skip++; - tch = skip; + tch = skip_to_nextline(afterWhite, eof); continue; } } @@ -2704,8 +2716,7 @@ int freadMain(freadMainArgs _args) if (commentChar) { const char *commentPtr = skip_to_comment_or_nonwhite(tch); if (commentPtr < eof && *commentPtr == commentChar) { - tch = commentPtr; - while (tch < eof && *tch != '\n' && *tch != '\r') tch++; + tch = skip_to_eol(commentPtr, eof); break; } } @@ -2919,7 +2930,7 @@ int freadMain(freadMainArgs _args) } else { const char *skippedFooter = ENC2NATIVE(ch); // detect if it's a single line footer. Commonly the row count from SQL queries. - ch = skip_line(ch, eof); + ch = skip_to_nextline(ch, eof); while (ch < eof && isspace(*ch)) ch++; if (ch == eof) { DTWARN(_("Discarded single-line footer: <<%s>>"), strlim(skippedFooter, (char[500]) {0}, 500)); From cc57d2798f3900148e6d1bac2847453661afcf81 Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger Date: Sat, 18 Oct 2025 23:59:38 +0200 Subject: [PATCH 17/32] add comments --- src/fread.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/fread.c b/src/fread.c index 0ad8b9f48a..c9c35c9da1 100644 --- a/src/fread.c +++ b/src/fread.c @@ -2309,6 +2309,7 @@ int freadMain(freadMainArgs _args) ch = skip_to_eol(commentPos, eof); } } + // consider different cases line ending after column names if (ch == eof || *ch == '\0') { pos = ch; } else if (*ch == '\n' || *ch == '\r') { @@ -2318,7 +2319,7 @@ int freadMain(freadMainArgs _args) } else { INTERNAL_STOP("reading colnames ending on '%c'", *ch); // # nocov } - } else if (ch > sof && (ch[-1] == '\n' || ch[-1] == '\r')) { + } else if (ch > sof && (ch[-1] == '\n' || ch[-1] == '\r')) { // trimmed a comment and now on next rows first byte pos = ch; } else { INTERNAL_STOP("reading colnames ending on '%c'", *ch); // # nocov From 3e735650d0c81a4ca1a961545a7ef088ce8f0ade Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger Date: Sun, 19 Oct 2025 00:28:14 +0200 Subject: [PATCH 18/32] add coverage --- inst/tests/tests.Rraw | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 4d49addb71..775be5bd3f 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -21765,3 +21765,8 @@ a,b test(2341.20, fread('a,b # header comment with padding 1,2 3,4', comment.char='#'), data.table(a=c(1L,3L), b=c(2L,4L))) +test(2341.21, fread('# meta1 +# meta2 +a,b +1,2', comment.char = '#'), data.table(a=1L, b=2L)) +test(2341.22, fread('a,b # inline header comment\r\n1,2\r\n', comment.char = '#'), data.table(a=1L, b=2L)) From e77137e5a11f47b76b610b41f46ccaed7298e30e Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger Date: Sun, 19 Oct 2025 14:51:55 +0200 Subject: [PATCH 19/32] simplify header handling --- src/fread.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/src/fread.c b/src/fread.c index c9c35c9da1..00eb1a4ca9 100644 --- a/src/fread.c +++ b/src/fread.c @@ -2312,17 +2312,16 @@ int freadMain(freadMainArgs _args) // consider different cases line ending after column names if (ch == eof || *ch == '\0') { pos = ch; - } else if (*ch == '\n' || *ch == '\r') { - if (eol(&ch)) { - if (ch < eof) ch++; + } else { + const char *lineEnd = ch; + if (eol(&lineEnd)) { + if (lineEnd < eof) lineEnd++; + pos = lineEnd; + } else if (ch > sof && (ch[-1] == '\n' || ch[-1] == '\r')) { // trimmed a comment and now on next row's first byte pos = ch; } else { INTERNAL_STOP("reading colnames ending on '%c'", *ch); // # nocov } - } else if (ch > sof && (ch[-1] == '\n' || ch[-1] == '\r')) { // trimmed a comment and now on next rows first byte - pos = ch; - } else { - INTERNAL_STOP("reading colnames ending on '%c'", *ch); // # nocov } // now on first data row (row after column names) // when fill=TRUE and column names shorter (test 1635.2), leave calloc initialized lenOff.len==0 From 09f9359245d571bacb0eba30383c4e8eb987c00a Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger Date: Sun, 19 Oct 2025 16:35:35 +0200 Subject: [PATCH 20/32] increase coverage --- inst/tests/tests.Rraw | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 775be5bd3f..f8910c94b9 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -21762,11 +21762,10 @@ test(2341.19, fread(' # lead comment with padding \t# and tab a,b 1,2', comment.char='#'), data.table(a=1L, b=2L)) -test(2341.20, fread('a,b # header comment with padding +test(2341.20, fread('a,b # header cmt with padding # second cmt 1,2 3,4', comment.char='#'), data.table(a=c(1L,3L), b=c(2L,4L))) -test(2341.21, fread('# meta1 -# meta2 +test(2341.21, fread('# meta1 # meta2 a,b 1,2', comment.char = '#'), data.table(a=1L, b=2L)) test(2341.22, fread('a,b # inline header comment\r\n1,2\r\n', comment.char = '#'), data.table(a=1L, b=2L)) From 29bdb3df6b0bcbfd8c40a946206daf3fe479888b Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger Date: Sun, 19 Oct 2025 18:47:53 +0200 Subject: [PATCH 21/32] simplify code --- src/fread.c | 36 ++++-------------------------------- 1 file changed, 4 insertions(+), 32 deletions(-) diff --git a/src/fread.c b/src/fread.c index 00eb1a4ca9..3c4d19c855 100644 --- a/src/fread.c +++ b/src/fread.c @@ -2258,17 +2258,7 @@ int freadMain(freadMainArgs _args) if (verbose) DTPRINT(_("[08] Assign column names\n")); ch = pos; // back to start of first row (column names if header==true) - // Skip leading comment lines before parsing header - if (args.header != false && commentChar) { - while (ch < eof) { - ch = skip_to_comment_or_nonwhite(ch); - if (ch < eof && *ch == commentChar) { - ch = skip_to_nextline(ch, eof); - } else break; - } - pos = ch; - colNamesAnchor = pos; - } + if (args.header == false) { colNames = NULL; // userOverride will assign V1, V2, etc } else { @@ -2302,27 +2292,9 @@ int freadMain(freadMainArgs _args) if (ch[1] == '\r' || ch[1] == '\n' || ch[1] == '\0') { ch++; break; } } } - if (commentChar) { - // fast-trim trailing comment text after the header names - const char *commentPos = skip_to_comment_or_nonwhite(ch); - if (commentPos < eof && *commentPos == commentChar) { - ch = skip_to_eol(commentPos, eof); - } - } - // consider different cases line ending after column names - if (ch == eof || *ch == '\0') { - pos = ch; - } else { - const char *lineEnd = ch; - if (eol(&lineEnd)) { - if (lineEnd < eof) lineEnd++; - pos = lineEnd; - } else if (ch > sof && (ch[-1] == '\n' || ch[-1] == '\r')) { // trimmed a comment and now on next row's first byte - pos = ch; - } else { - INTERNAL_STOP("reading colnames ending on '%c'", *ch); // # nocov - } - } + if (eol(&ch)) pos = ++ch; + else if (*ch == '\0') pos = ch; + else INTERNAL_STOP("reading colnames ending on '%c'", *ch); // # nocov // now on first data row (row after column names) // when fill=TRUE and column names shorter (test 1635.2), leave calloc initialized lenOff.len==0 } From a25dd37094cbd189b391a4ae228740ca989ae8e8 Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger Date: Sun, 19 Oct 2025 20:51:45 +0200 Subject: [PATCH 22/32] control skipping white spaces before comments with strip.white --- inst/tests/tests.Rraw | 8 ++++++++ src/fread.c | 12 +++++++++--- 2 files changed, 17 insertions(+), 3 deletions(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index f8910c94b9..8ba4ab6b52 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -21769,3 +21769,11 @@ test(2341.21, fread('# meta1 # meta2 a,b 1,2', comment.char = '#'), data.table(a=1L, b=2L)) test(2341.22, fread('a,b # inline header comment\r\n1,2\r\n', comment.char = '#'), data.table(a=1L, b=2L)) +# control skipping white space before comments with strip.white +test(2341.23, fread('a +b # trailing cmnt +', comment.char = '#', strip.white = FALSE, sep = ","), data.table(a="b ")) +test(2341.24, fread('a + # leading cmnt +b +', comment.char = '#', strip.white = FALSE, sep = ","), data.table(a=c(" ", "b"))) diff --git a/src/fread.c b/src/fread.c index 3c4d19c855..3b70f4321c 100644 --- a/src/fread.c +++ b/src/fread.c @@ -273,9 +273,15 @@ static inline void skip_white(const char **pch) */ static inline const char *skip_to_comment_or_nonwhite(const char *ch) { - while (ch < eof && (*ch == ' ' || *ch == '\t' || *ch == '\0')) { - if (commentChar && *ch == commentChar) break; // comment char might be space or tab - ch++; + while (ch < eof && *ch == '\0') ++ch; + if (!stripWhite) return ch; + + const unsigned char stopSpace = (commentChar == ' '); + const unsigned char stopTab = (commentChar == '\t'); + + while (ch < eof && (*ch == ' ' || *ch == '\t')) { + if ((stopSpace && *ch == ' ') || (stopTab && *ch == '\t')) break; + ++ch; } return ch; } From 01edb9a708496aba5a64052fa81822ff83b00577 Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger Date: Sun, 19 Oct 2025 22:09:31 +0200 Subject: [PATCH 23/32] tighten helper --- src/fread.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/src/fread.c b/src/fread.c index 3b70f4321c..315dfe51e0 100644 --- a/src/fread.c +++ b/src/fread.c @@ -276,11 +276,8 @@ static inline const char *skip_to_comment_or_nonwhite(const char *ch) while (ch < eof && *ch == '\0') ++ch; if (!stripWhite) return ch; - const unsigned char stopSpace = (commentChar == ' '); - const unsigned char stopTab = (commentChar == '\t'); - while (ch < eof && (*ch == ' ' || *ch == '\t')) { - if ((stopSpace && *ch == ' ') || (stopTab && *ch == '\t')) break; + if (*ch == commentChar) break; ++ch; } return ch; From f61989ac33b04cef50d012097b425c65d74af261 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Sun, 19 Oct 2025 17:48:03 -0700 Subject: [PATCH 24/32] try improving readability with blank lines --- inst/tests/tests.Rraw | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 7aa3d2711f..9fc9615763 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -21695,15 +21695,20 @@ test(2341.01, fread('a,b 1,2 #another comment 3,4', comment.char='#'), data.table(a=c(1L,3L), b=c(2L,4L))) + test(2341.02, fread('a,b #line-trailing comment 1,2', comment.char='#'), data.table(a=1L, b=2L)) + test(2341.03, fread('a,b#line-trailing comment and no whitespace 1,2', comment.char='#'), data.table(a=1L, b=2L)) + test(2341.04, fread('a,b 1,2 #trailing after numeric', comment.char='#'), data.table(a=1L, b=2L)) + # comment char inside quotes test(2341.05, fread('a "#quotes#"', comment.char="#"), data.table(a="#quotes#")) + # multi line comments test(2341.06, fread('# multi line # comment @@ -21712,16 +21717,19 @@ test(2341.06, fread('# multi line # comment 3,4 # trailing comment', comment.char='#'), data.table(V1=c(1L,3L), V2=c(2L,4L))) + test(2341.07, fread('id;value 1;2,5! trailing comment 2;NA !final comment', sep=';', dec=',', na.strings='NA', comment.char='!'), data.table(id=1:2, value=c(2.5, NA_real_))) + # skip test(2341.08, fread('meta line DATA STARTS x,y # skip this 1,2', skip="DATA", header=TRUE, comment.char='#'), data.table(x=1L, y=2L)) + # weird comment chars like space or quote test(2341.09, fread('a inline comment @@ -21730,52 +21738,65 @@ test(2341.10, fread('a,b 1,2" trailing" "comment line" 3,4', comment.char='"', quote=""), data.table(a=c(1L,3L), b=c(2L,4L))) + # invalid comment chars test(2341.11, fread('a,b ## multichar commentchar 1,2', comment.char = '##'), error = "comment.char= must be a single non-NA character") + test(2341.12, fread('a,b NA,NA 1,2', comment.char = NA), error = "comment.char= must be a single non-NA character") + # CLRF test(2341.13, fread('a,b\r\n# cmt\r\n1,2\r\n3,4\r\n', comment.char='#'), data.table(a=c(1L,3L), b=c(2L,4L))) + # header comment test(2341.14, fread('# hdr cmt x,y 1,2', header=TRUE, comment.char='#'), data.table(x=1L, y=2L)) + # nrow not counting comments test(2341.15, fread('a,b 1,2 # cmt 3,4 5,6', nrows=2, comment.char='#'), data.table(a=c(1L,3L), b=c(2L,4L))) + # sep and comment char same test(2341.16, fread('a#b 1#2 # only comment', sep="#", comment.char="#"), data.table(a=1L)) + # na.strings test(2341.17, fread('v #NA 1 # comment', na.strings="#NA", comment.char='#'), data.table(v=1L)) + test(2341.18, fread('a,b "p#q",2 # tail "r#s",3', comment.char='#'), data.table(a=c("p#q","r#s"), b=c(2L,3L))) + test(2341.19, fread(' # lead comment with padding \t# and tab a,b 1,2', comment.char='#'), data.table(a=1L, b=2L)) + test(2341.20, fread('a,b # header cmt with padding # second cmt 1,2 3,4', comment.char='#'), data.table(a=c(1L,3L), b=c(2L,4L))) + test(2341.21, fread('# meta1 # meta2 a,b 1,2', comment.char = '#'), data.table(a=1L, b=2L)) test(2341.22, fread('a,b # inline header comment\r\n1,2\r\n', comment.char = '#'), data.table(a=1L, b=2L)) + # control skipping white space before comments with strip.white test(2341.23, fread('a b # trailing cmnt ', comment.char = '#', strip.white = FALSE, sep = ","), data.table(a="b ")) + test(2341.24, fread('a # leading cmnt b From 941600602a4ec0a5b383e0e82d68112ffc3fcefc Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Sun, 19 Oct 2025 17:53:42 -0700 Subject: [PATCH 25/32] include some line-end comments in the multi-line comment test --- inst/tests/tests.Rraw | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 9fc9615763..62965a4a5e 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -21712,10 +21712,10 @@ test(2341.05, fread('a # multi line comments test(2341.06, fread('# multi line # comment -1,2 +1,2 # line-end comment # multi line # comment -3,4 +3,4 # line-end comment # trailing comment', comment.char='#'), data.table(V1=c(1L,3L), V2=c(2L,4L))) test(2341.07, fread('id;value From fe2b74b031493ce7e84b71a8adf55ca9a5fb6df5 Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger Date: Mon, 20 Oct 2025 15:45:05 +0200 Subject: [PATCH 26/32] match read.table for na.strings and comment.char --- inst/tests/tests.Rraw | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 62965a4a5e..fbfa58e106 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -21769,10 +21769,13 @@ test(2341.16, fread('a#b # only comment', sep="#", comment.char="#"), data.table(a=1L)) # na.strings -test(2341.17, fread('v +txt = 'v #NA 1 -# comment', na.strings="#NA", comment.char='#'), data.table(v=1L)) +# comment' +writeLines(txt, tmp <- tempfile()) +test(2341.17, fread(tmp, na.strings="#NA", comment.char='#'), data.table(v=1L)) +test(2341.175, fread(tmp, na.strings="#NA", comment.char='#'), setDT(read.table(tmp, na.strings="#NA", comment.char='#', header=TRUE))) test(2341.18, fread('a,b "p#q",2 # tail From 9288dd155fa4df6526b39eb7049ba38a640928e8 Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger Date: Mon, 20 Oct 2025 15:47:39 +0200 Subject: [PATCH 27/32] add strip.white=FALSE header testcase --- inst/tests/tests.Rraw | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index fbfa58e106..fc6209d102 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -21774,8 +21774,8 @@ txt = 'v 1 # comment' writeLines(txt, tmp <- tempfile()) -test(2341.17, fread(tmp, na.strings="#NA", comment.char='#'), data.table(v=1L)) -test(2341.175, fread(tmp, na.strings="#NA", comment.char='#'), setDT(read.table(tmp, na.strings="#NA", comment.char='#', header=TRUE))) +test(2341.170, fread(tmp, na.strings="#NA", comment.char='#'), data.table(v=1L)) +test(2341.171, fread(tmp, na.strings="#NA", comment.char='#'), setDT(read.table(tmp, na.strings="#NA", comment.char='#', header=TRUE))) test(2341.18, fread('a,b "p#q",2 # tail @@ -21796,9 +21796,12 @@ a,b test(2341.22, fread('a,b # inline header comment\r\n1,2\r\n', comment.char = '#'), data.table(a=1L, b=2L)) # control skipping white space before comments with strip.white -test(2341.23, fread('a +test(2341.230, fread('a b # trailing cmnt ', comment.char = '#', strip.white = FALSE, sep = ","), data.table(a="b ")) +test(2341.231, fread('a # trailing header cmnt +b +', comment.char = '#', strip.white = FALSE, sep = ","), data.table(`a `="b")) test(2341.24, fread('a # leading cmnt From 73590c444e46e4c268393c8bf92ea2447c9e9581 Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger Date: Mon, 20 Oct 2025 15:49:47 +0200 Subject: [PATCH 28/32] refactor end_of_field helper into more readable version --- src/fread.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/fread.c b/src/fread.c index 315dfe51e0..1d43173b8a 100644 --- a/src/fread.c +++ b/src/fread.c @@ -349,6 +349,10 @@ static inline bool end_of_field(const char *ch) // \0 (maybe more than one) before eof are part of field and do not end it; eol() returns false for \0 but the ch==eof will return true for the \0 at eof. // Comment characters terminate a field immediately and take precedence over separators. return *ch == sep || ((uint8_t)*ch <= 13 && (ch == eof || eol(&ch))) || (commentChar && *ch == commentChar); + if (*ch == sep) return true; + if ((uint8_t)*ch <= 13 && (ch == eof || eol(&ch))) return true; + if (!commentChar) return false; + return *ch == commentChar; } static inline const char *end_NA_string(const char *start) From f925ce4423a0ca3cc1a51a7a41d47344442a5655 Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger Date: Mon, 20 Oct 2025 16:16:34 +0200 Subject: [PATCH 29/32] add example for strip.white --- inst/tests/tests.Rraw | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index fc6209d102..5aa80fcf80 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -21802,6 +21802,11 @@ b # trailing cmnt test(2341.231, fread('a # trailing header cmnt b ', comment.char = '#', strip.white = FALSE, sep = ","), data.table(`a `="b")) +test(2341.232, fread('a +# full line cmnt + # full line with leading ws cmnt +b +', comment.char = '#', strip.white = FALSE, sep = ","), data.table(a=c(" ", "b"))) test(2341.24, fread('a # leading cmnt From c77332df9f63b6ff2d40e97f5a9128dc95005f0b Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Mon, 20 Oct 2025 09:36:30 -0700 Subject: [PATCH 30/32] summarize line-skipping behavior --- man/fread.Rd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/man/fread.Rd b/man/fread.Rd index 1560a2acff..40d2cfcc90 100644 --- a/man/fread.Rd +++ b/man/fread.Rd @@ -56,7 +56,7 @@ yaml=FALSE, tmpdir=tempdir(), tz="UTC" \item{strip.white}{ Logical, default \code{TRUE}, in which case leading and trailing whitespace is stripped from unquoted \code{"character"} fields. \code{"numeric"} fields are always stripped of leading and trailing whitespace.} \item{fill}{logical or integer (default is \code{FALSE}). If \code{TRUE} then in case the rows have unequal length, number of columns is estimated and blank fields are implicitly filled. If an integer is provided it is used as an upper bound for the number of columns. If \code{fill=Inf} then the whole file is read for detecting the number of columns. } \item{blank.lines.skip}{\code{logical}, default is \code{FALSE}. If \code{TRUE} blank lines in the input are ignored.} - \item{comment.char}{Character vector of length one containing a single character of an empty string. Any text after the comment character in a line is ignored. Use \code{""} to turn off the interpretation of comments altogether.} + \item{comment.char}{Character vector of length one containing a single character of an empty string. Any text after the comment character in a line is ignored, including skipping comment-only lines. Use \code{""} to turn off the interpretation of comments altogether.} \item{key}{Character vector of one or more column names which is passed to \code{\link{setkey}}. Only valid when argument \code{data.table=TRUE}. Where applicable, this should refer to column names given in \code{col.names}. } \item{index}{ Character vector or list of character vectors of one or more column names which is passed to \code{\link{setindexv}}. As with \code{key}, comma-separated notation like \code{index="x,y,z"} is accepted for convenience. Only valid when argument \code{data.table=TRUE}. Where applicable, this should refer to column names given in \code{col.names}. } \item{showProgress}{ \code{TRUE} displays progress on the console if the ETA is greater than 3 seconds. It is produced in fread's C code where the very nice (but R level) txtProgressBar and tkProgressBar are not easily available. } From d73651d2951945bda062b754eade5dff15dd7cd1 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Mon, 20 Oct 2025 09:46:58 -0700 Subject: [PATCH 31/32] clean up tmp --- inst/tests/tests.Rraw | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 5aa80fcf80..1990d62979 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -21769,13 +21769,16 @@ test(2341.16, fread('a#b # only comment', sep="#", comment.char="#"), data.table(a=1L)) # na.strings -txt = 'v -#NA -1 -# comment' -writeLines(txt, tmp <- tempfile()) -test(2341.170, fread(tmp, na.strings="#NA", comment.char='#'), data.table(v=1L)) -test(2341.171, fread(tmp, na.strings="#NA", comment.char='#'), setDT(read.table(tmp, na.strings="#NA", comment.char='#', header=TRUE))) +local({ + txt = 'v + #NA + 1 + # comment' + writeLines(txt, tmp <- tempfile()) + on.exit(unlink(tmp)) + test(2341.170, fread(tmp, na.strings="#NA", comment.char='#'), data.table(v=1L)) + test(2341.171, fread(tmp, na.strings="#NA", comment.char='#'), setDT(read.table(tmp, na.strings="#NA", comment.char='#', header=TRUE))) +}) test(2341.18, fread('a,b "p#q",2 # tail From ba0c68ad0423977f76d886960fa6b2bc0457ae2e Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Mon, 20 Oct 2025 09:47:30 -0700 Subject: [PATCH 32/32] don't introduce whitespace to string literal body --- inst/tests/tests.Rraw | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 1990d62979..2e1284aea6 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -21771,9 +21771,9 @@ test(2341.16, fread('a#b # na.strings local({ txt = 'v - #NA - 1 - # comment' +#NA +1 +# comment' writeLines(txt, tmp <- tempfile()) on.exit(unlink(tmp)) test(2341.170, fread(tmp, na.strings="#NA", comment.char='#'), data.table(v=1L))