Rdatatable · dselivanov · May 23, 2016 · Sep 25, 2016 · jangorecki · Sep 15, 2016
@@ -1,6 +1,15 @@
-
-fread <- function(input="",sep="auto",sep2="auto",nrows=-1L,header="auto",na.strings="NA",file,stringsAsFactors=FALSE,verbose=getOption("datatable.verbose"),autostart=1L,skip=0L,select=NULL,drop=NULL,colClasses=NULL,integer64=getOption("datatable.integer64"),dec=if (sep!=".") "." else ",", col.names, check.names=FALSE, encoding="unknown", quote="\"", strip.white=TRUE, fill=FALSE, blank.lines.skip=FALSE, key=NULL, showProgress=getOption("datatable.showProgress"),data.table=getOption("datatable.fread.datatable")) {
-    if (!is.character(dec) || length(dec)!=1L || nchar(dec)!=1) stop("dec must be a single character e.g. '.' or ','")
+fread <- function(input = "", sep = "auto", sep2 = "auto", nrows = -1L, header = "auto",
+                  na.strings = "NA", file = NULL, 
+                  stringsAsFactors = FALSE, verbose = getOption("datatable.verbose"),
+                  autostart = 1L, skip = 0L, select = NULL, drop = NULL, colClasses = NULL,
+                  integer64 = getOption("datatable.integer64"), 
+                  dec=if (sep!=".") "." else ",", col.names,
+                  check.names = FALSE, encoding = "unknown", quote = "\"", 
+                  strip.white = !identical(sep,"\n"), 
+                  fill = FALSE, blank.lines.skip = FALSE, key = NULL, 
+                  showProgress = getOption("datatable.showProgress"),
+                  data.table = getOption("datatable.fread.datatable")) {
+    if (!is.character(dec) || length(dec) != 1L || nchar(dec) != 1) stop("dec must be a single character e.g. '.' or ','")
     # handle encoding, #563
     if (length(encoding) != 1L || !encoding %in% c("unknown", "UTF-8", "Latin-1")) {
         stop("Argument 'encoding' must be 'unknown', 'UTF-8' or 'Latin-1'.")
@@ -52,7 +61,7 @@ fread <- function(input="",sep="auto",sep2="auto",nrows=-1L,header="auto",na.str
         if (verbose) cat("This R session's locale is now '",tt,"' which provides the desired decimal point for reading numerics in the file - success! The locale will be restored to what it was ('",oldlocale,") even if the function fails for other reasons.\n")
     }
     # map file as input
-    if (!missing(file)) {
+    if (!is.null(file)) {
         if (!identical(input, "")) stop("You can provide 'input' or 'file', not both.")
         if (!file.exists(file)) stop(sprintf("Provided file '%s' does not exists.", file))
         input = file
@@ -99,7 +108,10 @@ fread <- function(input="",sep="auto",sep2="auto",nrows=-1L,header="auto",na.str
         input = tt
     }
     if (identical(header,"auto")) header=NA
-    if (identical(sep,"auto")) sep=NULL
+    if (identical(sep, "auto")) sep = NA_character_
+    # do not split lines - faster replacement for base::readLines()
+    # "\r\n" will be detected automatically at C level
+    if (identical(sep, "\n")) sep = NULL
     if (is.atomic(colClasses) && !is.null(names(colClasses))) colClasses = tapply(names(colClasses),colClasses,c,simplify=FALSE)
     ans = .Call(Creadfile,input,sep,as.integer(nrows),header,na.strings,verbose,as.integer(autostart),skip,select,drop,colClasses,integer64,dec,encoding,quote,strip.white,blank.lines.skip,fill,as.integer(showProgress))
     nr = length(ans[[1]])

@@ -1,4 +1,3 @@
 
 ### Project overview is on the GitHub Wiki tab, our [HOMEPAGE](https://github.com/Rdatatable/data.table/wiki)
 
-
@@ -9175,6 +9175,12 @@ test(1710.4, all.equal(x,y,check.attributes=TRUE),            # desired
              "Datasets have different column classes. First 3: a(numeric!=hello;world)")
 test(1710.5, isTRUE(all.equal(x,y,check.attributes=FALSE)))   # desired
 
+# readLines with fread
+lines <- fread("a,b\n ab,cd,ce\n abcdef\n hjkli \n", sep = "\n", header = T)[[1]]
+test(1711.1, lines, c(" ab,cd,ce", " abcdef", " hjkli ") )
+lines <- fread("a,b\r\n ab,cd,ce\r\n abcdef\r\n hjkli \r\n", sep = "\n", header = T)[[1]]
+test(1711.2, lines, c(" ab,cd,ce", " abcdef", " hjkli ") )
+
 
 ##########################
 

@@ -9,20 +9,20 @@
    `fread` is for \emph{regular} delimited files; i.e., where every row has the same number of columns. In future, secondary separator (\code{sep2}) may be specified \emph{within} each column. Such columns will be read as type \code{list} where each cell is itself a vector.
 }
 \usage{
-fread(input, sep="auto", sep2="auto", nrows=-1L, header="auto", na.strings="NA", file,
+fread(input, sep="auto", sep2="auto", nrows=-1L, header="auto", na.strings="NA", file=NULL,
 stringsAsFactors=FALSE, verbose=getOption("datatable.verbose"), autostart=1L,
 skip=0L, select=NULL, drop=NULL, colClasses=NULL,
 integer64=getOption("datatable.integer64"),         # default: "integer64"
 dec=if (sep!=".") "." else ",", col.names, 
 check.names=FALSE, encoding="unknown", quote="\"", 
-strip.white=TRUE, fill=FALSE, blank.lines.skip=FALSE, key=NULL, 
+strip.white = !identical(sep,"\n"), fill=FALSE, blank.lines.skip=FALSE, key=NULL, 
 showProgress=getOption("datatable.showProgress"),   # default: TRUE
 data.table=getOption("datatable.fread.datatable")   # default: TRUE
 )
 }
 \arguments{
   \item{input}{ Either the file name to read (containing no \\n character), a shell command that preprocesses the file (e.g. \code{fread("grep blah filename"))} or the input itself as a string (containing at least one \\n), see examples. In both cases, a length 1 character string. A filename input is passed through \code{\link[base]{path.expand}} for convenience and may be a URL starting http:// or file://. }
-  \item{sep}{ The separator between columns. Defaults to the first character in the set [\code{,\\t |;:}] that exists on line \code{autostart} outside quoted (\code{""}) regions, and separates the rows above \code{autostart} into a consistent number of fields, too. }
+  \item{sep}{ The separator between columns. Defaults to the first character in the set [\code{,\\t |;:}] that exists on line \code{autostart} outside quoted (\code{""}) regions, and separates the rows above \code{autostart} into a consistent number of fields, too. Use \code{"\n"} to read each line whole, without splitting it into columns; this is a faster alternative to \code{readLines()}. }
   \item{sep2}{ The separator \emph{within} columns. A \code{list} column will be returned where each cell is a vector of values. This is much faster using less working memory than \code{strsplit} afterwards or similar techniques. For each column \code{sep2} can be different and is the first character in the same set above [\code{,\\t |;:}], other than \code{sep}, that exists inside each field outside quoted regions on line \code{autostart}. NB: \code{sep2} is not yet implemented. }
   \item{nrows}{ The number of rows to read, by default -1 means all. Unlike \code{read.table}, it doesn't help speed to set this to the number of rows in the file (or an estimate), since the number of rows is automatically determined and is already fast. Only set \code{nrows} if you require the first 10 rows, for example. `nrows=0` is a special case that just returns the column names and types; e.g., a dry run for a large file or to quickly check format consistency of a set of files before starting to read any. }
   \item{header}{ Does the first data line contain column names? Defaults according to whether every non-empty field on the first data line is type character. If so, or TRUE is supplied, any empty column names are given a default name. }

@@ -590,8 +590,12 @@ SEXP readfile(SEXP input, SEXP separg, SEXP nrowsarg, SEXP headerarg, SEXP nastr
     if (isNumeric(skip)) { skip = PROTECT(coerceVector(skip, INTSXP)); protecti++; }
     if (!( (isInteger(skip) && LENGTH(skip)==1 && INTEGER(skip)[0]>=0)  // NA_INTEGER is covered by >=0
          ||(isString(skip) && LENGTH(skip)==1))) error("'skip' must be a length 1 vector of type numeric or integer >=0, or single character search string");
-    if (!isNull(separg)) {
-        if (!isString(separg) || LENGTH(separg)!=1 || strlen(CHAR(STRING_ELT(separg,0)))!=1) error("'sep' must be 'auto' or a single character");
+
+    if (isNull(separg)) { 
+      sep = eol;
+    } else
+    if (STRING_ELT(separg, 0) != NA_STRING) {
+        if (!isString(separg) || LENGTH(separg)!=1 || strlen(CHAR(STRING_ELT(separg,0))) != 1) error("'sep' must be 'auto' or a single character");
         if (*CHAR(STRING_ELT(separg,0))==quote[0]) error("sep = '%c' = quote, is not an allowed separator.",quote[0]);
         if (*CHAR(STRING_ELT(separg,0)) == decChar) error("The two arguments to fread 'dec' and 'sep' are equal ('%c').", decChar);
     }
@@ -798,13 +802,16 @@ SEXP readfile(SEXP input, SEXP separg, SEXP nrowsarg, SEXP headerarg, SEXP nastr
     //   Auto detect separator, number of fields, and location of first row
     // ********************************************************************************************
     const char *seps;
-    if (isNull(separg)) {
+    if (!isNull(separg)) {
+      if (STRING_ELT(separg, 0) == NA_STRING) {
         seps=",\t |;:";  // separators, in order of preference. See ?fread. (colon last as it can appear in time fields)
         if (verbose) Rprintf("Detecting sep ... ");
-    } else {
+      } else {
         seps = (const char *)CHAR(STRING_ELT(separg,0));  // length 1 string of 1 character, checked above
         if (verbose) Rprintf("Using supplied sep '%s' ... ", seps[0]=='\t'?"\\t":seps);
-    }
+      }
+    } else
+      seps = &eol;
     int nseps = strlen(seps);
     int *maxcols = Calloc(nseps, int); // if (fill) grab longest col stretch as topNcol
     if (maxcols == NULL) error("Error while allocating memory to store max column size of each separator.");
@@ -858,7 +865,7 @@ SEXP readfile(SEXP input, SEXP separg, SEXP nrowsarg, SEXP headerarg, SEXP nastr
         if (!fill) { ch=pos=topStart; line=topLine; }
         else { ch=pos; line=1; }
         if (verbose) {
-            if (isNull(separg)) { if (sep=='\t') Rprintf("'\\t'\n"); else Rprintf("'%c'\n", sep); }
+            if (STRING_ELT(separg, 0) == NA_STRING) { if (sep=='\t') Rprintf("'\\t'\n"); else Rprintf("'%c'\n", sep); }
             else Rprintf("found ok\n");
         } 
     }
Original file line number	Diff line number	Diff line change
		@@ -1,4 +1,3 @@

		### Project overview is on the GitHub Wiki tab, our [HOMEPAGE](https://github.com/Rdatatable/data.table/wiki)