diff --git a/R/fread.R b/R/fread.R index ff076b737a..5b9c33be99 100644 --- a/R/fread.R +++ b/R/fread.R @@ -1,6 +1,15 @@ - -fread <- function(input="",sep="auto",sep2="auto",nrows=-1L,header="auto",na.strings="NA",file,stringsAsFactors=FALSE,verbose=getOption("datatable.verbose"),autostart=1L,skip=0L,select=NULL,drop=NULL,colClasses=NULL,integer64=getOption("datatable.integer64"),dec=if (sep!=".") "." else ",", col.names, check.names=FALSE, encoding="unknown", quote="\"", strip.white=TRUE, fill=FALSE, blank.lines.skip=FALSE, key=NULL, showProgress=getOption("datatable.showProgress"),data.table=getOption("datatable.fread.datatable")) { - if (!is.character(dec) || length(dec)!=1L || nchar(dec)!=1) stop("dec must be a single character e.g. '.' or ','") +fread <- function(input = "", sep = "auto", sep2 = "auto", nrows = -1L, header = "auto", + na.strings = "NA", file = NULL, + stringsAsFactors = FALSE, verbose = getOption("datatable.verbose"), + autostart = 1L, skip = 0L, select = NULL, drop = NULL, colClasses = NULL, + integer64 = getOption("datatable.integer64"), + dec=if (sep!=".") "." else ",", col.names, + check.names = FALSE, encoding = "unknown", quote = "\"", + strip.white = !identical(sep,"\n"), + fill = FALSE, blank.lines.skip = FALSE, key = NULL, + showProgress = getOption("datatable.showProgress"), + data.table = getOption("datatable.fread.datatable")) { + if (!is.character(dec) || length(dec) != 1L || nchar(dec) != 1) stop("dec must be a single character e.g. '.' or ','") # handle encoding, #563 if (length(encoding) != 1L || !encoding %in% c("unknown", "UTF-8", "Latin-1")) { stop("Argument 'encoding' must be 'unknown', 'UTF-8' or 'Latin-1'.") @@ -52,7 +61,7 @@ fread <- function(input="",sep="auto",sep2="auto",nrows=-1L,header="auto",na.str if (verbose) cat("This R session's locale is now '",tt,"' which provides the desired decimal point for reading numerics in the file - success! 
The locale will be restored to what it was ('",oldlocale,") even if the function fails for other reasons.\n") } # map file as input - if (!missing(file)) { + if (!is.null(file)) { if (!identical(input, "")) stop("You can provide 'input' or 'file', not both.") if (!file.exists(file)) stop(sprintf("Provided file '%s' does not exists.", file)) input = file @@ -99,7 +108,10 @@ fread <- function(input="",sep="auto",sep2="auto",nrows=-1L,header="auto",na.str input = tt } if (identical(header,"auto")) header=NA - if (identical(sep,"auto")) sep=NULL + if (identical(sep, "auto")) sep = NA_character_ + # do not split lines - faster replacement for base::readLines() + # "\r\n" will be detected automatically at C level + if (identical(sep, "\n")) sep = NULL if (is.atomic(colClasses) && !is.null(names(colClasses))) colClasses = tapply(names(colClasses),colClasses,c,simplify=FALSE) ans = .Call(Creadfile,input,sep,as.integer(nrows),header,na.strings,verbose,as.integer(autostart),skip,select,drop,colClasses,integer64,dec,encoding,quote,strip.white,blank.lines.skip,fill,as.integer(showProgress)) nr = length(ans[[1]]) diff --git a/README.md b/README.md index 3441f3ff54..761ff01c1b 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,3 @@ ### Project overview is on the GitHub Wiki tab, our [HOMEPAGE](https://github.com/Rdatatable/data.table/wiki) - diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 9e985451cc..8bf29fb744 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -9175,6 +9175,12 @@ test(1710.4, all.equal(x,y,check.attributes=TRUE), # desired "Datasets have different column classes. 
First 3: a(numeric!=hello;world)") test(1710.5, isTRUE(all.equal(x,y,check.attributes=FALSE))) # desired +# readLines with fread +lines <- fread("a,b\n ab,cd,ce\n abcdef\n hjkli \n", sep = "\n", header = T)[[1]] +test(1711.1, lines, c(" ab,cd,ce", " abcdef", " hjkli ") ) +lines <- fread("a,b\r\n ab,cd,ce\r\n abcdef\r\n hjkli \r\n", sep = "\n", header = T)[[1]] +test(1711.2, lines, c(" ab,cd,ce", " abcdef", " hjkli ") ) + ########################## diff --git a/man/fread.Rd b/man/fread.Rd index 939aae3b33..2a29101626 100644 --- a/man/fread.Rd +++ b/man/fread.Rd @@ -9,20 +9,20 @@ `fread` is for \emph{regular} delimited files; i.e., where every row has the same number of columns. In future, secondary separator (\code{sep2}) may be specified \emph{within} each column. Such columns will be read as type \code{list} where each cell is itself a vector. } \usage{ -fread(input, sep="auto", sep2="auto", nrows=-1L, header="auto", na.strings="NA", file, +fread(input, sep="auto", sep2="auto", nrows=-1L, header="auto", na.strings="NA", file=NULL, stringsAsFactors=FALSE, verbose=getOption("datatable.verbose"), autostart=1L, skip=0L, select=NULL, drop=NULL, colClasses=NULL, integer64=getOption("datatable.integer64"), # default: "integer64" dec=if (sep!=".") "." else ",", col.names, check.names=FALSE, encoding="unknown", quote="\"", -strip.white=TRUE, fill=FALSE, blank.lines.skip=FALSE, key=NULL, +strip.white = !identical(sep,"\n"), fill=FALSE, blank.lines.skip=FALSE, key=NULL, showProgress=getOption("datatable.showProgress"), # default: TRUE data.table=getOption("datatable.fread.datatable") # default: TRUE ) } \arguments{ \item{input}{ Either the file name to read (containing no \\n character), a shell command that preprocesses the file (e.g. \code{fread("grep blah filename"))} or the input itself as a string (containing at least one \\n), see examples. In both cases, a length 1 character string. 
A filename input is passed through \code{\link[base]{path.expand}} for convenience and may be a URL starting http:// or file://. } - \item{sep}{ The separator between columns. Defaults to the first character in the set [\code{,\\t |;:}] that exists on line \code{autostart} outside quoted (\code{""}) regions, and separates the rows above \code{autostart} into a consistent number of fields, too. } + \item{sep}{ The separator between columns. Defaults to the first character in the set [\code{,\\t |;:}] that exists on line \code{autostart} outside quoted (\code{""}) regions, and separates the rows above \code{autostart} into a consistent number of fields, too. Use \code{"\n"} to read each line whole, without splitting it into fields - a faster alternative to \code{readLines()}.} \item{sep2}{ The separator \emph{within} columns. A \code{list} column will be returned where each cell is a vector of values. This is much faster using less working memory than \code{strsplit} afterwards or similar techniques. For each column \code{sep2} can be different and is the first character in the same set above [\code{,\\t |;:}], other than \code{sep}, that exists inside each field outside quoted regions on line \code{autostart}. NB: \code{sep2} is not yet implemented. } \item{nrows}{ The number of rows to read, by default -1 means all. Unlike \code{read.table}, it doesn't help speed to set this to the number of rows in the file (or an estimate), since the number of rows is automatically determined and is already fast. Only set \code{nrows} if you require the first 10 rows, for example. `nrows=0` is a special case that just returns the column names and types; e.g., a dry run for a large file or to quickly check format consistency of a set of files before starting to read any. } \item{header}{ Does the first data line contain column names? Defaults according to whether every non-empty field on the first data line is type character. If so, or TRUE is supplied, any empty column names are given a default name. 
} diff --git a/src/fread.c b/src/fread.c index f1b54309dd..1347017b16 100644 --- a/src/fread.c +++ b/src/fread.c @@ -590,8 +590,12 @@ SEXP readfile(SEXP input, SEXP separg, SEXP nrowsarg, SEXP headerarg, SEXP nastr if (isNumeric(skip)) { skip = PROTECT(coerceVector(skip, INTSXP)); protecti++; } if (!( (isInteger(skip) && LENGTH(skip)==1 && INTEGER(skip)[0]>=0) // NA_INTEGER is covered by >=0 ||(isString(skip) && LENGTH(skip)==1))) error("'skip' must be a length 1 vector of type numeric or integer >=0, or single character search string"); - if (!isNull(separg)) { - if (!isString(separg) || LENGTH(separg)!=1 || strlen(CHAR(STRING_ELT(separg,0)))!=1) error("'sep' must be 'auto' or a single character"); + + if (isNull(separg)) { + sep = eol; + } else + if (STRING_ELT(separg, 0) != NA_STRING) { + if (!isString(separg) || LENGTH(separg)!=1 || strlen(CHAR(STRING_ELT(separg,0))) != 1) error("'sep' must be 'auto' or a single character"); if (*CHAR(STRING_ELT(separg,0))==quote[0]) error("sep = '%c' = quote, is not an allowed separator.",quote[0]); if (*CHAR(STRING_ELT(separg,0)) == decChar) error("The two arguments to fread 'dec' and 'sep' are equal ('%c').", decChar); } @@ -798,13 +802,16 @@ SEXP readfile(SEXP input, SEXP separg, SEXP nrowsarg, SEXP headerarg, SEXP nastr // Auto detect separator, number of fields, and location of first row // ******************************************************************************************** const char *seps; - if (isNull(separg)) { + if (!isNull(separg)) { + if (STRING_ELT(separg, 0) == NA_STRING) { seps=",\t |;:"; // separators, in order of preference. See ?fread. (colon last as it can appear in time fields) if (verbose) Rprintf("Detecting sep ... "); - } else { + } else { seps = (const char *)CHAR(STRING_ELT(separg,0)); // length 1 string of 1 character, checked above if (verbose) Rprintf("Using supplied sep '%s' ... 
", seps[0]=='\t'?"\\t":seps); - } + } + } else + seps = &eol; int nseps = strlen(seps); int *maxcols = Calloc(nseps, int); // if (fill) grab longest col stretch as topNcol if (maxcols == NULL) error("Error while allocating memory to store max column size of each separator."); @@ -858,7 +865,7 @@ SEXP readfile(SEXP input, SEXP separg, SEXP nrowsarg, SEXP headerarg, SEXP nastr if (!fill) { ch=pos=topStart; line=topLine; } else { ch=pos; line=1; } if (verbose) { - if (isNull(separg)) { if (sep=='\t') Rprintf("'\\t'\n"); else Rprintf("'%c'\n", sep); } + if (STRING_ELT(separg, 0) == NA_STRING) { if (sep=='\t') Rprintf("'\\t'\n"); else Rprintf("'%c'\n", sep); } else Rprintf("found ok\n"); } }