From 5eae076cac6756e3f05a4e3ca90f015fc07cc859 Mon Sep 17 00:00:00 2001 From: Dmitriy Selivanov Date: Mon, 23 May 2016 12:15:01 +0300 Subject: [PATCH] readLines with fread --- R/fread.R | 18 +++++++++++++++--- README.md | 1 - inst/tests/tests.Rraw | 6 ++++++ man/fread.Rd | 4 ++-- src/fread.c | 19 +++++++++++++------ 5 files changed, 36 insertions(+), 12 deletions(-) diff --git a/R/fread.R b/R/fread.R index 8e497c72a0..7df8b4a749 100644 --- a/R/fread.R +++ b/R/fread.R @@ -1,6 +1,15 @@ -fread <- function(input="",sep="auto",sep2="auto",nrows=-1L,header="auto",na.strings="NA",stringsAsFactors=FALSE,verbose=getOption("datatable.verbose"),autostart=1L,skip=0L,select=NULL,drop=NULL,colClasses=NULL,integer64=getOption("datatable.integer64"),dec=if (sep!=".") "." else ",", col.names, check.names=FALSE, encoding="unknown", quote="\"", strip.white=TRUE, fill=FALSE, blank.lines.skip=FALSE, key=NULL, showProgress=getOption("datatable.showProgress"),data.table=getOption("datatable.fread.datatable")) { - if (!is.character(dec) || length(dec)!=1L || nchar(dec)!=1) stop("dec must be a single character e.g. '.' or ','") +fread <- function(input = "", sep = "auto", sep2 = "auto", nrows = -1L, header = "auto", + na.strings = "NA", stringsAsFactors = FALSE, verbose = getOption("datatable.verbose"), + autostart = 1L, skip = 0L, select = NULL, drop = NULL, colClasses = NULL, + integer64 = getOption("datatable.integer64"), + dec=if (sep!=".") "." else ",", col.names, + check.names = FALSE, encoding = "unknown", quote = "\"", + strip.white = !identical(sep,"\n"), + fill = FALSE, blank.lines.skip = FALSE, key = NULL, + showProgress = getOption("datatable.showProgress"), + data.table = getOption("datatable.fread.datatable")) { + if (!is.character(dec) || length(dec) != 1L || nchar(dec) != 1) stop("dec must be a single character e.g. '.' 
or ','") # handle encoding, #563 if (length(encoding) != 1L || !encoding %in% c("unknown", "UTF-8", "Latin-1")) { stop("Argument 'encoding' must be 'unknown', 'UTF-8' or 'Latin-1'.") @@ -93,7 +102,10 @@ fread <- function(input="",sep="auto",sep2="auto",nrows=-1L,header="auto",na.str input = tt } if (identical(header,"auto")) header=NA - if (identical(sep,"auto")) sep=NULL + if (identical(sep, "auto")) sep = NA_character_ + # do not split lines - faster replacement for base::readLines() + # "\r\n" will be detected automatically at C level + if (identical(sep, "\n")) sep = NULL if (is.atomic(colClasses) && !is.null(names(colClasses))) colClasses = tapply(names(colClasses),colClasses,c,simplify=FALSE) ans = .Call(Creadfile,input,sep,as.integer(nrows),header,na.strings,verbose,as.integer(autostart),skip,select,drop,colClasses,integer64,dec,encoding,quote,strip.white,blank.lines.skip,fill,as.integer(showProgress)) nr = length(ans[[1]]) diff --git a/README.md b/README.md index 3441f3ff54..761ff01c1b 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,3 @@ ### Project overview is on the GitHub Wiki tab, our [HOMEPAGE](https://github.com/Rdatatable/data.table/wiki) - diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index d1c946f00f..cf1fecacab 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -9108,6 +9108,12 @@ test_values <- c(53L, 53L, 52L, 1L, 52L, 1L, 1L, test(1702, isoweek(test_cases), test_values) +# fread(sep = "\n") keeps each line whole: no column splitting and surrounding whitespace is preserved; "\r\n" line endings give the same result +lines <- fread("a,b\n ab,cd,ce\n abcdef\n hjkli \n", sep = "\n", header = TRUE)[[1]] +test(1703.1, lines, c(" ab,cd,ce", " abcdef", " hjkli ") ) +lines <- fread("a,b\r\n ab,cd,ce\r\n abcdef\r\n hjkli \r\n", sep = "\n", header = TRUE)[[1]] +test(1703.2, lines, c(" ab,cd,ce", " abcdef", " hjkli ") ) + ########################## # TODO: Tests involving GForce functions needs to be run with optimisation level 1 and 2, so that both functions are tested all the time. 
diff --git a/man/fread.Rd b/man/fread.Rd index 5ec64b5312..10f695958c 100644 --- a/man/fread.Rd +++ b/man/fread.Rd @@ -15,14 +15,14 @@ skip=0L, select=NULL, drop=NULL, colClasses=NULL, integer64=getOption("datatable.integer64"), # default: "integer64" dec=if (sep!=".") "." else ",", col.names, check.names=FALSE, encoding="unknown", quote="\"", -strip.white=TRUE, fill=FALSE, blank.lines.skip=FALSE, key=NULL, +strip.white = !identical(sep,"\n"), fill=FALSE, blank.lines.skip=FALSE, key=NULL, showProgress=getOption("datatable.showProgress"), # default: TRUE data.table=getOption("datatable.fread.datatable") # default: TRUE ) } \arguments{ \item{input}{ Either the file name to read (containing no \\n character), a shell command that preprocesses the file (e.g. \code{fread("grep blah filename"))} or the input itself as a string (containing at least one \\n), see examples. In both cases, a length 1 character string. A filename input is passed through \code{\link[base]{path.expand}} for convenience and may be a URL starting http:// or file://. } - \item{sep}{ The separator between columns. Defaults to the first character in the set [\code{,\\t |;:}] that exists on line \code{autostart} outside quoted (\code{""}) regions, and separates the rows above \code{autostart} into a consistent number of fields, too. } + \item{sep}{ The separator between columns. Defaults to the first character in the set [\code{,\\t |;:}] that exists on line \code{autostart} outside quoted (\code{""}) regions, and separates the rows above \code{autostart} into a consistent number of fields, too. Use \code{"\n"} to read lines without splitting - faster alternative to \code{readLines()}.} \item{sep2}{ The separator \emph{within} columns. A \code{list} column will be returned where each cell is a vector of values. This is much faster using less working memory than \code{strsplit} afterwards or similar techniques. 
For each column \code{sep2} can be different and is the first character in the same set above [\code{,\\t |;:}], other than \code{sep}, that exists inside each field outside quoted regions on line \code{autostart}. NB: \code{sep2} is not yet implemented. } \item{nrows}{ The number of rows to read, by default -1 means all. Unlike \code{read.table}, it doesn't help speed to set this to the number of rows in the file (or an estimate), since the number of rows is automatically determined and is already fast. Only set \code{nrows} if you require the first 10 rows, for example. `nrows=0` is a special case that just returns the column names and types; e.g., a dry run for a large file or to quickly check format consistency of a set of files before starting to read any. } \item{header}{ Does the first data line contain column names? Defaults according to whether every non-empty field on the first data line is type character. If so, or TRUE is supplied, any empty column names are given a default name. 
} diff --git a/src/fread.c b/src/fread.c index f08651b55d..f1833b63ee 100644 --- a/src/fread.c +++ b/src/fread.c @@ -590,8 +590,12 @@ SEXP readfile(SEXP input, SEXP separg, SEXP nrowsarg, SEXP headerarg, SEXP nastr if (isNumeric(skip)) { skip = PROTECT(coerceVector(skip, INTSXP)); protecti++; } if (!( (isInteger(skip) && LENGTH(skip)==1 && INTEGER(skip)[0]>=0) // NA_INTEGER is covered by >=0 ||(isString(skip) && LENGTH(skip)==1))) error("'skip' must be a length 1 vector of type numeric or integer >=0, or single character search string"); - if (!isNull(separg)) { - if (!isString(separg) || LENGTH(separg)!=1 || strlen(CHAR(STRING_ELT(separg,0)))!=1) error("'sep' must be 'auto' or a single character"); + + if (isNull(separg)) { + sep = eol; + } else + if (!isString(separg) || LENGTH(separg)!=1 || STRING_ELT(separg, 0) != NA_STRING) { /* check type and length before touching STRING_ELT; NA_STRING is how the R side passes sep="auto" */ + if (!isString(separg) || LENGTH(separg)!=1 || strlen(CHAR(STRING_ELT(separg,0))) != 1) error("'sep' must be 'auto' or a single character"); if (*CHAR(STRING_ELT(separg,0))==quote[0]) error("sep = '%c' = quote, is not an allowed separator.",quote[0]); if (*CHAR(STRING_ELT(separg,0)) == decChar) error("The two arguments to fread 'dec' and 'sep' are equal ('%c').", decChar); } @@ -798,13 +802,16 @@ SEXP readfile(SEXP input, SEXP separg, SEXP nrowsarg, SEXP headerarg, SEXP nastr // Auto detect separator, number of fields, and location of first row // ******************************************************************************************** const char *seps; - if (isNull(separg)) { + if (!isNull(separg)) { + if (STRING_ELT(separg, 0) == NA_STRING) { seps=",\t |;:"; // separators, in order of preference. See ?fread. (colon last as it can appear in time fields) if (verbose) Rprintf("Detecting sep ... "); - } else { + } else { seps = (const char *)CHAR(STRING_ELT(separg,0)); // length 1 string of 1 character, checked above if (verbose) Rprintf("Using supplied sep '%s' ... 
", seps[0]=='\t'?"\\t":seps); - } + } + } else { /* separg is NULL (sep="\n" on the R side): the only candidate separator is eol */ + static char eolsep[2]; eolsep[0] = eol; eolsep[1] = '\0'; seps = eolsep; } /* NUL-terminated copy: strlen(&eol) would read past a single char */ int nseps = strlen(seps); int *maxcols = Calloc(nseps, int); // if (fill) grab longest col stretch as topNcol if (maxcols == NULL) error("Error while allocating memory to store max column size of each separator."); @@ -858,7 +865,7 @@ SEXP readfile(SEXP input, SEXP separg, SEXP nrowsarg, SEXP headerarg, SEXP nastr if (!fill) { ch=pos=topStart; line=topLine; } else { ch=pos; line=1; } if (verbose) { - if (isNull(separg)) { if (sep=='\t') Rprintf("'\\t'\n"); else Rprintf("'%c'\n", sep); } + if (!isNull(separg) && STRING_ELT(separg, 0) == NA_STRING) { if (sep=='\t') Rprintf("'\\t'\n"); else Rprintf("'%c'\n", sep); } /* separg may be NULL here, so guard before STRING_ELT */ else Rprintf("found ok\n"); } }