Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 17 additions & 5 deletions R/fread.R
Original file line number Diff line number Diff line change
@@ -1,6 +1,15 @@

fread <- function(input="",sep="auto",sep2="auto",nrows=-1L,header="auto",na.strings="NA",file,stringsAsFactors=FALSE,verbose=getOption("datatable.verbose"),autostart=1L,skip=0L,select=NULL,drop=NULL,colClasses=NULL,integer64=getOption("datatable.integer64"),dec=if (sep!=".") "." else ",", col.names, check.names=FALSE, encoding="unknown", quote="\"", strip.white=TRUE, fill=FALSE, blank.lines.skip=FALSE, key=NULL, showProgress=getOption("datatable.showProgress"),data.table=getOption("datatable.fread.datatable")) {
if (!is.character(dec) || length(dec)!=1L || nchar(dec)!=1) stop("dec must be a single character e.g. '.' or ','")
fread <- function(input = "", sep = "auto", sep2 = "auto", nrows = -1L, header = "auto",
na.strings = "NA", file = NULL,
stringsAsFactors = FALSE, verbose = getOption("datatable.verbose"),
autostart = 1L, skip = 0L, select = NULL, drop = NULL, colClasses = NULL,
integer64 = getOption("datatable.integer64"),
dec=if (sep!=".") "." else ",", col.names,
check.names = FALSE, encoding = "unknown", quote = "\"",
strip.white = !identical(sep,"\n"),
fill = FALSE, blank.lines.skip = FALSE, key = NULL,
showProgress = getOption("datatable.showProgress"),
data.table = getOption("datatable.fread.datatable")) {
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

it is more readable now 👍

if (!is.character(dec) || length(dec) != 1L || nchar(dec) != 1) stop("dec must be a single character e.g. '.' or ','")
# handle encoding, #563
if (length(encoding) != 1L || !encoding %in% c("unknown", "UTF-8", "Latin-1")) {
stop("Argument 'encoding' must be 'unknown', 'UTF-8' or 'Latin-1'.")
Expand Down Expand Up @@ -52,7 +61,7 @@ fread <- function(input="",sep="auto",sep2="auto",nrows=-1L,header="auto",na.str
if (verbose) cat("This R session's locale is now '",tt,"' which provides the desired decimal point for reading numerics in the file - success! The locale will be restored to what it was ('",oldlocale,") even if the function fails for other reasons.\n")
}
# map file as input
if (!missing(file)) {
if (!is.null(file)) {
if (!identical(input, "")) stop("You can provide 'input' or 'file', not both.")
if (!file.exists(file)) stop(sprintf("Provided file '%s' does not exists.", file))
input = file
Expand Down Expand Up @@ -99,7 +108,10 @@ fread <- function(input="",sep="auto",sep2="auto",nrows=-1L,header="auto",na.str
input = tt
}
if (identical(header,"auto")) header=NA
if (identical(sep,"auto")) sep=NULL
if (identical(sep, "auto")) sep = NA_character_
# do not split lines - faster replacement for base::readLines()
# "\r\n" will be detected automatically at C level
if (identical(sep, "\n")) sep = NULL
if (is.atomic(colClasses) && !is.null(names(colClasses))) colClasses = tapply(names(colClasses),colClasses,c,simplify=FALSE)
ans = .Call(Creadfile,input,sep,as.integer(nrows),header,na.strings,verbose,as.integer(autostart),skip,select,drop,colClasses,integer64,dec,encoding,quote,strip.white,blank.lines.skip,fill,as.integer(showProgress))
nr = length(ans[[1]])
Expand Down
1 change: 0 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@

### Project overview is on the GitHub Wiki tab, our [HOMEPAGE](https://github.com/Rdatatable/data.table/wiki)


6 changes: 6 additions & 0 deletions inst/tests/tests.Rraw
Original file line number Diff line number Diff line change
Expand Up @@ -9175,6 +9175,12 @@ test(1710.4, all.equal(x,y,check.attributes=TRUE), # desired
"Datasets have different column classes. First 3: a(numeric!=hello;world)")
test(1710.5, isTRUE(all.equal(x,y,check.attributes=FALSE))) # desired

# readLines with fread: sep = "\n" reads every line as one unsplit field.
# The first line is taken as the header (header = TRUE), and the expected
# values below retain the leading/trailing spaces of each remaining line.
lines <- fread("a,b\n ab,cd,ce\n abcdef\n hjkli \n", sep = "\n", header = TRUE)[[1]]
test(1711.1, lines, c(" ab,cd,ce", " abcdef", " hjkli ") )
# Same input with "\r\n" line endings must produce the identical result.
lines <- fread("a,b\r\n ab,cd,ce\r\n abcdef\r\n hjkli \r\n", sep = "\n", header = TRUE)[[1]]
test(1711.2, lines, c(" ab,cd,ce", " abcdef", " hjkli ") )


##########################

Expand Down
6 changes: 3 additions & 3 deletions man/fread.Rd
Original file line number Diff line number Diff line change
Expand Up @@ -9,20 +9,20 @@
`fread` is for \emph{regular} delimited files; i.e., where every row has the same number of columns. In future, secondary separator (\code{sep2}) may be specified \emph{within} each column. Such columns will be read as type \code{list} where each cell is itself a vector.
}
\usage{
fread(input, sep="auto", sep2="auto", nrows=-1L, header="auto", na.strings="NA", file,
fread(input, sep="auto", sep2="auto", nrows=-1L, header="auto", na.strings="NA", file=NULL,
stringsAsFactors=FALSE, verbose=getOption("datatable.verbose"), autostart=1L,
skip=0L, select=NULL, drop=NULL, colClasses=NULL,
integer64=getOption("datatable.integer64"), # default: "integer64"
dec=if (sep!=".") "." else ",", col.names,
check.names=FALSE, encoding="unknown", quote="\"",
strip.white=TRUE, fill=FALSE, blank.lines.skip=FALSE, key=NULL,
strip.white = !identical(sep,"\n"), fill=FALSE, blank.lines.skip=FALSE, key=NULL,
showProgress=getOption("datatable.showProgress"), # default: TRUE
data.table=getOption("datatable.fread.datatable") # default: TRUE
)
}
\arguments{
\item{input}{ Either the file name to read (containing no \\n character), a shell command that preprocesses the file (e.g. \code{fread("grep blah filename")}) or the input itself as a string (containing at least one \\n), see examples. In both cases, a length 1 character string. A filename input is passed through \code{\link[base]{path.expand}} for convenience and may be a URL starting http:// or file://. }
\item{sep}{ The separator between columns. Defaults to the first character in the set [\code{,\\t |;:}] that exists on line \code{autostart} outside quoted (\code{""}) regions, and separates the rows above \code{autostart} into a consistent number of fields, too. }
\item{sep}{ The separator between columns. Defaults to the first character in the set [\code{,\\t |;:}] that exists on line \code{autostart} outside quoted (\code{""}) regions, and separates the rows above \code{autostart} into a consistent number of fields, too. Use \code{sep="\n"} to read each line as a single field without splitting it into columns; this is a faster alternative to \code{readLines()}.}
\item{sep2}{ The separator \emph{within} columns. A \code{list} column will be returned where each cell is a vector of values. This is much faster using less working memory than \code{strsplit} afterwards or similar techniques. For each column \code{sep2} can be different and is the first character in the same set above [\code{,\\t |;:}], other than \code{sep}, that exists inside each field outside quoted regions on line \code{autostart}. NB: \code{sep2} is not yet implemented. }
\item{nrows}{ The number of rows to read, by default -1 means all. Unlike \code{read.table}, it doesn't help speed to set this to the number of rows in the file (or an estimate), since the number of rows is automatically determined and is already fast. Only set \code{nrows} if you require the first 10 rows, for example. `nrows=0` is a special case that just returns the column names and types; e.g., a dry run for a large file or to quickly check format consistency of a set of files before starting to read any. }
\item{header}{ Does the first data line contain column names? Defaults according to whether every non-empty field on the first data line is type character. If so, or TRUE is supplied, any empty column names are given a default name. }
Expand Down
19 changes: 13 additions & 6 deletions src/fread.c
Original file line number Diff line number Diff line change
Expand Up @@ -590,8 +590,12 @@ SEXP readfile(SEXP input, SEXP separg, SEXP nrowsarg, SEXP headerarg, SEXP nastr
if (isNumeric(skip)) { skip = PROTECT(coerceVector(skip, INTSXP)); protecti++; }
if (!( (isInteger(skip) && LENGTH(skip)==1 && INTEGER(skip)[0]>=0) // NA_INTEGER is covered by >=0
||(isString(skip) && LENGTH(skip)==1))) error("'skip' must be a length 1 vector of type numeric or integer >=0, or single character search string");
if (!isNull(separg)) {
if (!isString(separg) || LENGTH(separg)!=1 || strlen(CHAR(STRING_ELT(separg,0)))!=1) error("'sep' must be 'auto' or a single character");

if (isNull(separg)) {
sep = eol;
} else
if (STRING_ELT(separg, 0) != NA_STRING) {
if (!isString(separg) || LENGTH(separg)!=1 || strlen(CHAR(STRING_ELT(separg,0))) != 1) error("'sep' must be 'auto' or a single character");
if (*CHAR(STRING_ELT(separg,0))==quote[0]) error("sep = '%c' = quote, is not an allowed separator.",quote[0]);
if (*CHAR(STRING_ELT(separg,0)) == decChar) error("The two arguments to fread 'dec' and 'sep' are equal ('%c').", decChar);
}
Expand Down Expand Up @@ -798,13 +802,16 @@ SEXP readfile(SEXP input, SEXP separg, SEXP nrowsarg, SEXP headerarg, SEXP nastr
// Auto detect separator, number of fields, and location of first row
// ********************************************************************************************
const char *seps;
if (isNull(separg)) {
if (!isNull(separg)) {
if (STRING_ELT(separg, 0) == NA_STRING) {
seps=",\t |;:"; // separators, in order of preference. See ?fread. (colon last as it can appear in time fields)
if (verbose) Rprintf("Detecting sep ... ");
} else {
} else {
seps = (const char *)CHAR(STRING_ELT(separg,0)); // length 1 string of 1 character, checked above
if (verbose) Rprintf("Using supplied sep '%s' ... ", seps[0]=='\t'?"\\t":seps);
}
}
} else
seps = &eol;
int nseps = strlen(seps);
int *maxcols = Calloc(nseps, int); // if (fill) grab longest col stretch as topNcol
if (maxcols == NULL) error("Error while allocating memory to store max column size of each separator.");
Expand Down Expand Up @@ -858,7 +865,7 @@ SEXP readfile(SEXP input, SEXP separg, SEXP nrowsarg, SEXP headerarg, SEXP nastr
if (!fill) { ch=pos=topStart; line=topLine; }
else { ch=pos; line=1; }
if (verbose) {
if (isNull(separg)) { if (sep=='\t') Rprintf("'\\t'\n"); else Rprintf("'%c'\n", sep); }
if (STRING_ELT(separg, 0) == NA_STRING) { if (sep=='\t') Rprintf("'\\t'\n"); else Rprintf("'%c'\n", sep); }
else Rprintf("found ok\n");
}
}
Expand Down