diff --git a/NEWS.md b/NEWS.md index f333ec0356..3a9ffbca5f 100644 --- a/NEWS.md +++ b/NEWS.md @@ -161,6 +161,8 @@ 29. A segfault occurred when `nrow/throttle < nthread`, [#5077](https://github.com/Rdatatable/data.table/issues/5077). With the default throttle of 1024 rows (see `?setDTthreads`), at least 64 threads would be needed to trigger the segfault since there needed to be more than 65,535 rows too. It occurred on a server with 256 logical cores where `data.table` uses 128 threads by default. Thanks to Bennet Becker for reporting, debugging at C level, and fixing. It also occurred when the throttle was increased so as to use fewer threads; e.g. at the limit `setDTthreads(throttle=nrow(DT))`. +30. `fread(file=URL)` now works rather than error `does not exist or is non-readable`, [#4952](https://github.com/Rdatatable/data.table/issues/4952). `fread(URL)` and `fread(input=URL)` worked before and continue to work. Thanks to @pnacht for reporting and @ben-schwen for the PR. + ## NOTES 1. New feature 29 in v1.12.4 (Oct 2019) introduced zero-copy coercion. Our thinking is that requiring you to get the type right in the case of `0` (type double) vs `0L` (type integer) is too inconvenient for you the user. So such coercions happen in `data.table` automatically without warning. Thanks to zero-copy coercion there is no speed penalty, even when calling `set()` many times in a loop, so there's no speed penalty to warn you about either. However, we believe that assigning a character value such as `"2"` into an integer column is more likely to be a user mistake that you would like to be warned about. The type difference (character vs integer) may be the only clue that you have selected the wrong column, or typed the wrong variable to be assigned to that column. For this reason we view character to numeric-like coercion differently and will warn about it. If it is correct, then the warning is intended to nudge you to wrap the RHS with `as.()` so that it is clear to readers of your code that a coercion from character to that type is intended. For example : diff --git a/R/fread.R b/R/fread.R index 81ffb2a0df..1bc0267b34 100644 --- a/R/fread.R +++ b/R/fread.R @@ -54,41 +54,15 @@ yaml=FALSE, autostart=NA, tmpdir=tempdir(), tz="UTC") } if (input=="" || length(grep('\\n|\\r', input))) { # input is data itself containing at least one \n or \r - } else { - if (startsWith(input, " ")) { - stopf("input= contains no \\n or \\r, but starts with a space. Please remove the leading space, or use text=, file= or cmd=") - } - str7 = substr(input, 1L, 7L) # avoid grepl() for #2531 - if (str7=="ftps://" || startsWith(input, "https://")) { - # nocov start - if (!requireNamespace("curl", quietly = TRUE)) - stopf("Input URL requires https:// connection for which fread() requires 'curl' package which cannot be found. Please install 'curl' using 'install.packages('curl')'.") # nocov - tmpFile = tempfile(fileext = paste0(".",tools::file_ext(input)), tmpdir=tmpdir) # retain .gz extension in temp filename so it knows to be decompressed further below - curl::curl_download(input, tmpFile, mode="wb", quiet = !showProgress) - file = tmpFile - on.exit(unlink(tmpFile), add=TRUE) - # nocov end - } - else if (startsWith(input, "ftp://") || str7== "http://" || str7=="file://") { - # nocov start - method = if (str7=="file://") "internal" else getOption("download.file.method", default="auto") - # force "auto" when file:// to ensure we don't use an invalid option (e.g. wget), #1668 - tmpFile = tempfile(fileext = paste0(".",tools::file_ext(input)), tmpdir=tmpdir) - download.file(input, tmpFile, method=method, mode="wb", quiet=!showProgress) - # In text mode on Windows-only, R doubles up \r to make \r\r\n line endings. mode="wb" avoids that. See ?connections:"CRLF" - file = tmpFile - on.exit(unlink(tmpFile), add=TRUE) - # nocov end - } - else if (length(grep(' ', input, fixed = TRUE)) && !file.exists(input)) { # file name or path containing spaces is not a command - cmd = input - if (input_has_vars && getOption("datatable.fread.input.cmd.message", TRUE)) { - messagef("Taking input= as a system command because it contains a space ('%s'). If it's a filename please remove the space, or use file= explicitly. A variable is being passed to input= and when this is taken as a system command there is a security concern if you are creating an app, the app could have a malicious user, and the app is not running in a secure environment; e.g. the app is running as root. Please read item 5 in the NEWS file for v1.11.6 for more information and for the option to suppress this message.", cmd) - } - } - else { - file = input # filename + } else if (startsWith(input, " ")) { + stopf("input= contains no \\n or \\r, but starts with a space. Please remove the leading space, or use text=, file= or cmd=") + } else if (length(grep(' ', input, fixed=TRUE)) && !file.exists(input)) { # file name or path containing spaces is not a command + cmd = input + if (input_has_vars && getOption("datatable.fread.input.cmd.message", TRUE)) { + messagef("Taking input= as a system command because it contains a space ('%s'). If it's a filename please remove the space, or use file= explicitly. A variable is being passed to input= and when this is taken as a system command there is a security concern if you are creating an app, the app could have a malicious user, and the app is not running in a secure environment; e.g. the app is running as root. Please read item 5 in the NEWS file for v1.11.6 for more information and for the option to suppress this message.", cmd) } + } else { + file = input # filename, including URLS } } if (!is.null(cmd)) { @@ -97,6 +71,26 @@ yaml=FALSE, autostart=NA, tmpdir=tempdir(), tz="UTC") on.exit(unlink(tmpFile), add=TRUE) } if (!is.null(file)) { + if (!is.character(file) || length(file)!=1L) + stopf("file= must be a single character string containing a filename, or URL starting 'http[s]://', 'ftp[s]://' or 'file://'") + if (w <- startsWithAny(file, c("https://", "ftps://", "http://", "ftp://", "file://"))) { # avoid grepl() for #2531 + # nocov start + tmpFile = tempfile(fileext = paste0(".",tools::file_ext(file)), tmpdir=tmpdir) # retain .gz extension in temp filename so it knows to be decompressed further below + if (w<=2L) { # https: or ftps: + if (!requireNamespace("curl", quietly = TRUE)) + stopf("URL requires https:// connection for which fread() requires 'curl' package which cannot be found. Please install 'curl' using 'install.packages('curl')'.") # nocov + + curl::curl_download(file, tmpFile, mode="wb", quiet = !showProgress) + } else { + method = if (w==5L) "internal" # force 'auto' when file: to ensure we don't use an invalid option (e.g. wget), #1668 + else getOption("download.file.method", default="auto") # http: or ftp: + download.file(file, tmpFile, method=method, mode="wb", quiet=!showProgress) + # In text mode on Windows-only, R doubles up \r to make \r\r\n line endings. mode="wb" avoids that. See ?connections:"CRLF" + } + file = tmpFile + on.exit(unlink(tmpFile), add=TRUE) + # nocov end + } file_info = file.info(file) if (is.na(file_info$size)) stopf("File '%s' does not exist or is non-readable. getwd()=='%s'", file, getwd()) if (isTRUE(file_info$isdir)) stopf("File '%s' is a directory. Not yet implemented.", file) # dir.exists() requires R v3.2+, #989 @@ -104,10 +98,10 @@ yaml=FALSE, autostart=NA, tmpdir=tempdir(), tz="UTC") warningf("File '%s' has size 0. Returning a NULL %s.", file, if (data.table) 'data.table' else 'data.frame') return(if (data.table) data.table(NULL) else data.frame(NULL)) } - if ((is_gz <- endsWith(file, ".gz")) || endsWith(file, ".bz2")) { + if (w <- endsWithAny(file, c(".gz",".bz2"))) { if (!requireNamespace("R.utils", quietly = TRUE)) stopf("To read gz and bz2 files directly, fread() requires 'R.utils' package which cannot be found. Please install 'R.utils' using 'install.packages('R.utils')'.") # nocov - FUN = if (is_gz) gzfile else bzfile + FUN = if (w==1L) gzfile else bzfile R.utils::decompressFile(file, decompFile<-tempfile(tmpdir=tmpdir), ext=NULL, FUN=FUN, remove=FALSE) # ext is not used by decompressFile when destname is supplied, but isn't optional file = decompFile # don't use 'tmpFile' symbol again, as tmpFile might be the http://domain.org/file.csv.gz download on.exit(unlink(decompFile), add=TRUE) diff --git a/R/utils.R b/R/utils.R index b4ae4c9ee6..575913d345 100644 --- a/R/utils.R +++ b/R/utils.R @@ -28,9 +28,13 @@ if (base::getRversion() < "3.2.0") { # Apr 2015 if (!exists('startsWith', 'package:base', inherits=FALSE)) { # R 3.3.0; Apr 2016 startsWith = function(x, stub) substr(x, 1L, nchar(stub))==stub } -if (!exists('endsWith', 'package:base', inherits=FALSE)) { - endsWith = function(x, stub) {n=nchar(x); substr(x, n-nchar(stub)+1L, n)==stub} -} +# endsWith no longer used from #5097 so no need to backport; prevent usage to avoid dev delay until GLCI's R 3.1.0 test +endsWith = function(...) stop("Internal error: use endsWithAny instead of base::endsWith") + +startsWithAny = function(x,y) .Call(CstartsWithAny, x, y, TRUE) +endsWithAny = function(x,y) .Call(CstartsWithAny, x, y, FALSE) +# For fread.R #5097 we need if any of the prefixes match, which one, and can return early on the first match +# Hence short and simple ascii-only at C level # which.first which.first = function(x) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 9a30d6b3d4..ea8abd7537 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -30,7 +30,8 @@ if (exists("test.data.table", .GlobalEnv, inherits=FALSE)) { compactprint = data.table:::compactprint cube.data.table = data.table:::cube.data.table dcast.data.table = data.table:::dcast.data.table - if (!exists('endsWith', 'package:base', inherits=FALSE)) endsWith = data.table:::endsWith + endsWith = data.table:::endsWith + endsWithAny = data.table:::endsWithAny forder = data.table:::forder forderv = data.table:::forderv format.data.table = data.table:::format.data.table @@ -5887,10 +5888,11 @@ f = paste0("file://",testDir("russellCRLF.csv")) # simulates a http:// request as far as file.download() and unlink() goes, without internet # download.file() in fread() changes the input data from \r\n to \n, on Windows. test(1378.2, fread(f, showProgress=FALSE)[19,`Value With Dividends`], 357.97) - +test(1378.25, fread(file = f, showProgress=FALSE)[19,`Value With Dividends`], 357.97) f = paste("file://",testDir("russellCRCRLF.csv"),sep="") # actually has 3 \r in the file, download.file() from file:// changes that to \r\r\n, so we can simulate download.file from http: in text mode. test(1378.3, fread(f, showProgress=FALSE)[19,`Value With Dividends`], 357.97) +test(1378.35, fread(file = f, showProgress=FALSE)[19,`Value With Dividends`], 357.97) #==================================== options(datatable.fread.datatable = FALSE) @@ -13094,12 +13096,15 @@ test(1921.2, as.IDate(1000), as.IDate("1972-09-27")) f = tempfile() file.create(f) test(1922.1, fread(f), data.table(NULL), warning = 'File.*size 0') -test(1922.2, fread(file = f), data.table(NULL), warning = 'File.*size 0') +test(1922.21, fread(file = f), data.table(NULL), warning = 'File.*size 0') +test(1922.22, fread(file = 2L), error="file= must be a single character string containing") # trigger download for last instance of warning test(1922.3, fread(paste0('file://', f)), data.table(NULL), warning = 'File.*size 0') +test(1922.35, fread(file = paste0('file://', f)), data.table(NULL), warning = 'File.*size 0') test(1922.4, fread(f, data.table = FALSE), data.frame(NULL), warning = 'File.*size 0') test(1922.5, fread(file = f, data.table = FALSE), data.frame(NULL), warning = 'File.*size 0') test(1922.6, fread(paste0('file://', f), data.table = FALSE), data.frame(NULL), warning = 'File.*size 0') +test(1922.65, fread(file = paste0('file://', f), data.table = FALSE), data.frame(NULL), warning = 'File.*size 0') unlink(f) #fwrite creates a file or does nothing, as appropriate, also #2898 @@ -17790,18 +17795,17 @@ if (test_bit64) { test(2193.2, X[Y, `:=`(y=i.y), on="x", by=.EACHI], data.table(x=1:3, y=as.integer64(10L,20L,NA))) } -# compatibility of endsWith backport with base::endsWith -if (exists('endsWith', 'package:base', inherits=FALSE)) { - DTendsWith = function(x, stub) {n=nchar(x); substr(x, n-nchar(stub)+1L, n)==stub} - BSendsWith = base::endsWith - test(2194.1, DTendsWith('abcd', 'd'), BSendsWith('abcd', 'd')) - test(2194.2, DTendsWith(letters, 'e'), BSendsWith(letters, 'e')) - test(2194.3, DTendsWith(NA_character_, 'a'), BSendsWith(NA_character_, 'a')) - test(2194.4, DTendsWith(character(), 'a'), BSendsWith(character(), 'a')) - # file used in encoding tests - txt = readLines(testDir("issue_563_fread.txt")) - test(2194.5, DTendsWith(txt, 'B'), BSendsWith(txt, 'B')) -} +# endsWithAny added in #5097 for internal use replacing one use of base::endsWith (in fread.R) +test(2194.1, endsWithAny('abcd', 'd'), 1L) +test(2194.2, endsWithAny('ab.bz2', c('.gz','.bz2')), 2L) +test(2194.3, endsWithAny('ab.bz', c('.gz','.bz2')), FALSE) +test(2194.4, endsWithAny(letters, 'e'), error="Internal error.*types or lengths incorrect") +test(2194.5, endsWithAny(NA_character_, 'a'), FALSE) +test(2194.6, endsWithAny(character(), 'a'), error="Internal error.*types or lengths incorrect") +# file used in encoding tests +txt = readLines(testDir("issue_563_fread.txt")) +test(2194.7, endsWithAny(txt, 'B'), error="Internal error.*types or lengths incorrect") # txt is length 5 +test(2194.8, endsWith('abcd', 'd'), error="Internal error.*use endsWithAny") # uniqueN(x, by=character()) was internal error, #4594 DT = data.table(idx=c(1L,2L,1L,3L), value="val") diff --git a/src/init.c b/src/init.c index 49765ea482..910d675194 100644 --- a/src/init.c +++ b/src/init.c @@ -127,6 +127,7 @@ SEXP islockedR(); SEXP allNAR(); SEXP test_dt_win_snprintf(); SEXP dt_zlib_version(); +SEXP startsWithAny(); // .Externals SEXP fastmean(); @@ -222,6 +223,7 @@ R_CallMethodDef callMethods[] = { {"Ctest_dt_win_snprintf", (DL_FUNC)&test_dt_win_snprintf, -1}, {"Cdt_zlib_version", (DL_FUNC)&dt_zlib_version, -1}, {"Csubstitute_call_arg_namesR", (DL_FUNC) &substitute_call_arg_namesR, -1}, +{"CstartsWithAny", (DL_FUNC)&startsWithAny, -1}, {NULL, NULL, 0} }; diff --git a/src/utils.c b/src/utils.c index a1d9093b8d..6320ccd221 100644 --- a/src/utils.c +++ b/src/utils.c @@ -389,3 +389,30 @@ SEXP dt_zlib_version() { #endif return ScalarString(mkChar(out)); } + +SEXP startsWithAny(const SEXP x, const SEXP y, SEXP start) { + // for is_url in fread.R added in #5097 + // startsWith was added to R in 3.3.0 so we need something to support R 3.1.0 + // short and simple ascii-only + if (!isString(x) || !isString(y) || length(x)!=1 || length(y)<1 || !isLogical(start) || length(start)!=1 || LOGICAL(start)[0]==NA_LOGICAL) + error("Internal error: data.table's internal startsWithAny types or lengths incorrect"); + const char *xd = CHAR(STRING_ELT(x, 0)); + const int n=length(y); + if (LOGICAL(start)[0]) { + for (int i=0; i=ylen && strncmp(xd+xlen-ylen, yd, ylen)==0) + return ScalarInteger(i+1); + } + } + return ScalarLogical(false); +} +