From 73255906c7d35df4aaf7bf9826161054dacaad2a Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger Date: Fri, 6 Aug 2021 14:07:15 +0200 Subject: [PATCH 1/7] added fread file=url --- R/fread.R | 52 ++++++++++++++++++++++++++----------------- inst/tests/tests.Rraw | 5 ++++- 2 files changed, 35 insertions(+), 22 deletions(-) diff --git a/R/fread.R b/R/fread.R index 81ffb2a0df..896513c76f 100644 --- a/R/fread.R +++ b/R/fread.R @@ -36,6 +36,33 @@ yaml=FALSE, autostart=NA, tmpdir=tempdir(), tz="UTC") ) nThread=as.integer(nThread) stopifnot(nThread>=1L) + + is_url = function(x) any(startsWith(x, c("http://", "https://", "ftp://", "ftps://", "file://"))) + + download_file = function(input) { + str7 = substr(input, 1L, 7L) # avoid grepl() for #2531 + # nocov start + if (str7=="ftps://" || startsWith(input, "https://")) { + if (!requireNamespace("curl", quietly = TRUE)) + stopf("Input URL requires https:// connection for which fread() requires 'curl' package which cannot be found. Please install 'curl' using 'install.packages('curl')'.") # nocov + tmpFile = tempfile(fileext = paste0(".",tools::file_ext(input)), tmpdir=tmpdir) # retain .gz extension in temp filename so it knows to be decompressed further below + curl::curl_download(input, tmpFile, mode="wb", quiet = !showProgress) + } + else { #if (startsWith(input, "ftp://") || str7== "http://" || str7=="file://") { + method = if (str7=="file://") "internal" else getOption("download.file.method", default="auto") + # force "auto" when file:// to ensure we don't use an invalid option (e.g. wget), #1668 + tmpFile = tempfile(fileext = paste0(".",tools::file_ext(input)), tmpdir=tmpdir) + download.file(input, tmpFile, method=method, mode="wb", quiet=!showProgress) + # In text mode on Windows-only, R doubles up \r to make \r\r\n line endings. mode="wb" avoids that. See ?connections:"CRLF" + } + assign("file", tmpFile, envir=parent.env(environment())) + do.call( + on.exit, list(substitute(unlink(tmpFile)), add = TRUE), + envir = parent.frame() + ) + # nocov end + } + if (!is.null(text)) { if (!is.character(text)) stopf("'text=' is type %s but must be character.", typeof(text)) if (!length(text)) return(data.table()) @@ -58,27 +85,8 @@ yaml=FALSE, autostart=NA, tmpdir=tempdir(), tz="UTC") if (startsWith(input, " ")) { stopf("input= contains no \\n or \\r, but starts with a space. Please remove the leading space, or use text=, file= or cmd=") } - str7 = substr(input, 1L, 7L) # avoid grepl() for #2531 - if (str7=="ftps://" || startsWith(input, "https://")) { - # nocov start - if (!requireNamespace("curl", quietly = TRUE)) - stopf("Input URL requires https:// connection for which fread() requires 'curl' package which cannot be found. Please install 'curl' using 'install.packages('curl')'.") # nocov - tmpFile = tempfile(fileext = paste0(".",tools::file_ext(input)), tmpdir=tmpdir) # retain .gz extension in temp filename so it knows to be decompressed further below - curl::curl_download(input, tmpFile, mode="wb", quiet = !showProgress) - file = tmpFile - on.exit(unlink(tmpFile), add=TRUE) - # nocov end - } - else if (startsWith(input, "ftp://") || str7== "http://" || str7=="file://") { - # nocov start - method = if (str7=="file://") "internal" else getOption("download.file.method", default="auto") - # force "auto" when file:// to ensure we don't use an invalid option (e.g. wget), #1668 - tmpFile = tempfile(fileext = paste0(".",tools::file_ext(input)), tmpdir=tmpdir) - download.file(input, tmpFile, method=method, mode="wb", quiet=!showProgress) - # In text mode on Windows-only, R doubles up \r to make \r\r\n line endings. mode="wb" avoids that. See ?connections:"CRLF" - file = tmpFile - on.exit(unlink(tmpFile), add=TRUE) - # nocov end + if (is_url(input)) { + download_file(input) } else if (length(grep(' ', input, fixed = TRUE)) && !file.exists(input)) { # file name or path containing spaces is not a command cmd = input @@ -97,6 +105,8 @@ yaml=FALSE, autostart=NA, tmpdir=tempdir(), tz="UTC") on.exit(unlink(tmpFile), add=TRUE) } if (!is.null(file)) { + if (is.character(file) && is_url(file)) download_file(file) # use URL as input for argument file #4952 + file_info = file.info(file) if (is.na(file_info$size)) stopf("File '%s' does not exist or is non-readable. getwd()=='%s'", file, getwd()) if (isTRUE(file_info$isdir)) stopf("File '%s' is a directory. Not yet implemented.", file) # dir.exists() requires R v3.2+, #989 diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 9a30d6b3d4..77bd0245eb 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -5887,10 +5887,11 @@ f = paste0("file://",testDir("russellCRLF.csv")) # simulates a http:// request as far as file.download() and unlink() goes, without internet # download.file() in fread() changes the input data from \r\n to \n, on Windows. test(1378.2, fread(f, showProgress=FALSE)[19,`Value With Dividends`], 357.97) - +test(1378.25, fread(file = f, showProgress=FALSE)[19,`Value With Dividends`], 357.97) f = paste("file://",testDir("russellCRCRLF.csv"),sep="") # actually has 3 \r in the file, download.file() from file:// changes that to \r\r\n, so we can simulate download.file from http: in text mode. test(1378.3, fread(f, showProgress=FALSE)[19,`Value With Dividends`], 357.97) +test(1378.35, fread(file = f, showProgress=FALSE)[19,`Value With Dividends`], 357.97) #==================================== options(datatable.fread.datatable = FALSE) @@ -13097,9 +13098,11 @@ test(1922.1, fread(f), data.table(NULL), warning = 'File.*size 0') test(1922.2, fread(file = f), data.table(NULL), warning = 'File.*size 0') # trigger download for last instance of warning test(1922.3, fread(paste0('file://', f)), data.table(NULL), warning = 'File.*size 0') +test(1922.35, fread(file = paste0('file://', f)), data.table(NULL), warning = 'File.*size 0') test(1922.4, fread(f, data.table = FALSE), data.frame(NULL), warning = 'File.*size 0') test(1922.5, fread(file = f, data.table = FALSE), data.frame(NULL), warning = 'File.*size 0') test(1922.6, fread(paste0('file://', f), data.table = FALSE), data.frame(NULL), warning = 'File.*size 0') +test(1922.65, fread(file = paste0('file://', f), data.table = FALSE), data.frame(NULL), warning = 'File.*size 0') unlink(f) #fwrite creates a file or does nothing, as appropriate, also #2898 From 72f8048335564544ca07a79dbbc6aae0503a6dd8 Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger Date: Fri, 6 Aug 2021 14:21:50 +0200 Subject: [PATCH 2/7] added NEWS --- NEWS.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/NEWS.md b/NEWS.md index f333ec0356..824816ba45 100644 --- a/NEWS.md +++ b/NEWS.md @@ -161,6 +161,8 @@ 29. A segfault occurred when `nrow/throttle < nthread`, [#5077](https://github.com/Rdatatable/data.table/issues/5077). With the default throttle of 1024 rows (see `?setDTthreads`), at least 64 threads would be needed to trigger the segfault since there needed to be more than 65,535 rows too. It occurred on a server with 256 logical cores where `data.table` uses 128 threads by default. Thanks to Bennet Becker for reporting, debugging at C level, and fixing. It also occurred when the throttle was increased so as to use fewer threads; e.g. at the limit `setDTthreads(throttle=nrow(DT))`. +30. `fread(file = URL)` now works as intended. Previously reading files from URLs did only work for `fread(input = URL)` and `fread(file = URL)` resulted in an error [#4952](https://github.com/Rdatatable/data.table/issues/4952). Thanks to @pnacht for reporting and @ben-schwen for the PR. + ## NOTES 1. New feature 29 in v1.12.4 (Oct 2019) introduced zero-copy coercion. Our thinking is that requiring you to get the type right in the case of `0` (type double) vs `0L` (type integer) is too inconvenient for you the user. So such coercions happen in `data.table` automatically without warning. Thanks to zero-copy coercion there is no speed penalty, even when calling `set()` many times in a loop, so there's no speed penalty to warn you about either. However, we believe that assigning a character value such as `"2"` into an integer column is more likely to be a user mistake that you would like to be warned about. The type difference (character vs integer) may be the only clue that you have selected the wrong column, or typed the wrong variable to be assigned to that column. For this reason we view character to numeric-like coercion differently and will warn about it. If it is correct, then the warning is intended to nudge you to wrap the RHS with `as.()` so that it is clear to readers of your code that a coercion from character to that type is intended. For example : From b4e64c73223eb3a7fbee65dcfec82d32a742bf76 Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Fri, 6 Aug 2021 23:36:31 -0600 Subject: [PATCH 3/7] new item tweak included error so that if folk getting the error search for the error, they might find this news item --- NEWS.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/NEWS.md b/NEWS.md index 824816ba45..3a9ffbca5f 100644 --- a/NEWS.md +++ b/NEWS.md @@ -161,7 +161,7 @@ 29. A segfault occurred when `nrow/throttle < nthread`, [#5077](https://github.com/Rdatatable/data.table/issues/5077). With the default throttle of 1024 rows (see `?setDTthreads`), at least 64 threads would be needed to trigger the segfault since there needed to be more than 65,535 rows too. It occurred on a server with 256 logical cores where `data.table` uses 128 threads by default. Thanks to Bennet Becker for reporting, debugging at C level, and fixing. It also occurred when the throttle was increased so as to use fewer threads; e.g. at the limit `setDTthreads(throttle=nrow(DT))`. -30. `fread(file = URL)` now works as intended. Previously reading files from URLs did only work for `fread(input = URL)` and `fread(file = URL)` resulted in an error [#4952](https://github.com/Rdatatable/data.table/issues/4952). Thanks to @pnacht for reporting and @ben-schwen for the PR. +30. `fread(file=URL)` now works rather than error `does not exist or is non-readable`, [#4952](https://github.com/Rdatatable/data.table/issues/4952). `fread(URL)` and `fread(input=URL)` worked before and continue to work. Thanks to @pnacht for reporting and @ben-schwen for the PR. ## NOTES From 962e4a5579e36426923a899e9e144328daef4b80 Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger Date: Sat, 7 Aug 2021 19:28:43 +0200 Subject: [PATCH 4/7] changed assign to return --- R/fread.R | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/R/fread.R b/R/fread.R index 896513c76f..332cd69c07 100644 --- a/R/fread.R +++ b/R/fread.R @@ -55,11 +55,11 @@ yaml=FALSE, autostart=NA, tmpdir=tempdir(), tz="UTC") download.file(input, tmpFile, method=method, mode="wb", quiet=!showProgress) # In text mode on Windows-only, R doubles up \r to make \r\r\n line endings. mode="wb" avoids that. See ?connections:"CRLF" } - assign("file", tmpFile, envir=parent.env(environment())) do.call( on.exit, list(substitute(unlink(tmpFile)), add = TRUE), envir = parent.frame() ) + return(tmpFile) # nocov end } @@ -86,7 +86,7 @@ yaml=FALSE, autostart=NA, tmpdir=tempdir(), tz="UTC") stopf("input= contains no \\n or \\r, but starts with a space. Please remove the leading space, or use text=, file= or cmd=") } if (is_url(input)) { - download_file(input) + file = download_file(input) } else if (length(grep(' ', input, fixed = TRUE)) && !file.exists(input)) { # file name or path containing spaces is not a command cmd = input @@ -105,7 +105,7 @@ yaml=FALSE, autostart=NA, tmpdir=tempdir(), tz="UTC") on.exit(unlink(tmpFile), add=TRUE) } if (!is.null(file)) { - if (is.character(file) && is_url(file)) download_file(file) # use URL as input for argument file #4952 + if (is.character(file) && is_url(file)) file = download_file(file) # use URL as input for argument file #4952 file_info = file.info(file) if (is.na(file_info$size)) stopf("File '%s' does not exist or is non-readable. getwd()=='%s'", file, getwd()) From dff4f808ae54630b3f4c36415338f654a935a3c9 Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger Date: Sat, 7 Aug 2021 20:47:40 +0200 Subject: [PATCH 5/7] changed use of startsWith for backport --- R/fread.R | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/R/fread.R b/R/fread.R index 332cd69c07..e53c62f45d 100644 --- a/R/fread.R +++ b/R/fread.R @@ -37,7 +37,9 @@ yaml=FALSE, autostart=NA, tmpdir=tempdir(), tz="UTC") nThread=as.integer(nThread) stopifnot(nThread>=1L) - is_url = function(x) any(startsWith(x, c("http://", "https://", "ftp://", "ftps://", "file://"))) + is_url = function(x) startsWith(x, "http://") || startsWith(x, "https://") || + startsWith(x, "ftp://") || startsWith(x, "ftps://") || + startsWith(x, "file://") download_file = function(input) { str7 = substr(input, 1L, 7L) # avoid grepl() for #2531 From 941c2ddd6bb78855130ab7858f9299be9cd7b853 Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Mon, 9 Aug 2021 14:24:55 -0600 Subject: [PATCH 6/7] added startsWithAny and endsWithAny --- R/fread.R | 16 +++++++--------- R/utils.R | 10 +++++++--- inst/tests/tests.Rraw | 26 +++++++++++++------------- src/init.c | 2 ++ src/utils.c | 27 +++++++++++++++++++++++++++ 5 files changed, 56 insertions(+), 25 deletions(-) diff --git a/R/fread.R b/R/fread.R index e53c62f45d..f4b13b01a1 100644 --- a/R/fread.R +++ b/R/fread.R @@ -37,21 +37,19 @@ yaml=FALSE, autostart=NA, tmpdir=tempdir(), tz="UTC") nThread=as.integer(nThread) stopifnot(nThread>=1L) - is_url = function(x) startsWith(x, "http://") || startsWith(x, "https://") || - startsWith(x, "ftp://") || startsWith(x, "ftps://") || - startsWith(x, "file://") + is_url = function(x) startsWithAny(x, c("https://", "http://", "file://", "ftps://", "ftp://")) download_file = function(input) { - str7 = substr(input, 1L, 7L) # avoid grepl() for #2531 # nocov start - if (str7=="ftps://" || startsWith(input, "https://")) { + if (startsWithAny(input, c("https://", "ftps://"))) { # avoid grepl() for #2531 if (!requireNamespace("curl", quietly = TRUE)) stopf("Input URL requires https:// connection for which fread() requires 'curl' package which cannot be found. Please install 'curl' using 'install.packages('curl')'.") # nocov tmpFile = tempfile(fileext = paste0(".",tools::file_ext(input)), tmpdir=tmpdir) # retain .gz extension in temp filename so it knows to be decompressed further below curl::curl_download(input, tmpFile, mode="wb", quiet = !showProgress) } - else { #if (startsWith(input, "ftp://") || str7== "http://" || str7=="file://") { - method = if (str7=="file://") "internal" else getOption("download.file.method", default="auto") + else { + method = if (startsWith(input, "file://")) "internal" + else getOption("download.file.method", default="auto") # ftp:// or http:// # force "auto" when file:// to ensure we don't use an invalid option (e.g. wget), #1668 tmpFile = tempfile(fileext = paste0(".",tools::file_ext(input)), tmpdir=tmpdir) download.file(input, tmpFile, method=method, mode="wb", quiet=!showProgress) @@ -116,10 +114,10 @@ yaml=FALSE, autostart=NA, tmpdir=tempdir(), tz="UTC") warningf("File '%s' has size 0. Returning a NULL %s.", file, if (data.table) 'data.table' else 'data.frame') return(if (data.table) data.table(NULL) else data.frame(NULL)) } - if ((is_gz <- endsWith(file, ".gz")) || endsWith(file, ".bz2")) { + if (w <- endsWithAny(file, c(".gz",".bz2"))) { if (!requireNamespace("R.utils", quietly = TRUE)) stopf("To read gz and bz2 files directly, fread() requires 'R.utils' package which cannot be found. Please install 'R.utils' using 'install.packages('R.utils')'.") # nocov - FUN = if (is_gz) gzfile else bzfile + FUN = if (w==1L) gzfile else bzfile R.utils::decompressFile(file, decompFile<-tempfile(tmpdir=tmpdir), ext=NULL, FUN=FUN, remove=FALSE) # ext is not used by decompressFile when destname is supplied, but isn't optional file = decompFile # don't use 'tmpFile' symbol again, as tmpFile might be the http://domain.org/file.csv.gz download on.exit(unlink(decompFile), add=TRUE) diff --git a/R/utils.R b/R/utils.R index b4ae4c9ee6..575913d345 100644 --- a/R/utils.R +++ b/R/utils.R @@ -28,9 +28,13 @@ if (base::getRversion() < "3.2.0") { # Apr 2015 if (!exists('startsWith', 'package:base', inherits=FALSE)) { # R 3.3.0; Apr 2016 startsWith = function(x, stub) substr(x, 1L, nchar(stub))==stub } -if (!exists('endsWith', 'package:base', inherits=FALSE)) { - endsWith = function(x, stub) {n=nchar(x); substr(x, n-nchar(stub)+1L, n)==stub} -} +# endsWith no longer used from #5097 so no need to backport; prevent usage to avoid dev delay until GLCI's R 3.1.0 test +endsWith = function(...) stop("Internal error: use endsWithAny instead of base::endsWith") + +startsWithAny = function(x,y) .Call(CstartsWithAny, x, y, TRUE) +endsWithAny = function(x,y) .Call(CstartsWithAny, x, y, FALSE) +# For fread.R #5097 we need if any of the prefixes match, which one, and can return early on the first match +# Hence short and simple ascii-only at C level # which.first which.first = function(x) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 77bd0245eb..4bbf7dcba8 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -30,7 +30,8 @@ if (exists("test.data.table", .GlobalEnv, inherits=FALSE)) { compactprint = data.table:::compactprint cube.data.table = data.table:::cube.data.table dcast.data.table = data.table:::dcast.data.table - if (!exists('endsWith', 'package:base', inherits=FALSE)) endsWith = data.table:::endsWith + endsWith = data.table:::endsWith + endsWithAny = data.table:::endsWithAny forder = data.table:::forder forderv = data.table:::forderv format.data.table = data.table:::format.data.table @@ -17793,18 +17794,17 @@ if (test_bit64) { test(2193.2, X[Y, `:=`(y=i.y), on="x", by=.EACHI], data.table(x=1:3, y=as.integer64(10L,20L,NA))) } -# compatibility of endsWith backport with base::endsWith -if (exists('endsWith', 'package:base', inherits=FALSE)) { - DTendsWith = function(x, stub) {n=nchar(x); substr(x, n-nchar(stub)+1L, n)==stub} - BSendsWith = base::endsWith - test(2194.1, DTendsWith('abcd', 'd'), BSendsWith('abcd', 'd')) - test(2194.2, DTendsWith(letters, 'e'), BSendsWith(letters, 'e')) - test(2194.3, DTendsWith(NA_character_, 'a'), BSendsWith(NA_character_, 'a')) - test(2194.4, DTendsWith(character(), 'a'), BSendsWith(character(), 'a')) - # file used in encoding tests - txt = readLines(testDir("issue_563_fread.txt")) - test(2194.5, DTendsWith(txt, 'B'), BSendsWith(txt, 'B')) -} +# endsWithAny added in #5097 for internal use replacing one use of base::endsWith (in fread.R) +test(2194.1, endsWithAny('abcd', 'd'), 1L) +test(2194.2, endsWithAny('ab.bz2', c('.gz','.bz2')), 2L) +test(2194.3, endsWithAny('ab.bz', c('.gz','.bz2')), FALSE) +test(2194.4, endsWithAny(letters, 'e'), error="Internal error.*types or lengths incorrect") +test(2194.5, endsWithAny(NA_character_, 'a'), FALSE) +test(2194.6, endsWithAny(character(), 'a'), error="Internal error.*types or lengths incorrect") +# file used in encoding tests +txt = readLines(testDir("issue_563_fread.txt")) +test(2194.7, endsWithAny(txt, 'B'), error="Internal error.*types or lengths incorrect") # txt is length 5 +test(2194.8, endsWith('abcd', 'd'), error="Internal error.*use endsWithAny") # uniqueN(x, by=character()) was internal error, #4594 DT = data.table(idx=c(1L,2L,1L,3L), value="val") diff --git a/src/init.c b/src/init.c index 49765ea482..910d675194 100644 --- a/src/init.c +++ b/src/init.c @@ -127,6 +127,7 @@ SEXP islockedR(); SEXP allNAR(); SEXP test_dt_win_snprintf(); SEXP dt_zlib_version(); +SEXP startsWithAny(); // .Externals SEXP fastmean(); @@ -222,6 +223,7 @@ R_CallMethodDef callMethods[] = { {"Ctest_dt_win_snprintf", (DL_FUNC)&test_dt_win_snprintf, -1}, {"Cdt_zlib_version", (DL_FUNC)&dt_zlib_version, -1}, {"Csubstitute_call_arg_namesR", (DL_FUNC) &substitute_call_arg_namesR, -1}, +{"CstartsWithAny", (DL_FUNC)&startsWithAny, -1}, {NULL, NULL, 0} }; diff --git a/src/utils.c b/src/utils.c index a1d9093b8d..6320ccd221 100644 --- a/src/utils.c +++ b/src/utils.c @@ -389,3 +389,30 @@ SEXP dt_zlib_version() { #endif return ScalarString(mkChar(out)); } + +SEXP startsWithAny(const SEXP x, const SEXP y, SEXP start) { + // for is_url in fread.R added in #5097 + // startsWith was added to R in 3.3.0 so we need something to support R 3.1.0 + // short and simple ascii-only + if (!isString(x) || !isString(y) || length(x)!=1 || length(y)<1 || !isLogical(start) || length(start)!=1 || LOGICAL(start)[0]==NA_LOGICAL) + error("Internal error: data.table's internal startsWithAny types or lengths incorrect"); + const char *xd = CHAR(STRING_ELT(x, 0)); + const int n=length(y); + if (LOGICAL(start)[0]) { + for (int i=0; i=ylen && strncmp(xd+xlen-ylen, yd, ylen)==0) + return ScalarInteger(i+1); + } + } + return ScalarLogical(false); +} + From 67b869d25a5fd132e4375e8644b9e3c86194c497 Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Mon, 9 Aug 2021 16:12:16 -0600 Subject: [PATCH 7/7] simplify --- R/fread.R | 72 +++++++++++++++++-------------------------- inst/tests/tests.Rraw | 3 +- 2 files changed, 30 insertions(+), 45 deletions(-) diff --git a/R/fread.R b/R/fread.R index f4b13b01a1..1bc0267b34 100644 --- a/R/fread.R +++ b/R/fread.R @@ -36,33 +36,6 @@ yaml=FALSE, autostart=NA, tmpdir=tempdir(), tz="UTC") ) nThread=as.integer(nThread) stopifnot(nThread>=1L) - - is_url = function(x) startsWithAny(x, c("https://", "http://", "file://", "ftps://", "ftp://")) - - download_file = function(input) { - # nocov start - if (startsWithAny(input, c("https://", "ftps://"))) { # avoid grepl() for #2531 - if (!requireNamespace("curl", quietly = TRUE)) - stopf("Input URL requires https:// connection for which fread() requires 'curl' package which cannot be found. Please install 'curl' using 'install.packages('curl')'.") # nocov - tmpFile = tempfile(fileext = paste0(".",tools::file_ext(input)), tmpdir=tmpdir) # retain .gz extension in temp filename so it knows to be decompressed further below - curl::curl_download(input, tmpFile, mode="wb", quiet = !showProgress) - } - else { - method = if (startsWith(input, "file://")) "internal" - else getOption("download.file.method", default="auto") # ftp:// or http:// - # force "auto" when file:// to ensure we don't use an invalid option (e.g. wget), #1668 - tmpFile = tempfile(fileext = paste0(".",tools::file_ext(input)), tmpdir=tmpdir) - download.file(input, tmpFile, method=method, mode="wb", quiet=!showProgress) - # In text mode on Windows-only, R doubles up \r to make \r\r\n line endings. mode="wb" avoids that. See ?connections:"CRLF" - } - do.call( - on.exit, list(substitute(unlink(tmpFile)), add = TRUE), - envir = parent.frame() - ) - return(tmpFile) - # nocov end - } - if (!is.null(text)) { if (!is.character(text)) stopf("'text=' is type %s but must be character.", typeof(text)) if (!length(text)) return(data.table()) @@ -81,22 +54,15 @@ yaml=FALSE, autostart=NA, tmpdir=tempdir(), tz="UTC") } if (input=="" || length(grep('\\n|\\r', input))) { # input is data itself containing at least one \n or \r - } else { - if (startsWith(input, " ")) { - stopf("input= contains no \\n or \\r, but starts with a space. Please remove the leading space, or use text=, file= or cmd=") - } - if (is_url(input)) { - file = download_file(input) - } - else if (length(grep(' ', input, fixed = TRUE)) && !file.exists(input)) { # file name or path containing spaces is not a command - cmd = input - if (input_has_vars && getOption("datatable.fread.input.cmd.message", TRUE)) { - messagef("Taking input= as a system command because it contains a space ('%s'). If it's a filename please remove the space, or use file= explicitly. A variable is being passed to input= and when this is taken as a system command there is a security concern if you are creating an app, the app could have a malicious user, and the app is not running in a secure environment; e.g. the app is running as root. Please read item 5 in the NEWS file for v1.11.6 for more information and for the option to suppress this message.", cmd) - } - } - else { - file = input # filename + } else if (startsWith(input, " ")) { + stopf("input= contains no \\n or \\r, but starts with a space. Please remove the leading space, or use text=, file= or cmd=") + } else if (length(grep(' ', input, fixed=TRUE)) && !file.exists(input)) { # file name or path containing spaces is not a command + cmd = input + if (input_has_vars && getOption("datatable.fread.input.cmd.message", TRUE)) { + messagef("Taking input= as a system command because it contains a space ('%s'). If it's a filename please remove the space, or use file= explicitly. A variable is being passed to input= and when this is taken as a system command there is a security concern if you are creating an app, the app could have a malicious user, and the app is not running in a secure environment; e.g. the app is running as root. Please read item 5 in the NEWS file for v1.11.6 for more information and for the option to suppress this message.", cmd) } + } else { + file = input # filename, including URLS } } if (!is.null(cmd)) { @@ -105,8 +71,26 @@ yaml=FALSE, autostart=NA, tmpdir=tempdir(), tz="UTC") on.exit(unlink(tmpFile), add=TRUE) } if (!is.null(file)) { - if (is.character(file) && is_url(file)) file = download_file(file) # use URL as input for argument file #4952 - + if (!is.character(file) || length(file)!=1L) + stopf("file= must be a single character string containing a filename, or URL starting 'http[s]://', 'ftp[s]://' or 'file://'") + if (w <- startsWithAny(file, c("https://", "ftps://", "http://", "ftp://", "file://"))) { # avoid grepl() for #2531 + # nocov start + tmpFile = tempfile(fileext = paste0(".",tools::file_ext(file)), tmpdir=tmpdir) # retain .gz extension in temp filename so it knows to be decompressed further below + if (w<=2L) { # https: or ftps: + if (!requireNamespace("curl", quietly = TRUE)) + stopf("URL requires https:// connection for which fread() requires 'curl' package which cannot be found. Please install 'curl' using 'install.packages('curl')'.") # nocov + + curl::curl_download(file, tmpFile, mode="wb", quiet = !showProgress) + } else { + method = if (w==5L) "internal" # force 'auto' when file: to ensure we don't use an invalid option (e.g. wget), #1668 + else getOption("download.file.method", default="auto") # http: or ftp: + download.file(file, tmpFile, method=method, mode="wb", quiet=!showProgress) + # In text mode on Windows-only, R doubles up \r to make \r\r\n line endings. mode="wb" avoids that. See ?connections:"CRLF" + } + file = tmpFile + on.exit(unlink(tmpFile), add=TRUE) + # nocov end + } file_info = file.info(file) if (is.na(file_info$size)) stopf("File '%s' does not exist or is non-readable. getwd()=='%s'", file, getwd()) if (isTRUE(file_info$isdir)) stopf("File '%s' is a directory. Not yet implemented.", file) # dir.exists() requires R v3.2+, #989 diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 4bbf7dcba8..ea8abd7537 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -13096,7 +13096,8 @@ test(1921.2, as.IDate(1000), as.IDate("1972-09-27")) f = tempfile() file.create(f) test(1922.1, fread(f), data.table(NULL), warning = 'File.*size 0') -test(1922.2, fread(file = f), data.table(NULL), warning = 'File.*size 0') +test(1922.21, fread(file = f), data.table(NULL), warning = 'File.*size 0') +test(1922.22, fread(file = 2L), error="file= must be a single character string containing") # trigger download for last instance of warning test(1922.3, fread(paste0('file://', f)), data.table(NULL), warning = 'File.*size 0') test(1922.35, fread(file = paste0('file://', f)), data.table(NULL), warning = 'File.*size 0')