diff --git a/r/NEWS.md b/r/NEWS.md index 1955ede5602..3c8cd79717e 100644 --- a/r/NEWS.md +++ b/r/NEWS.md @@ -22,6 +22,8 @@ ## Minor improvements and fixes - Added bindings for atan, sinh, cosh, tanh, asinh, acosh, and tanh, and expm1 (#44953) +- Expose an option `check_directory_existence_before_creation` in `S3FileSystem` + to reduce I/O calls on cloud storage (@HaochengLIU, #41998) # arrow 20.0.0 diff --git a/r/R/arrowExports.R b/r/R/arrowExports.R index 62e2182ffcd..901898e5b29 100644 --- a/r/R/arrowExports.R +++ b/r/R/arrowExports.R @@ -1372,8 +1372,8 @@ fs___CopyFiles <- function(source_fs, source_sel, destination_fs, destination_ba invisible(.Call(`_arrow_fs___CopyFiles`, source_fs, source_sel, destination_fs, destination_base_dir, chunk_size, use_threads)) } -fs___S3FileSystem__create <- function(anonymous, access_key, secret_key, session_token, role_arn, session_name, external_id, load_frequency, region, endpoint_override, scheme, proxy_options, background_writes, allow_bucket_creation, allow_bucket_deletion, connect_timeout, request_timeout) { - .Call(`_arrow_fs___S3FileSystem__create`, anonymous, access_key, secret_key, session_token, role_arn, session_name, external_id, load_frequency, region, endpoint_override, scheme, proxy_options, background_writes, allow_bucket_creation, allow_bucket_deletion, connect_timeout, request_timeout) +fs___S3FileSystem__create <- function(anonymous, access_key, secret_key, session_token, role_arn, session_name, external_id, load_frequency, region, endpoint_override, scheme, proxy_options, background_writes, allow_bucket_creation, allow_bucket_deletion, check_directory_existence_before_creation, connect_timeout, request_timeout) { + .Call(`_arrow_fs___S3FileSystem__create`, anonymous, access_key, secret_key, session_token, role_arn, session_name, external_id, load_frequency, region, endpoint_override, scheme, proxy_options, background_writes, allow_bucket_creation, allow_bucket_deletion, 
check_directory_existence_before_creation, connect_timeout, request_timeout) } fs___S3FileSystem__region <- function(fs) { diff --git a/r/R/dplyr-funcs-datetime.R b/r/R/dplyr-funcs-datetime.R index b8bc2b60638..77c6c9821e3 100644 --- a/r/R/dplyr-funcs-datetime.R +++ b/r/R/dplyr-funcs-datetime.R @@ -856,7 +856,8 @@ register_bindings_hms <- function() { Expression$create("multiply_checked", days, 86400) return(numeric_to_time32(total_secs)) - } + }, + notes = "subsecond times not supported" ) register_binding( @@ -880,6 +881,7 @@ register_bindings_hms <- function() { ) return(datetime_to_time32(as_date_time)) } - } + }, + notes = "subsecond times not supported" ) } diff --git a/r/R/dplyr-funcs-doc.R b/r/R/dplyr-funcs-doc.R index 64e4aab0f2c..7954cd55367 100644 --- a/r/R/dplyr-funcs-doc.R +++ b/r/R/dplyr-funcs-doc.R @@ -21,7 +21,7 @@ #' #' The `arrow` package contains methods for 37 `dplyr` table functions, many of #' which are "verbs" that do transformations to one or more tables. -#' The package also has mappings of 221 R functions to the corresponding +#' The package also has mappings of 222 R functions to the corresponding #' functions in the Arrow compute library. These allow you to write code inside #' of `dplyr` methods that call R functions, including many in packages like #' `stringr` and `lubridate`, and they will get translated to Arrow and run @@ -83,7 +83,7 @@ #' Functions can be called either as `pkg::fun()` or just `fun()`, i.e. both #' `str_sub()` and `stringr::str_sub()` work. #' -#' In addition to these functions, you can call any of Arrow's 271 compute +#' In addition to these functions, you can call any of Arrow's 280 compute #' functions directly. Arrow has many functions that don't map to an existing R #' function. 
In other cases where there is an R function mapping, you can still #' call the Arrow function directly if you don't want the adaptations that the R @@ -96,7 +96,6 @@ #' #' * [`add_filename()`][arrow::add_filename()] #' * [`cast()`][arrow::cast()] -#' * [`one()`][arrow::one()] #' #' ## base #' @@ -215,6 +214,11 @@ #' * [`n()`][dplyr::n()] #' * [`n_distinct()`][dplyr::n_distinct()] #' +#' ## hms +#' +#' * [`as_hms()`][hms::as_hms()]: subsecond times not supported +#' * [`hms()`][hms::hms()]: subsecond times not supported +#' #' ## lubridate #' #' * [`am()`][lubridate::am()] @@ -297,11 +301,6 @@ #' * [`ymd_hms()`][lubridate::ymd_hms()]: `locale` argument not supported #' * [`yq()`][lubridate::yq()]: `locale` argument not supported #' -#' ## hms -#' -#' * [`hms()`][hms::hms()]: subsecond times not supported -#' * [`hms()`][hms::as_hms()]: subsecond times not supported -#' #' ## methods #' #' * [`is()`][methods::is()] diff --git a/r/R/filesystem.R b/r/R/filesystem.R index 0176cdf846d..378e52bf741 100644 --- a/r/R/filesystem.R +++ b/r/R/filesystem.R @@ -156,6 +156,10 @@ FileSelector$create <- function(base_dir, allow_not_found = FALSE, recursive = F #' buckets if `$CreateDir()` is called on the bucket level (default `FALSE`). #' - `allow_bucket_deletion`: logical, if TRUE, the filesystem will delete #' buckets if`$DeleteDir()` is called on the bucket level (default `FALSE`). +#' - `check_directory_existence_before_creation`: logical, check if directory +#' already exists or not before creation. Helpful for cloud storage operations +#' where object mutation operations are rate limited or existing directories +#' are read-only. (default `FALSE`). #' - `request_timeout`: Socket read time on Windows and macOS in seconds. If #' negative, the AWS SDK default (typically 3 seconds). #' - `connect_timeout`: Socket connection timeout in seconds. If negative, AWS @@ -411,7 +415,8 @@ S3FileSystem$create <- function(anonymous = FALSE, ...) 
{ invalid_args <- intersect( c( "access_key", "secret_key", "session_token", "role_arn", "session_name", - "external_id", "load_frequency", "allow_bucket_creation", "allow_bucket_deletion" + "external_id", "load_frequency", "allow_bucket_creation", "allow_bucket_deletion", + "check_directory_existence_before_creation" ), names(args) ) @@ -459,6 +464,7 @@ default_s3_options <- list( background_writes = TRUE, allow_bucket_creation = FALSE, allow_bucket_deletion = FALSE, + check_directory_existence_before_creation = FALSE, connect_timeout = -1, request_timeout = -1 ) diff --git a/r/data-raw/docgen.R b/r/data-raw/docgen.R index 5f7f21b9029..45773d24659 100644 --- a/r/data-raw/docgen.R +++ b/r/data-raw/docgen.R @@ -149,6 +149,10 @@ tidyselect <- grep("^tidyselect::", readLines("R/reexports-tidyselect.R"), value # HACK: remove the _random_along UDF we're using (fix in ARROW-17974) docs[["_random_along"]] <- NULL +# TODO: update the script to add this back in. Keeping it fails CI because the +# docs try to link to arrow::one, which only exists as a registered binding +docs[["arrow::one"]] <- NULL + docs <- c(docs, setNames(rep(list(NULL), length(tidyselect)), tidyselect)) fun_df <- tibble::tibble( diff --git a/r/man/FileSystem.Rd b/r/man/FileSystem.Rd index dbf89ef1387..83e7fc65261 100644 --- a/r/man/FileSystem.Rd +++ b/r/man/FileSystem.Rd @@ -57,6 +57,10 @@ in the background, without blocking (default \code{TRUE}) buckets if \verb{$CreateDir()} is called on the bucket level (default \code{FALSE}). \item \code{allow_bucket_deletion}: logical, if TRUE, the filesystem will delete buckets if\verb{$DeleteDir()} is called on the bucket level (default \code{FALSE}). +\item \code{check_directory_existence_before_creation}: logical, check if directory +already exists or not before creation. Helpful for cloud storage operations +where object mutation operations are rate limited or existing directories +are read-only. (default \code{FALSE}). 
\item \code{request_timeout}: Socket read time on Windows and macOS in seconds. If negative, the AWS SDK default (typically 3 seconds). \item \code{connect_timeout}: Socket connection timeout in seconds. If negative, AWS diff --git a/r/man/acero.Rd b/r/man/acero.Rd index 345bb099d5a..238a57bdd41 100644 --- a/r/man/acero.Rd +++ b/r/man/acero.Rd @@ -9,7 +9,7 @@ \description{ The \code{arrow} package contains methods for 37 \code{dplyr} table functions, many of which are "verbs" that do transformations to one or more tables. -The package also has mappings of 212 R functions to the corresponding +The package also has mappings of 222 R functions to the corresponding functions in the Arrow compute library. These allow you to write code inside of \code{dplyr} methods that call R functions, including many in packages like \code{stringr} and \code{lubridate}, and they will get translated to Arrow and run @@ -29,7 +29,7 @@ Table into an R \code{tibble}. \item \code{\link[dplyr:compute]{collect()}} \item \code{\link[dplyr:compute]{compute()}} \item \code{\link[dplyr:count]{count()}} -\item \code{\link[dplyr:distinct]{distinct()}}: \code{.keep_all = TRUE} not supported +\item \code{\link[dplyr:distinct]{distinct()}}: \code{.keep_all = TRUE} returns a non-missing value if present, only returning missing values if all are missing. \item \code{\link[dplyr:explain]{explain()}} \item \code{\link[dplyr:filter]{filter()}} \item \code{\link[dplyr:mutate-joins]{full_join()}}: the \code{copy} argument is ignored @@ -71,7 +71,7 @@ can assume that the function works in Acero just as it does in R. Functions can be called either as \code{pkg::fun()} or just \code{fun()}, i.e. both \code{str_sub()} and \code{stringr::str_sub()} work. -In addition to these functions, you can call any of Arrow's 262 compute +In addition to these functions, you can call any of Arrow's 280 compute functions directly. Arrow has many functions that don't map to an existing R function. 
In other cases where there is an R function mapping, you can still call the Arrow function directly if you don't want the adaptations that the R @@ -108,6 +108,7 @@ as \code{arrow_ascii_is_decimal}. \item \code{\link[=^]{^}} \item \code{\link[base:MathFun]{abs()}} \item \code{\link[base:Trig]{acos()}} +\item \code{\link[base:Hyperbolic]{acosh()}} \item \code{\link[base:all]{all()}} \item \code{\link[base:any]{any()}} \item \code{\link[base:as.Date]{as.Date()}}: Multiple \code{tryFormats} not supported in Arrow. @@ -119,14 +120,19 @@ Consider using the lubridate specialised parsing functions \code{ymd()}, \code{y \item \code{\link[base:logical]{as.logical()}} \item \code{\link[base:numeric]{as.numeric()}} \item \code{\link[base:Trig]{asin()}} +\item \code{\link[base:Hyperbolic]{asinh()}} +\item \code{\link[base:Trig]{atan()}} +\item \code{\link[base:Hyperbolic]{atanh()}} \item \code{\link[base:Round]{ceiling()}} \item \code{\link[base:Trig]{cos()}} +\item \code{\link[base:Hyperbolic]{cosh()}} \item \code{\link[base:data.frame]{data.frame()}}: \code{row.names} and \code{check.rows} arguments not supported; \code{stringsAsFactors} must be \code{FALSE} \item \code{\link[base:difftime]{difftime()}}: only supports \code{units = "secs"} (the default); \code{tz} argument not supported \item \code{\link[base:startsWith]{endsWith()}} \item \code{\link[base:Log]{exp()}} +\item \code{\link[base:Log]{expm1()}} \item \code{\link[base:Round]{floor()}} \item \code{\link[base:format]{format()}} \item \code{\link[base:grep]{grepl()}} @@ -160,6 +166,7 @@ Consider using the lubridate specialised parsing functions \code{ymd()}, \code{y \item \code{\link[base:Round]{round()}} \item \code{\link[base:sign]{sign()}} \item \code{\link[base:Trig]{sin()}} +\item \code{\link[base:Hyperbolic]{sinh()}} \item \code{\link[base:MathFun]{sqrt()}} \item \code{\link[base:startsWith]{startsWith()}} \item \code{\link[base:strptime]{strftime()}} @@ -172,6 +179,7 @@ Valid values are "s", "ms" (default), 
"us", "ns". \item \code{\link[base:substr]{substring()}} \item \code{\link[base:sum]{sum()}} \item \code{\link[base:Trig]{tan()}} +\item \code{\link[base:Hyperbolic]{tanh()}} \item \code{\link[base:chartr]{tolower()}} \item \code{\link[base:chartr]{toupper()}} \item \code{\link[base:Round]{trunc()}} @@ -201,6 +209,13 @@ Valid values are "s", "ms" (default), "us", "ns". } } +\subsection{hms}{ +\itemize{ +\item \code{\link[hms:hms]{as_hms()}} +\item \code{\link[hms:hms]{hms()}} +} +} + \subsection{lubridate}{ \itemize{ \item \code{\link[lubridate:am]{am()}} @@ -285,13 +300,6 @@ On Linux and OS X additionally a, A, b, B, Om, p, r are available. } } -\subsection{hms}{ -\itemize{ -\item \code{\link[hms:hms]{hms()}}: subsecond times not supported -\item \code{\link[hms:hms]{hms()}}: subsecond times not supported -} -} - \subsection{methods}{ \itemize{ \item \code{\link[methods:is]{is()}} diff --git a/r/src/arrowExports.cpp b/r/src/arrowExports.cpp index d5aec50219e..e75d38a303f 100644 --- a/r/src/arrowExports.cpp +++ b/r/src/arrowExports.cpp @@ -3521,8 +3521,8 @@ END_CPP11 } // filesystem.cpp #if defined(ARROW_R_WITH_S3) -std::shared_ptr fs___S3FileSystem__create(bool anonymous, std::string access_key, std::string secret_key, std::string session_token, std::string role_arn, std::string session_name, std::string external_id, int load_frequency, std::string region, std::string endpoint_override, std::string scheme, std::string proxy_options, bool background_writes, bool allow_bucket_creation, bool allow_bucket_deletion, double connect_timeout, double request_timeout); -extern "C" SEXP _arrow_fs___S3FileSystem__create(SEXP anonymous_sexp, SEXP access_key_sexp, SEXP secret_key_sexp, SEXP session_token_sexp, SEXP role_arn_sexp, SEXP session_name_sexp, SEXP external_id_sexp, SEXP load_frequency_sexp, SEXP region_sexp, SEXP endpoint_override_sexp, SEXP scheme_sexp, SEXP proxy_options_sexp, SEXP background_writes_sexp, SEXP allow_bucket_creation_sexp, SEXP 
allow_bucket_deletion_sexp, SEXP connect_timeout_sexp, SEXP request_timeout_sexp){ +std::shared_ptr fs___S3FileSystem__create(bool anonymous, std::string access_key, std::string secret_key, std::string session_token, std::string role_arn, std::string session_name, std::string external_id, int load_frequency, std::string region, std::string endpoint_override, std::string scheme, std::string proxy_options, bool background_writes, bool allow_bucket_creation, bool allow_bucket_deletion, bool check_directory_existence_before_creation, double connect_timeout, double request_timeout); +extern "C" SEXP _arrow_fs___S3FileSystem__create(SEXP anonymous_sexp, SEXP access_key_sexp, SEXP secret_key_sexp, SEXP session_token_sexp, SEXP role_arn_sexp, SEXP session_name_sexp, SEXP external_id_sexp, SEXP load_frequency_sexp, SEXP region_sexp, SEXP endpoint_override_sexp, SEXP scheme_sexp, SEXP proxy_options_sexp, SEXP background_writes_sexp, SEXP allow_bucket_creation_sexp, SEXP allow_bucket_deletion_sexp, SEXP check_directory_existence_before_creation_sexp, SEXP connect_timeout_sexp, SEXP request_timeout_sexp){ BEGIN_CPP11 arrow::r::Input::type anonymous(anonymous_sexp); arrow::r::Input::type access_key(access_key_sexp); @@ -3539,13 +3539,14 @@ BEGIN_CPP11 arrow::r::Input::type background_writes(background_writes_sexp); arrow::r::Input::type allow_bucket_creation(allow_bucket_creation_sexp); arrow::r::Input::type allow_bucket_deletion(allow_bucket_deletion_sexp); + arrow::r::Input::type check_directory_existence_before_creation(check_directory_existence_before_creation_sexp); arrow::r::Input::type connect_timeout(connect_timeout_sexp); arrow::r::Input::type request_timeout(request_timeout_sexp); - return cpp11::as_sexp(fs___S3FileSystem__create(anonymous, access_key, secret_key, session_token, role_arn, session_name, external_id, load_frequency, region, endpoint_override, scheme, proxy_options, background_writes, allow_bucket_creation, allow_bucket_deletion, connect_timeout, 
request_timeout)); + return cpp11::as_sexp(fs___S3FileSystem__create(anonymous, access_key, secret_key, session_token, role_arn, session_name, external_id, load_frequency, region, endpoint_override, scheme, proxy_options, background_writes, allow_bucket_creation, allow_bucket_deletion, check_directory_existence_before_creation, connect_timeout, request_timeout)); END_CPP11 } #else -extern "C" SEXP _arrow_fs___S3FileSystem__create(SEXP anonymous_sexp, SEXP access_key_sexp, SEXP secret_key_sexp, SEXP session_token_sexp, SEXP role_arn_sexp, SEXP session_name_sexp, SEXP external_id_sexp, SEXP load_frequency_sexp, SEXP region_sexp, SEXP endpoint_override_sexp, SEXP scheme_sexp, SEXP proxy_options_sexp, SEXP background_writes_sexp, SEXP allow_bucket_creation_sexp, SEXP allow_bucket_deletion_sexp, SEXP connect_timeout_sexp, SEXP request_timeout_sexp){ +extern "C" SEXP _arrow_fs___S3FileSystem__create(SEXP anonymous_sexp, SEXP access_key_sexp, SEXP secret_key_sexp, SEXP session_token_sexp, SEXP role_arn_sexp, SEXP session_name_sexp, SEXP external_id_sexp, SEXP load_frequency_sexp, SEXP region_sexp, SEXP endpoint_override_sexp, SEXP scheme_sexp, SEXP proxy_options_sexp, SEXP background_writes_sexp, SEXP allow_bucket_creation_sexp, SEXP allow_bucket_deletion_sexp, SEXP check_directory_existence_before_creation_sexp, SEXP connect_timeout_sexp, SEXP request_timeout_sexp){ Rf_error("Cannot call fs___S3FileSystem__create(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. 
"); } #endif @@ -6013,7 +6014,7 @@ static const R_CallMethodDef CallEntries[] = { { "_arrow_fs___SubTreeFileSystem__base_path", (DL_FUNC) &_arrow_fs___SubTreeFileSystem__base_path, 1}, { "_arrow_fs___FileSystemFromUri", (DL_FUNC) &_arrow_fs___FileSystemFromUri, 1}, { "_arrow_fs___CopyFiles", (DL_FUNC) &_arrow_fs___CopyFiles, 6}, - { "_arrow_fs___S3FileSystem__create", (DL_FUNC) &_arrow_fs___S3FileSystem__create, 17}, + { "_arrow_fs___S3FileSystem__create", (DL_FUNC) &_arrow_fs___S3FileSystem__create, 18}, { "_arrow_fs___S3FileSystem__region", (DL_FUNC) &_arrow_fs___S3FileSystem__region, 1}, { "_arrow_FinalizeS3", (DL_FUNC) &_arrow_FinalizeS3, 0}, { "_arrow_fs___GcsFileSystem__Make", (DL_FUNC) &_arrow_fs___GcsFileSystem__Make, 2}, diff --git a/r/src/filesystem.cpp b/r/src/filesystem.cpp index f133013d20f..f553bee7e90 100644 --- a/r/src/filesystem.cpp +++ b/r/src/filesystem.cpp @@ -289,7 +289,8 @@ std::shared_ptr fs___S3FileSystem__create( std::string region = "", std::string endpoint_override = "", std::string scheme = "", std::string proxy_options = "", bool background_writes = true, bool allow_bucket_creation = false, bool allow_bucket_deletion = false, - double connect_timeout = -1, double request_timeout = -1) { + bool check_directory_existence_before_creation = false, double connect_timeout = -1, + double request_timeout = -1) { // We need to ensure that S3 is initialized before we start messing with the // options StopIfNotOk(fs::EnsureS3Initialized()); @@ -331,6 +332,9 @@ std::shared_ptr fs___S3FileSystem__create( s3_opts.allow_bucket_creation = allow_bucket_creation; s3_opts.allow_bucket_deletion = allow_bucket_deletion; + s3_opts.check_directory_existence_before_creation = + check_directory_existence_before_creation; + s3_opts.request_timeout = request_timeout; s3_opts.connect_timeout = connect_timeout; diff --git a/r/tests/testthat/test-s3-minio.R b/r/tests/testthat/test-s3-minio.R index 8dfac634716..99219f2a818 100644 --- 
a/r/tests/testthat/test-s3-minio.R +++ b/r/tests/testthat/test-s3-minio.R @@ -46,7 +46,8 @@ fs <- S3FileSystem$create( scheme = "http", endpoint_override = paste0("localhost:", minio_port), allow_bucket_creation = TRUE, - allow_bucket_deletion = TRUE + allow_bucket_deletion = TRUE, + check_directory_existence_before_creation = TRUE ) limited_fs <- S3FileSystem$create( access_key = minio_key, @@ -54,7 +55,8 @@ limited_fs <- S3FileSystem$create( scheme = "http", endpoint_override = paste0("localhost:", minio_port), allow_bucket_creation = FALSE, - allow_bucket_deletion = FALSE + allow_bucket_deletion = FALSE, + check_directory_existence_before_creation = FALSE ) now <- as.character(as.numeric(Sys.time())) fs$CreateDir(now) diff --git a/r/vignettes/fs.Rmd b/r/vignettes/fs.Rmd index 50278af25bd..07476877c5b 100644 --- a/r/vignettes/fs.Rmd +++ b/r/vignettes/fs.Rmd @@ -190,7 +190,7 @@ Also note that parameters in the URI need to be For S3, only the following options can be included in the URI as query parameters are `region`, `scheme`, `endpoint_override`, `access_key`, `secret_key`, `allow_bucket_creation`, -and `allow_bucket_deletion`. For GCS, the supported parameters are `scheme`, `endpoint_override`, +`allow_bucket_deletion`, and `check_directory_existence_before_creation`. For GCS, the supported parameters are `scheme`, `endpoint_override`, and `retry_limit_seconds`. In GCS, a useful option is `retry_limit_seconds`, which sets the number of seconds