diff --git a/dev/tasks/homebrew-formulae/travis.osx.r.yml b/dev/tasks/homebrew-formulae/travis.osx.r.yml index 020010111b1..d8bdb370478 100644 --- a/dev/tasks/homebrew-formulae/travis.osx.r.yml +++ b/dev/tasks/homebrew-formulae/travis.osx.r.yml @@ -45,9 +45,11 @@ before_install: - sed -i.bak -E -e 's@https://github.com/apache/arrow.git"$@{{ arrow.remote }}.git", :revision => "{{ arrow.head }}"@' tools/apache-arrow.rb && rm -f tools/apache-arrow.rb.bak # Sometimes crossbow gives a remote URL with .git and sometimes not. Make sure there's only one - sed -i.bak -E -e 's@.git.git@.git@' tools/apache-arrow.rb && rm -f tools/apache-arrow.rb.bak +# Get minio for S3 testing +- brew install minio/stable/minio script: -- Rscript -e 'install.packages("rcmdcheck")' +- Rscript -e 'install.packages(c("rcmdcheck", "sys"))' # Note that this is not --as-cran. CRAN doesn't do macOS checks --as-cran -- travis_wait Rscript -e "rcmdcheck::rcmdcheck(build_args = '--no-build-vignettes', args = c('--no-manual', '--ignore-vignettes', '--run-donttest'), error_on = 'warning', check_dir = 'check')" +- travis_wait Rscript -e "minio_dir <- tempfile(); dir.create(minio_dir); pid <- sys::exec_background('minio', c('server', minio_dir)); on.exit(tools::pskill(pid)); rcmdcheck::rcmdcheck(build_args = '--no-build-vignettes', args = c('--no-manual', '--ignore-vignettes', '--run-donttest'), error_on = 'warning', check_dir = 'check')" # If there's a build failure, it's probably in this log. Let's print it regardless though - cat check/arrow.Rcheck/00install.out diff --git a/docs/source/python/filesystems.rst b/docs/source/python/filesystems.rst index 553a9e45d41..184bd481fb0 100644 --- a/docs/source/python/filesystems.rst +++ b/docs/source/python/filesystems.rst @@ -43,7 +43,7 @@ and Amazon S3-compatible storage (:class:`S3FileSystem`). Usage ----- -A FileSystem object can be created with one of the constuctors (and check the +A FileSystem object can be created with one of the constructors (and check the respective constructor for its options):: >>> from pyarrow import fs diff --git a/r/NAMESPACE b/r/NAMESPACE index a5e83b1f801..ec71a80bad0 100644 --- a/r/NAMESPACE +++ b/r/NAMESPACE @@ -72,6 +72,7 @@ S3method(mean,Scalar) S3method(min,Array) S3method(min,ChunkedArray) S3method(names,Dataset) +S3method(names,FeatherReader) S3method(names,RecordBatch) S3method(names,ScannerBuilder) S3method(names,Schema) @@ -287,6 +288,7 @@ importFrom(rlang,enquos) importFrom(rlang,env) importFrom(rlang,env_bind) importFrom(rlang,eval_tidy) +importFrom(rlang,exec) importFrom(rlang,is_false) importFrom(rlang,is_integerish) importFrom(rlang,list2) @@ -309,6 +311,7 @@ importFrom(tidyselect,vars_rename) importFrom(tidyselect,vars_select) importFrom(utils,head) importFrom(utils,install.packages) +importFrom(utils,modifyList) importFrom(utils,tail) importFrom(vctrs,s3_register) importFrom(vctrs,vec_ptype_abbr) diff --git a/r/NEWS.md b/r/NEWS.md index 5b2eac82539..c03ff2f7487 100644 --- a/r/NEWS.md +++ b/r/NEWS.md @@ -28,7 +28,7 @@ ## AWS S3 support * S3 support is now enabled in binary macOS and Windows (Rtools40 only, i.e. R >= 4.0) packages. To enable it on Linux, you will need to build and install `aws-sdk-cpp` from source, then set the environment variable `EXTRA_CMAKE_FLAGS="-DARROW_S3=ON -DAWSSDK_SOURCE=SYSTEM"` prior to building the R package (with bundled C++ build, not with Arrow system libraries) from source. -* File readers and writers (`read_parquet()`, `write_feather()`, et al.) 
now accept an `s3://` URI as the source or destination file, as do `open_dataset()` and `write_dataset()`. See `vignette("fs", package = "arrow")` for details. +* File readers and writers (`read_parquet()`, `write_feather()`, et al.), as well as `open_dataset()` and `write_dataset()`, allow you to access resources on S3 (or on file systems that emulate S3) either by providing an `s3://` URI or by passing an additional `filesystem` argument. See `vignette("fs", package = "arrow")` for details. ## Computation diff --git a/r/R/arrow-package.R b/r/R/arrow-package.R index 5ab67937f32..2ffe7e8ecc9 100644 --- a/r/R/arrow-package.R +++ b/r/R/arrow-package.R @@ -18,7 +18,7 @@ #' @importFrom R6 R6Class #' @importFrom purrr as_mapper map map2 map_chr map_dfr map_int map_lgl #' @importFrom assertthat assert_that is.string -#' @importFrom rlang list2 %||% is_false abort dots_n warn enquo quo_is_null enquos is_integerish quos eval_tidy new_data_mask syms env env_bind as_label set_names +#' @importFrom rlang list2 %||% is_false abort dots_n warn enquo quo_is_null enquos is_integerish quos eval_tidy new_data_mask syms env env_bind as_label set_names exec #' @importFrom tidyselect vars_select #' @useDynLib arrow, .registration = TRUE #' @keywords internal diff --git a/r/R/arrowExports.R b/r/R/arrowExports.R index 7ae31f04f59..5e43b442aff 100644 --- a/r/R/arrowExports.R +++ b/r/R/arrowExports.R @@ -932,12 +932,8 @@ fs___CopyFiles <- function(src_fs, src_paths, dest_fs, dest_paths, chunk_size, u invisible(.Call(`_arrow_fs___CopyFiles` , src_fs, src_paths, dest_fs, dest_paths, chunk_size, use_threads)) } -fs___EnsureS3Initialized <- function(){ - invisible(.Call(`_arrow_fs___EnsureS3Initialized` )) -} - -fs___S3FileSystem__create <- function(){ - .Call(`_arrow_fs___S3FileSystem__create` ) +fs___S3FileSystem__create <- function(anonymous, access_key, secret_key, session_token, role_arn, session_name, external_id, load_frequency, region, endpoint_override, scheme, background_writes){ + .Call(`_arrow_fs___S3FileSystem__create` , anonymous, access_key, secret_key, session_token, role_arn, session_name, external_id, load_frequency, region, endpoint_override, scheme, background_writes) } io___Readable__Read <- function(x, nbytes){ diff --git a/r/R/csv.R b/r/R/csv.R index 9b16713e6d7..c4460d79f55 100644 --- a/r/R/csv.R +++ b/r/R/csv.R @@ -68,6 +68,8 @@ #' parsing options provided in other arguments (e.g. `delim`, `quote`, etc.). #' @param convert_options see [file reader options][CsvReadOptions] #' @param read_options see [file reader options][CsvReadOptions] +#' @param filesystem A [FileSystem] where `file` can be found if it is a +#' string file path; default is the local file system #' @param as_data_frame Should the function return a `data.frame` (default) or #' an Arrow [Table]? 
#' @@ -98,6 +100,7 @@ read_delim_arrow <- function(file, parse_options = NULL, convert_options = NULL, read_options = NULL, + filesystem = NULL, as_data_frame = TRUE) { if (is.null(parse_options)) { @@ -119,7 +122,7 @@ read_delim_arrow <- function(file, } if (!inherits(file, "InputStream")) { - file <- make_readable_file(file) + file <- make_readable_file(file, filesystem = filesystem) on.exit(file$close()) } reader <- CsvTableReader$create( @@ -206,7 +209,7 @@ read_tsv_arrow <- function(file, #' The `CsvTableReader$create()` and `JsonTableReader$create()` factory methods #' take the following arguments: #' -#' - `file` A character path to a local file, or an Arrow input stream +#' - `file` An Arrow [InputStream] #' - `convert_options` (CSV only), `parse_options`, `read_options`: see #' [CsvReadOptions] #' - `...` additional parameters. @@ -227,7 +230,7 @@ CsvTableReader$create <- function(file, parse_options = CsvParseOptions$create(), convert_options = CsvConvertOptions$create(), ...) { - file <- make_readable_file(file) + assert_is(file, "InputStream") shared_ptr( CsvTableReader, csv___TableReader__Make(file, read_options, parse_options, convert_options) diff --git a/r/R/feather.R b/r/R/feather.R index 7026de4dbab..e9656aa0901 100644 --- a/r/R/feather.R +++ b/r/R/feather.R @@ -25,6 +25,8 @@ #' #' @param x `data.frame`, [RecordBatch], or [Table] #' @param sink A string file path, URI, or [OutputStream] +#' @param filesystem A [FileSystem] where `sink` should be written if it is a +#' string file path; default is the local file system #' @param version integer Feather file version. Version 2 is the current. #' Version 1 is the more limited legacy format. #' @param chunk_size For V2 files, the number of rows that each chunk of data @@ -52,6 +54,7 @@ #' @include arrow-package.R write_feather <- function(x, sink, + filesystem = NULL, version = 2, chunk_size = 65536L, compression = c("default", "lz4", "uncompressed", "zstd"), @@ -106,7 +109,7 @@ write_feather <- function(x, assert_is(x, "Table") if (is.string(sink)) { - sink <- make_output_stream(sink) + sink <- make_output_stream(sink, filesystem) on.exit(sink$close()) } assert_is(sink, "OutputStream") @@ -141,17 +144,16 @@ write_feather <- function(x, #' # Can select columns #' df <- read_feather(tf, col_select = starts_with("d")) #' } -read_feather <- function(file, col_select = NULL, as_data_frame = TRUE, ...) { +read_feather <- function(file, col_select = NULL, as_data_frame = TRUE, filesystem = NULL, ...) { if (!inherits(file, "RandomAccessFile")) { - file <- make_readable_file(file) + file <- make_readable_file(file, filesystem = filesystem) on.exit(file$close()) } reader <- FeatherReader$create(file, ...) - all_columns <- ipc___feather___Reader__column_names(reader) col_select <- enquo(col_select) columns <- if (!quo_is_null(col_select)) { - vars_select(all_columns, !!col_select) + vars_select(names(reader), !!col_select) } out <- reader$Read(columns) @@ -198,10 +200,14 @@ FeatherReader <- R6Class("FeatherReader", inherit = ArrowObject, ), active = list( # versions are officially 2 for V1 and 3 for V2 :shrug: - version = function() ipc___feather___Reader__version(self) - 1L + version = function() ipc___feather___Reader__version(self) - 1L, + column_names = function() ipc___feather___Reader__column_names(self) ) ) +#' @export +names.FeatherReader <- function(x) x$column_names + FeatherReader$create <- function(file, mmap = TRUE, ...) 
{ assert_is(file, "RandomAccessFile") shared_ptr(FeatherReader, ipc___feather___Reader__Open(file)) diff --git a/r/R/filesystem.R b/r/R/filesystem.R index 3d7d6fb611a..339af190a24 100644 --- a/r/R/filesystem.R +++ b/r/R/filesystem.R @@ -122,11 +122,39 @@ FileSelector$create <- function(base_dir, allow_not_found = FALSE, recursive = F #' #' @section Factory: #' -#' The `$create()` factory methods instantiate the `FileSystem` object and -#' take the following arguments, depending on the subclass: +#' `LocalFileSystem$create()` returns the object and takes no arguments. #' -#' - no argument is needed for instantiating a `LocalFileSystem` -#' - `base_path` and `base_fs` for instantiating a `SubTreeFileSystem` +#' `SubTreeFileSystem$create()` takes the following arguments: +#' +#' - `base_path`, a string path +#' - `base_fs`, a `FileSystem` object +#' +#' `S3FileSystem$create()` optionally takes arguments: +#' +#' - `anonymous`: logical, default `FALSE`. If true, will not attempt to look up +#' credentials using standard AWS configuration methods. +#' - `access_key`, `secret_key`: authentication credentials. If one is provided, +#' the other must be as well. If both are provided, they will override any +#' AWS configuration set at the environment level. +#' - `session_token`: optional string for authentication along with +#' `access_key` and `secret_key` +#' - `role_arn`: string AWS ARN of an AccessRole. If provided instead of `access_key` and +#' `secret_key`, temporary credentials will be fetched by assuming this role. +#' - `session_name`: optional string identifier for the assumed role session. +#' - `external_id`: optional unique string identifier that might be required +#' when you assume a role in another account. +#' - `load_frequency`: integer, frequency (in seconds) with which temporary +#' credentials from an assumed role session will be refreshed. Default is +#' 900 (i.e. 15 minutes) +#' - `region`: AWS region to connect to. If omitted, the AWS library will +#' provide a sensible default based on client configuration, falling back +#' to "us-east-1" if no other alternatives are found. +#' - `endpoint_override`: If non-empty, override region with a connect string +#' such as "localhost:9000". This is useful for connecting to file systems +#' that emulate S3. +#' - `scheme`: S3 connection transport (default "https") +#' - `background_writes`: logical, whether `OutputStream` writes will be issued +#' in the background, without blocking (default `TRUE`) #' #' @section Methods: #' @@ -279,13 +307,56 @@ LocalFileSystem$create <- function() { #' @usage NULL #' @format NULL #' @rdname FileSystem +#' @importFrom utils modifyList #' @export S3FileSystem <- R6Class("S3FileSystem", inherit = FileSystem) -S3FileSystem$create <- function() { - fs___EnsureS3Initialized() - shared_ptr(S3FileSystem, fs___S3FileSystem__create()) +S3FileSystem$create <- function(anonymous = FALSE, ...) { + args <- list2(...) + if (anonymous) { + invalid_args <- intersect(c("access_key", "secret_key", "session_token", "role_arn", "session_name", "external_id", "load_frequency"), names(args)) + if (length(invalid_args)) { + stop("Cannot specify ", oxford_paste(invalid_args), " when anonymous = TRUE", call. = FALSE) + } + } else { + keys_present <- length(intersect(c("access_key", "secret_key"), names(args))) + if (keys_present == 1) { + stop("Key authentication requires both access_key and secret_key", call. 
= FALSE) + } + if ("session_token" %in% names(args) && keys_present != 2) { + stop( + "In order to initialize a session with temporary credentials, ", + "both secret_key and access_key must be provided ", + "in addition to session_token.", + call. = FALSE + ) + } + arn <- "role_arn" %in% names(args) + if (keys_present == 2 && arn) { + stop("Cannot provide both key authentication and role_arn", call. = FALSE) + } + arn_extras <- intersect(c("session_name", "external_id", "load_frequency"), names(args)) + if (length(arn_extras) > 0 && !arn) { + stop("Cannot specify ", oxford_paste(arn_extras), " without providing a role_arn string", call. = FALSE) + } + } + args <- c(modifyList(default_s3_options, args), anonymous = anonymous) + shared_ptr(S3FileSystem, exec(fs___S3FileSystem__create, !!!args)) } +default_s3_options <- list( + access_key = "", + secret_key = "", + session_token = "", + role_arn = "", + session_name = "", + external_id = "", + load_frequency = 900L, + region = "", + endpoint_override = "", + scheme = "", + background_writes = TRUE +) + arrow_with_s3 <- function() { .Call(`_s3_available`) } @@ -295,9 +366,12 @@ arrow_with_s3 <- function() { #' @rdname FileSystem #' @export SubTreeFileSystem <- R6Class("SubTreeFileSystem", inherit = FileSystem) -SubTreeFileSystem$create <- function(base_path, base_fs) { - xp <- fs___SubTreeFileSystem__create(clean_path_rel(base_path), base_fs) - shared_ptr(SubTreeFileSystem, xp) +SubTreeFileSystem$create <- function(base_path, base_fs = NULL) { + fs_and_path <- get_path_and_filesystem(base_path, base_fs) + shared_ptr( + SubTreeFileSystem, + fs___SubTreeFileSystem__create(fs_and_path$path, fs_and_path$fs) + ) } #' Copy files between FileSystems diff --git a/r/R/io.R b/r/R/io.R index 3b607a4e2b7..98b89f79bd7 100644 --- a/r/R/io.R +++ b/r/R/io.R @@ -257,12 +257,16 @@ make_readable_file <- function(file, mmap = TRUE, compression = NULL, filesystem file } -make_output_stream <- function(x) { +make_output_stream <- function(x, filesystem = NULL) { if (is_url(x)) { fs_and_path <- FileSystem$from_uri(x) - fs_and_path$fs$OpenOutputStream(fs_and_path$path) - } else { + filesystem = fs_and_path$fs + x <- fs_and_path$path + } + if (is.null(filesystem)) { FileOutputStream$create(x) + } else { + filesystem$OpenOutputStream(x) } } diff --git a/r/R/ipc_stream.R b/r/R/ipc_stream.R index 618ace52f49..be21157e292 100644 --- a/r/R/ipc_stream.R +++ b/r/R/ipc_stream.R @@ -35,13 +35,13 @@ #' serialize data to a buffer. #' [RecordBatchWriter] for a lower-level interface. #' @export -write_ipc_stream <- function(x, sink, ...) { +write_ipc_stream <- function(x, sink, filesystem = NULL, ...) { x_out <- x # So we can return the data we got if (is.data.frame(x)) { x <- Table$create(x) } if (is.string(sink)) { - sink <- make_output_stream(sink) + sink <- make_output_stream(sink, filesystem) on.exit(sink$close()) } assert_is(sink, "OutputStream") @@ -90,6 +90,8 @@ write_to_raw <- function(x, format = c("stream", "file")) { #' open. #' @param as_data_frame Should the function return a `data.frame` (default) or #' an Arrow [Table]? +#' @param filesystem A [FileSystem] where `file` can be found if it is a +#' string file path; default is the local file system #' @param ... extra parameters passed to `read_feather()`. #' #' @return A `data.frame` if `as_data_frame` is `TRUE` (the default), or an @@ -97,9 +99,9 @@ write_to_raw <- function(x, format = c("stream", "file")) { #' @seealso [read_feather()] for writing IPC files. [RecordBatchReader] for a #' lower-level interface. 
#' @export -read_ipc_stream <- function(file, as_data_frame = TRUE, ...) { +read_ipc_stream <- function(file, as_data_frame = TRUE, filesystem = NULL, ...) { if (!inherits(file, "InputStream")) { - file <- make_readable_file(file) + file <- make_readable_file(file, filesystem = filesystem) on.exit(file$close()) } diff --git a/r/R/json.R b/r/R/json.R index 006a7d7cf95..18c2888588d 100644 --- a/r/R/json.R +++ b/r/R/json.R @@ -35,7 +35,15 @@ #' ', tf, useBytes=TRUE) #' df <- read_json_arrow(tf) #' } -read_json_arrow <- function(file, col_select = NULL, as_data_frame = TRUE, ...) { +read_json_arrow <- function(file, + col_select = NULL, + as_data_frame = TRUE, + filesystem = NULL, + ...) { + if (!inherits(file, "InputStream")) { + file <- make_readable_file(file, filesystem = filesystem) + on.exit(file$close()) + } tab <- JsonTableReader$create(file, ...)$Read() col_select <- enquo(col_select) @@ -64,8 +72,7 @@ JsonTableReader$create <- function(file, read_options = JsonReadOptions$create(), parse_options = JsonParseOptions$create(), ...) { - - file <- make_readable_file(file) + assert_is(file, "InputStream") shared_ptr( JsonTableReader, json___TableReader__Make(file, read_options, parse_options) diff --git a/r/R/parquet.R b/r/R/parquet.R index 4e3287b8fbf..2b97a9ffae6 100644 --- a/r/R/parquet.R +++ b/r/R/parquet.R @@ -39,9 +39,10 @@ read_parquet <- function(file, col_select = NULL, as_data_frame = TRUE, props = ParquetReaderProperties$create(), + filesystem = NULL, ...) { if (is.string(file)) { - file <- make_readable_file(file) + file <- make_readable_file(file, filesystem = filesystem) on.exit(file$close()) } reader <- ParquetFileReader$create(file, props = props, ...) @@ -58,9 +59,10 @@ read_parquet <- function(file, #' [Parquet](https://parquet.apache.org/) is a columnar storage file format. #' This function enables you to write Parquet files from R. #' -#' @param x An [arrow::Table][Table], or an object convertible to it. -#' @param sink an [arrow::io::OutputStream][OutputStream] or a string -#' interpreted as a file path or URI +#' @param x `data.frame`, [RecordBatch], or [Table] +#' @param sink A string file path, URI, or [OutputStream] +#' @param filesystem A [FileSystem] where `sink` should be written if it is a +#' string file path; default is the local file system #' @param chunk_size chunk size in number of rows. If NULL, the total number of rows is used. #' @param version parquet version, "1.0" or "2.0". Default "1.0". Numeric values #' are coerced to character. @@ -112,6 +114,7 @@ read_parquet <- function(file, #' @export write_parquet <- function(x, sink, + filesystem = NULL, chunk_size = NULL, # writer properties version = NULL, @@ -130,7 +133,7 @@ write_parquet <- function(x, } if (is.string(sink)) { - sink <- make_output_stream(sink) + sink <- make_output_stream(sink, filesystem) on.exit(sink$close()) } else if (!inherits(sink, "OutputStream")) { abort("sink must be a file path or an OutputStream") diff --git a/r/README.md b/r/README.md index 6fb9cba7f35..6a417454f8b 100644 --- a/r/README.md +++ b/r/README.md @@ -172,6 +172,27 @@ isn’t found, you can explicitly provide the path to it like this by installing LLVM via Homebrew and running the script as `CLANG_FORMAT=$(brew --prefix llvm@8)/bin/clang-format ./lint.sh` +### Running tests + +Some tests are conditionally enabled based on the availability of certain +features in the package build (S3 support, compression libraries, etc.). 
+Others are generally skipped by default but can be enabled with environment +variables or other settings: + +* All tests are skipped on Linux if the package builds without the C++ libarrow. + To make the build fail if libarrow is not available (as in, to test that + the C++ build was successful), set `TEST_R_WITH_ARROW=TRUE` +* Some tests are disabled unless `ARROW_R_DEV=TRUE` +* Tests that require allocating >2GB of memory to test Large types are disabled + unless `ARROW_LARGE_MEMORY_TESTS=TRUE` +* Integration tests against a real S3 bucket are disabled unless credentials + are set in `AWS_ACCESS_KEY_ID` and `AWS_SECRET_ACCESS_KEY`; these are available + on request +* S3 tests using [MinIO](https://min.io/) locally are enabled if the + `minio server` process is found running. If you're running MinIO with custom + settings, you can set `MINIO_ACCESS_KEY`, `MINIO_SECRET_KEY`, and + `MINIO_PORT` to override the defaults. + ### Useful functions Within an R session, these can help with package development: diff --git a/r/man/CsvTableReader.Rd b/r/man/CsvTableReader.Rd index 8343653b862..1afa9d02016 100644 --- a/r/man/CsvTableReader.Rd +++ b/r/man/CsvTableReader.Rd @@ -16,7 +16,7 @@ and JSON table readers. See their usage in \code{\link[=read_csv_arrow]{read_csv The \code{CsvTableReader$create()} and \code{JsonTableReader$create()} factory methods take the following arguments: \itemize{ -\item \code{file} A character path to a local file, or an Arrow input stream +\item \code{file} An Arrow \link{InputStream} \item \code{convert_options} (CSV only), \code{parse_options}, \code{read_options}: see \link{CsvReadOptions} \item \code{...} additional parameters. diff --git a/r/man/FileSystem.Rd b/r/man/FileSystem.Rd index a675c5bfb4e..3ca945fa14a 100644 --- a/r/man/FileSystem.Rd +++ b/r/man/FileSystem.Rd @@ -16,11 +16,40 @@ to another implementation after prepending a fixed base path \section{Factory}{ -The \verb{$create()} factory methods instantiate the \code{FileSystem} object and -take the following arguments, depending on the subclass: +\code{LocalFileSystem$create()} returns the object and takes no arguments. + +\code{SubTreeFileSystem$create()} takes the following arguments: +\itemize{ +\item \code{base_path}, a string path +\item \code{base_fs}, a \code{FileSystem} object +} + +\code{S3FileSystem$create()} optionally takes arguments: \itemize{ -\item no argument is needed for instantiating a \code{LocalFileSystem} -\item \code{base_path} and \code{base_fs} for instantiating a \code{SubTreeFileSystem} +\item \code{anonymous}: logical, default \code{FALSE}. If true, will not attempt to look up +credentials using standard AWS configuration methods. +\item \code{access_key}, \code{secret_key}: authentication credentials. If one is provided, +the other must be as well. If both are provided, they will override any +AWS configuration set at the environment level. +\item \code{session_token}: optional string for authentication along with +\code{access_key} and \code{secret_key} +\item \code{role_arn}: string AWS ARN of an AccessRole. If provided instead of \code{access_key} and +\code{secret_key}, temporary credentials will be fetched by assuming this role. +\item \code{session_name}: optional string identifier for the assumed role session. +\item \code{external_id}: optional unique string identifier that might be required +when you assume a role in another account. 
+\item \code{load_frequency}: integer, frequency (in seconds) with which temporary +credentials from an assumed role session will be refreshed. Default is +900 (i.e. 15 minutes) +\item \code{region}: AWS region to connect to. If omitted, the AWS library will +provide a sensible default based on client configuration, falling back +to "us-east-1" if no other alternatives are found. +\item \code{endpoint_override}: If non-empty, override region with a connect string +such as "localhost:9000". This is useful for connecting to file systems +that emulate S3. +\item \code{scheme}: S3 connection transport (default "https") +\item \code{background_writes}: logical, whether \code{OutputStream} writes will be issued +in the background, without blocking (default \code{TRUE}) } } diff --git a/r/man/copy_files.Rd b/r/man/copy_files.Rd new file mode 100644 index 00000000000..770e86b27eb --- /dev/null +++ b/r/man/copy_files.Rd @@ -0,0 +1,24 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/filesystem.R +\name{copy_files} +\alias{copy_files} +\title{Copy files between FileSystems} +\usage{ +copy_files(src_fs, src_paths, dest_fs, dest_paths, chunk_size = 1024L * 1024L) +} +\arguments{ +\item{src_fs}{The FileSystem from which files will be copied.} + +\item{src_paths}{The paths of files to be copied.} + +\item{dest_fs}{The FileSystem into which files will be copied.} + +\item{dest_paths}{Where the copied files should be placed.} + +\item{chunk_size}{The maximum size of block to read before flushing +to the destination file. A larger chunk_size will use more memory while +copying but may help accommodate high latency FileSystems.} +} +\description{ +Copy files between FileSystems +} diff --git a/r/man/read_delim_arrow.Rd b/r/man/read_delim_arrow.Rd index abc2d4b0581..38251c71466 100644 --- a/r/man/read_delim_arrow.Rd +++ b/r/man/read_delim_arrow.Rd @@ -21,6 +21,7 @@ read_delim_arrow( parse_options = NULL, convert_options = NULL, read_options = NULL, + filesystem = NULL, as_data_frame = TRUE ) @@ -109,6 +110,9 @@ parsing options provided in other arguments (e.g. \code{delim}, \code{quote}, et \item{read_options}{see \link[=CsvReadOptions]{file reader options}} +\item{filesystem}{A \link{FileSystem} where \code{file} can be found if it is a +string file path; default is the local file system} + \item{as_data_frame}{Should the function return a \code{data.frame} (default) or an Arrow \link{Table}?} } diff --git a/r/man/read_feather.Rd b/r/man/read_feather.Rd index b84d07f6176..99c1845e2aa 100644 --- a/r/man/read_feather.Rd +++ b/r/man/read_feather.Rd @@ -4,7 +4,13 @@ \alias{read_feather} \title{Read a Feather file} \usage{ -read_feather(file, col_select = NULL, as_data_frame = TRUE, ...) +read_feather( + file, + col_select = NULL, + as_data_frame = TRUE, + filesystem = NULL, + ... +) } \arguments{ \item{file}{A character file name or URI, \code{raw} vector, or an Arrow input stream. 
@@ -20,6 +26,9 @@ of columns, as used in \code{dplyr::select()}.} \item{as_data_frame}{Should the function return a \code{data.frame} (default) or an Arrow \link{Table}?} +\item{filesystem}{A \link{FileSystem} where \code{file} can be found if it is a +string file path; default is the local file system} + \item{...}{additional parameters, passed to \link[=FeatherReader]{FeatherReader$create()}} } \value{ diff --git a/r/man/read_ipc_stream.Rd b/r/man/read_ipc_stream.Rd index 01b64350a8c..020f605aa65 100644 --- a/r/man/read_ipc_stream.Rd +++ b/r/man/read_ipc_stream.Rd @@ -7,7 +7,7 @@ \usage{ read_arrow(file, ...) -read_ipc_stream(file, as_data_frame = TRUE, ...) +read_ipc_stream(file, as_data_frame = TRUE, filesystem = NULL, ...) } \arguments{ \item{file}{A character file name or URI, \code{raw} vector, or an Arrow input stream. @@ -19,6 +19,9 @@ open.} \item{as_data_frame}{Should the function return a \code{data.frame} (default) or an Arrow \link{Table}?} + +\item{filesystem}{A \link{FileSystem} where \code{file} can be found if it is a +string file path; default is the local file system} } \value{ A \code{data.frame} if \code{as_data_frame} is \code{TRUE} (the default), or an diff --git a/r/man/read_json_arrow.Rd b/r/man/read_json_arrow.Rd index 8501b19c392..d835ec86e6e 100644 --- a/r/man/read_json_arrow.Rd +++ b/r/man/read_json_arrow.Rd @@ -4,7 +4,13 @@ \alias{read_json_arrow} \title{Read a JSON file} \usage{ -read_json_arrow(file, col_select = NULL, as_data_frame = TRUE, ...) +read_json_arrow( + file, + col_select = NULL, + as_data_frame = TRUE, + filesystem = NULL, + ... +) } \arguments{ \item{file}{A character file name or URI, \code{raw} vector, or an Arrow input stream. @@ -21,6 +27,9 @@ of columns, as used in \code{dplyr::select()}.} \item{as_data_frame}{Should the function return a \code{data.frame} (default) or an Arrow \link{Table}?} +\item{filesystem}{A \link{FileSystem} where \code{file} can be found if it is a +string file path; default is the local file system} + \item{...}{Additional options, passed to \code{json_table_reader()}} } \value{ diff --git a/r/man/read_parquet.Rd b/r/man/read_parquet.Rd index f4a3897643c..89cae7809f0 100644 --- a/r/man/read_parquet.Rd +++ b/r/man/read_parquet.Rd @@ -9,6 +9,7 @@ read_parquet( col_select = NULL, as_data_frame = TRUE, props = ParquetReaderProperties$create(), + filesystem = NULL, ... ) } @@ -28,6 +29,9 @@ an Arrow \link{Table}?} \item{props}{\link{ParquetReaderProperties}} +\item{filesystem}{A \link{FileSystem} where \code{file} can be found if it is a +string file path; default is the local file system} + \item{...}{Additional arguments passed to \code{ParquetFileReader$create()}} } \value{ diff --git a/r/man/write_feather.Rd b/r/man/write_feather.Rd index e079aeb8934..60b31cd8019 100644 --- a/r/man/write_feather.Rd +++ b/r/man/write_feather.Rd @@ -7,6 +7,7 @@ write_feather( x, sink, + filesystem = NULL, version = 2, chunk_size = 65536L, compression = c("default", "lz4", "uncompressed", "zstd"), @@ -18,6 +19,9 @@ write_feather( \item{sink}{A string file path, URI, or \link{OutputStream}} +\item{filesystem}{A \link{FileSystem} where \code{sink} should be written if it is a +string file path; default is the local file system} + \item{version}{integer Feather file version. Version 2 is the current. 
Version 1 is the more limited legacy format.} diff --git a/r/man/write_ipc_stream.Rd b/r/man/write_ipc_stream.Rd index 8274eddb3b1..5b73a911d10 100644 --- a/r/man/write_ipc_stream.Rd +++ b/r/man/write_ipc_stream.Rd @@ -7,7 +7,7 @@ \usage{ write_arrow(x, sink, ...) -write_ipc_stream(x, sink, ...) +write_ipc_stream(x, sink, filesystem = NULL, ...) } \arguments{ \item{x}{\code{data.frame}, \link{RecordBatch}, or \link{Table}} @@ -15,6 +15,9 @@ write_ipc_stream(x, sink, ...) \item{sink}{A string file path, URI, or \link{OutputStream}} \item{...}{extra parameters passed to \code{write_feather()}.} + +\item{filesystem}{A \link{FileSystem} where \code{sink} should be written if it is a +string file path; default is the local file system} } \value{ \code{x}, invisibly. diff --git a/r/man/write_parquet.Rd b/r/man/write_parquet.Rd index f532ce06c4c..bb70502c2ba 100644 --- a/r/man/write_parquet.Rd +++ b/r/man/write_parquet.Rd @@ -7,6 +7,7 @@ write_parquet( x, sink, + filesystem = NULL, chunk_size = NULL, version = NULL, compression = default_parquet_compression(), @@ -20,10 +21,12 @@ write_parquet( ) } \arguments{ -\item{x}{An \link[=Table]{arrow::Table}, or an object convertible to it.} +\item{x}{\code{data.frame}, \link{RecordBatch}, or \link{Table}} -\item{sink}{an \link[=OutputStream]{arrow::io::OutputStream} or a string -interpreted as a file path or URI} +\item{sink}{A string file path, URI, or \link{OutputStream}} + +\item{filesystem}{A \link{FileSystem} where \code{sink} should be written if it is a +string file path; default is the local file system} \item{chunk_size}{chunk size in number of rows. If NULL, the total number of rows is used.} diff --git a/r/src/arrowExports.cpp b/r/src/arrowExports.cpp index 39701da7d9d..6b58de07927 100644 --- a/r/src/arrowExports.cpp +++ b/r/src/arrowExports.cpp @@ -3631,29 +3631,26 @@ extern "C" SEXP _arrow_fs___CopyFiles(SEXP src_fs_sexp, SEXP src_paths_sexp, SEX // filesystem.cpp #if defined(ARROW_R_WITH_S3) -void fs___EnsureS3Initialized(); -extern "C" SEXP _arrow_fs___EnsureS3Initialized(){ -BEGIN_CPP11 - fs___EnsureS3Initialized(); - return R_NilValue; -END_CPP11 -} -#else -extern "C" SEXP _arrow_fs___EnsureS3Initialized(){ - Rf_error("Cannot call fs___EnsureS3Initialized(). Please use arrow::install_arrow() to install required runtime libraries. 
"); -} -#endif - -// filesystem.cpp -#if defined(ARROW_R_WITH_S3) -std::shared_ptr fs___S3FileSystem__create(); -extern "C" SEXP _arrow_fs___S3FileSystem__create(){ -BEGIN_CPP11 - return cpp11::as_sexp(fs___S3FileSystem__create()); -END_CPP11 -} -#else -extern "C" SEXP _arrow_fs___S3FileSystem__create(){ +std::shared_ptr fs___S3FileSystem__create(bool anonymous, std::string access_key, std::string secret_key, std::string session_token, std::string role_arn, std::string session_name, std::string external_id, int load_frequency, std::string region, std::string endpoint_override, std::string scheme, bool background_writes); +extern "C" SEXP _arrow_fs___S3FileSystem__create(SEXP anonymous_sexp, SEXP access_key_sexp, SEXP secret_key_sexp, SEXP session_token_sexp, SEXP role_arn_sexp, SEXP session_name_sexp, SEXP external_id_sexp, SEXP load_frequency_sexp, SEXP region_sexp, SEXP endpoint_override_sexp, SEXP scheme_sexp, SEXP background_writes_sexp){ +BEGIN_CPP11 + arrow::r::Input::type anonymous(anonymous_sexp); + arrow::r::Input::type access_key(access_key_sexp); + arrow::r::Input::type secret_key(secret_key_sexp); + arrow::r::Input::type session_token(session_token_sexp); + arrow::r::Input::type role_arn(role_arn_sexp); + arrow::r::Input::type session_name(session_name_sexp); + arrow::r::Input::type external_id(external_id_sexp); + arrow::r::Input::type load_frequency(load_frequency_sexp); + arrow::r::Input::type region(region_sexp); + arrow::r::Input::type endpoint_override(endpoint_override_sexp); + arrow::r::Input::type scheme(scheme_sexp); + arrow::r::Input::type background_writes(background_writes_sexp); + return cpp11::as_sexp(fs___S3FileSystem__create(anonymous, access_key, secret_key, session_token, role_arn, session_name, external_id, load_frequency, region, endpoint_override, scheme, background_writes)); +END_CPP11 +} +#else +extern "C" SEXP _arrow_fs___S3FileSystem__create(SEXP anonymous_sexp, SEXP access_key_sexp, SEXP secret_key_sexp, SEXP session_token_sexp, SEXP role_arn_sexp, SEXP session_name_sexp, SEXP external_id_sexp, SEXP load_frequency_sexp, SEXP region_sexp, SEXP endpoint_override_sexp, SEXP scheme_sexp, SEXP background_writes_sexp){ Rf_error("Cannot call fs___S3FileSystem__create(). Please use arrow::install_arrow() to install required runtime libraries. 
"); } #endif @@ -6249,8 +6246,7 @@ static const R_CallMethodDef CallEntries[] = { { "_arrow_fs___SubTreeFileSystem__create", (DL_FUNC) &_arrow_fs___SubTreeFileSystem__create, 2}, { "_arrow_fs___FileSystemFromUri", (DL_FUNC) &_arrow_fs___FileSystemFromUri, 1}, { "_arrow_fs___CopyFiles", (DL_FUNC) &_arrow_fs___CopyFiles, 6}, - { "_arrow_fs___EnsureS3Initialized", (DL_FUNC) &_arrow_fs___EnsureS3Initialized, 0}, - { "_arrow_fs___S3FileSystem__create", (DL_FUNC) &_arrow_fs___S3FileSystem__create, 0}, + { "_arrow_fs___S3FileSystem__create", (DL_FUNC) &_arrow_fs___S3FileSystem__create, 12}, { "_arrow_io___Readable__Read", (DL_FUNC) &_arrow_io___Readable__Read, 2}, { "_arrow_io___InputStream__Close", (DL_FUNC) &_arrow_io___InputStream__Close, 1}, { "_arrow_io___OutputStream__Close", (DL_FUNC) &_arrow_io___OutputStream__Close, 1}, diff --git a/r/src/filesystem.cpp b/r/src/filesystem.cpp index c37410bcbe2..c28a28b2448 100644 --- a/r/src/filesystem.cpp +++ b/r/src/filesystem.cpp @@ -253,12 +253,43 @@ void fs___CopyFiles(const std::shared_ptr& src_fs, #include // [[s3::export]] -void fs___EnsureS3Initialized() { StopIfNotOk(fs::EnsureS3Initialized()); } +std::shared_ptr fs___S3FileSystem__create( + bool anonymous = false, std::string access_key = "", std::string secret_key = "", + std::string session_token = "", std::string role_arn = "", + std::string session_name = "", std::string external_id = "", int load_frequency = 900, + std::string region = "", std::string endpoint_override = "", std::string scheme = "", + bool background_writes = true) { + fs::S3Options s3_opts; + // Handle auth (anonymous, keys, default) + // (validation/internal coherence handled in R) + if (anonymous) { + s3_opts = fs::S3Options::Anonymous(); + } else if (access_key != "" && secret_key != "") { + s3_opts = fs::S3Options::FromAccessKey(access_key, secret_key, session_token); + } else if (role_arn != "") { + s3_opts = fs::S3Options::FromAssumeRole(role_arn, session_name, external_id, + load_frequency); + } else { + s3_opts = fs::S3Options::Defaults(); + } -// [[s3::export]] -std::shared_ptr fs___S3FileSystem__create() { - auto opts = fs::S3Options::Defaults(); - return ValueOrStop(fs::S3FileSystem::Make(opts)); + // Now handle the rest of the options + /// AWS region to connect to (default determined by AWS SDK) + if (region != "") { + s3_opts.region = region; + } + /// If non-empty, override region with a connect string such as "localhost:9000" + s3_opts.endpoint_override = endpoint_override; + /// S3 connection transport, default "https" + if (scheme != "") { + s3_opts.scheme = scheme; + } + /// Whether OutputStream writes will be issued in the background, without blocking + /// default true + s3_opts.background_writes = background_writes; + + StopIfNotOk(fs::EnsureS3Initialized()); + return ValueOrStop(fs::S3FileSystem::Make(s3_opts)); } #endif diff --git a/r/tests/testthat/helper-skip.R b/r/tests/testthat/helper-skip.R index 94bfd49efad..366cc75fb27 100644 --- a/r/tests/testthat/helper-skip.R +++ b/r/tests/testthat/helper-skip.R @@ -43,3 +43,8 @@ skip_if_not_running_large_memory_tests <- function() { "environment variable ARROW_LARGE_MEMORY_TESTS" ) } + +process_is_running <- function(x) { + cmd <- sprintf("ps aux | grep '%s' | grep -v grep", x) + tryCatch(system(cmd, ignore.stdout = TRUE) == 0, error = function(e) FALSE) +} diff --git a/r/tests/testthat/test-filesystem.R b/r/tests/testthat/test-filesystem.R index fc0e02f34a1..255465d2b10 100644 --- a/r/tests/testthat/test-filesystem.R +++ 
b/r/tests/testthat/test-filesystem.R
@@ -131,6 +131,13 @@ test_that("FileSystem$from_uri", {
  expect_is(fs_and_path$fs, "S3FileSystem")
})

+test_that("SubTreeFileSystem$create() with URI", {
+  skip_on_cran()
+  skip_if_not_available("s3")
+  fs <- SubTreeFileSystem$create("s3://ursa-labs-taxi-data")
+  expect_is(fs, "SubTreeFileSystem")
+})
+
test_that("S3FileSystem", {
  skip_on_cran()
  skip_if_not_available("s3")
diff --git a/r/tests/testthat/test-s3-minio.R b/r/tests/testthat/test-s3-minio.R
new file mode 100644
index 00000000000..63902aac40a
--- /dev/null
+++ b/r/tests/testthat/test-s3-minio.R
@@ -0,0 +1,165 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+context("S3 tests using local minio")
+
+if (arrow_with_s3() && process_is_running("minio server")) {
+  # Get minio config, with expected defaults
+  minio_key <- Sys.getenv("MINIO_ACCESS_KEY", "minioadmin")
+  minio_secret <- Sys.getenv("MINIO_SECRET_KEY", "minioadmin")
+  minio_port <- Sys.getenv("MINIO_PORT", "9000")
+
+  # Helper function for minio URIs
+  minio_uri <- function(...) {
+    template <- "s3://%s:%s@%s?scheme=http&endpoint_override=localhost%s%s"
+    sprintf(template, minio_key, minio_secret, minio_path(...), "%3A", minio_port)
+  }
+  minio_path <- function(...) paste(now, ..., sep = "/")
+
+  # Create a "bucket" on minio for this test run, which we'll delete when done.
+  # This setup is at the top level, not wrapped in test_that(), so that the
+  # `fs` and `now` objects are visible to minio_path() and to all of the
+  # tests below.
+  fs <- S3FileSystem$create(
+    access_key = minio_key,
+    secret_key = minio_secret,
+    scheme = "http",
+    endpoint_override = paste0("localhost:", minio_port)
+  )
+  now <- as.character(as.numeric(Sys.time()))
+  # If minio isn't running, this will hang for a few seconds and fail with a
+  # curl timeout, and the tests below will error rather than run
+  fs$CreateDir(now)
+  # Clean up when we're all done
+  on.exit(fs$DeleteDir(now))
+
+  test_that("read/write Feather on minio", {
+    write_feather(example_data, minio_uri("test.feather"))
+    expect_identical(read_feather(minio_uri("test.feather")), example_data)
+  })
+
+  test_that("read/write Feather by filesystem, not URI", {
+    write_feather(example_data, minio_path("test2.feather"), filesystem = fs)
+    expect_identical(
+      read_feather(minio_path("test2.feather"), filesystem = fs),
+      example_data
+    )
+  })
+
+  test_that("read/write stream", {
+    write_ipc_stream(example_data, minio_path("test3.ipc"), filesystem = fs)
+    expect_identical(
+      read_ipc_stream(minio_path("test3.ipc"), filesystem = fs),
+      example_data
+    )
+  })
+
+  test_that("read/write Parquet on minio", {
+    write_parquet(example_data, minio_uri("test.parquet"))
+    expect_identical(read_parquet(minio_uri("test.parquet")), example_data)
+  })
+
+  # Dataset test setup, cf.
test-dataset.R + library(dplyr) + first_date <- lubridate::ymd_hms("2015-04-29 03:12:39") + df1 <- tibble( + int = 1:10, + dbl = as.numeric(1:10), + lgl = rep(c(TRUE, FALSE, NA, TRUE, FALSE), 2), + chr = letters[1:10], + fct = factor(LETTERS[1:10]), + ts = first_date + lubridate::days(1:10) + ) + + second_date <- lubridate::ymd_hms("2017-03-09 07:01:02") + df2 <- tibble( + int = 101:110, + dbl = as.numeric(51:60), + lgl = rep(c(TRUE, FALSE, NA, TRUE, FALSE), 2), + chr = letters[10:1], + fct = factor(LETTERS[10:1]), + ts = second_date + lubridate::days(10:1) + ) + + # This is also to set up the dataset tests + test_that("write_parquet with filesystem arg", { + fs$CreateDir(minio_path("hive_dir", "group=1", "other=xxx")) + fs$CreateDir(minio_path("hive_dir", "group=2", "other=yyy")) + expect_length(fs$GetFileInfo(FileSelector$create(minio_path("hive_dir"))), 2) + write_parquet(df1, minio_path("hive_dir", "group=1", "other=xxx", "file1.parquet"), filesystem = fs) + write_parquet(df2, minio_path("hive_dir", "group=2", "other=yyy", "file2.parquet"), filesystem = fs) + expect_identical( + read_parquet(minio_path("hive_dir", "group=1", "other=xxx", "file1.parquet"), filesystem = fs), + df1 + ) + }) + + test_that("open_dataset with fs", { + ds <- open_dataset(minio_path("hive_dir"), filesystem = fs) + expect_identical( + ds %>% select(dbl, lgl) %>% collect(), + rbind(df1[, c("dbl", "lgl")], df2[, c("dbl", "lgl")]) + ) + }) + + test_that("write_dataset with fs", { + ds <- open_dataset(minio_path("hive_dir"), filesystem = fs) + write_dataset(ds, minio_path("new_dataset_dir"), filesystem = fs) + expect_length(fs$GetFileInfo(FileSelector$create(minio_path("new_dataset_dir"))), 2) + }) + + test_that("S3FileSystem input validation", { + expect_error( + S3FileSystem$create(access_key = "foo"), + "Key authentication requires both access_key and secret_key" + ) + expect_error( + S3FileSystem$create(secret_key = "foo"), + "Key authentication requires both access_key and secret_key" + ) + expect_error( + S3FileSystem$create(session_token = "foo"), + paste0( + "In order to initialize a session with temporary credentials, ", + "both secret_key and access_key must be provided ", + "in addition to session_token." + ) + ) + expect_error( + S3FileSystem$create(access_key = "foo", secret_key = "asdf", anonymous = TRUE), + 'Cannot specify "access_key" and "secret_key" when anonymous = TRUE' + ) + expect_error( + S3FileSystem$create(access_key = "foo", secret_key = "asdf", role_arn = "qwer"), + "Cannot provide both key authentication and role_arn" + ) + expect_error( + S3FileSystem$create(access_key = "foo", secret_key = "asdf", external_id = "qwer"), + 'Cannot specify "external_id" without providing a role_arn string' + ) + expect_error( + S3FileSystem$create(external_id = "foo"), + 'Cannot specify "external_id" without providing a role_arn string' + ) + }) +} else { + # Kinda hacky, let's put a skipped test here, just so we note that the tests + # didn't run + test_that("S3FileSystem tests with Minio", { + skip("Minio is not running") + }) +} diff --git a/r/vignettes/fs.Rmd b/r/vignettes/fs.Rmd index 03730bc1269..5c2ece1cca4 100644 --- a/r/vignettes/fs.Rmd +++ b/r/vignettes/fs.Rmd @@ -19,41 +19,109 @@ using Arrow. ## URIs File readers and writers (`read_parquet()`, `write_feather()`, et al.) -now accept an S3 URI as the source or destination file, +accept an S3 URI as the source or destination file, as do `open_dataset()` and `write_dataset()`. 
An S3 URI looks like:

```
-s3://[id:secret@]bucket/path[?region=]
+s3://[access_key:secret_key@]bucket/path[?region=]
```

For example, one of the NYC taxi data files used in `vignette("dataset", package = "arrow")` is found at

```
-s3://ursa-labs-taxi-data/2019/06/data.parquet?region=us-east-2
+s3://ursa-labs-taxi-data/2019/06/data.parquet
```

-`region` defaults to `us-east-1` and can be omitted if the bucket is in that region.
-
Given this URI, we can pass it to `read_parquet()` just as if it were a local file path:

```r
-df <- read_parquet("s3://ursa-labs-taxi-data/2019/06/data.parquet?region=us-east-2")
+df <- read_parquet("s3://ursa-labs-taxi-data/2019/06/data.parquet")
```

Note that this will be slower to read than if the file were local,
though if you're running on a machine in the same AWS region as the file in S3,
the cost of reading the data over the network should be much lower.

+## Creating a FileSystem object
+
+Another way to connect to S3 is to create an `S3FileSystem` object once and pass
+that to the read/write functions. This can be convenient when dealing with
+long URIs, and it's necessary for some options and authentication methods
+that aren't supported in the URI format.
+
+In the previous example, this would look like:
+
+```r
+fs <- S3FileSystem$create(region = "us-east-2")
+df <- read_parquet("ursa-labs-taxi-data/2019/06/data.parquet", filesystem = fs)
+```
+
+See the help for `FileSystem` for a list of options that `S3FileSystem$create()`
+can take. `region`, `scheme`, and `endpoint_override` can be encoded as query
+parameters in the URI (though `region` will be auto-detected from the bucket URI if omitted),
+and `access_key` and `secret_key` can also be included,
+but other options are not supported in the URI.
+
+Using the `SubTreeFileSystem` class, you can represent an S3 bucket or
+a subdirectory inside one.
+
+```r
+bucket <- SubTreeFileSystem$create(
+  "ursa-labs-taxi-data",
+  S3FileSystem$create(region = "us-east-2")
+)
+df <- read_parquet("2019/06/data.parquet", filesystem = bucket)
+```
+
+`SubTreeFileSystem` can also be made from a URI:
+
+```r
+bucket <- SubTreeFileSystem$create("s3://ursa-labs-taxi-data")
+```
+
 ## Authentication

-To access private S3 buckets, you need two secret parameters:
-a `AWS_ACCESS_KEY_ID`, which is like a user id,
-and `AWS_SECRET_ACCESS_KEY`, like a token.
+To access private S3 buckets, you typically need two secret parameters:
+an `access_key`, which is like a user id,
+and a `secret_key`, which is like a token.
 There are a few options for passing these credentials:

-1. Include them in the URI, like `s3://AWS_ACCESS_KEY_ID:AWS_SECRET_ACCESS_KEY@bucket-name/path/to/file`. Be sure to [URL-encode](https://en.wikipedia.org/wiki/Percent-encoding) your secrets if they contain special characters like "/".
+1. Include them in the URI, like `s3://access_key:secret_key@bucket-name/path/to/file`. Be sure to [URL-encode](https://en.wikipedia.org/wiki/Percent-encoding) your secrets if they contain special characters like "/".
+
+2. Pass them as `access_key` and `secret_key` to `S3FileSystem$create()`, as shown in the example below.
+
+3. Set them as environment variables named `AWS_ACCESS_KEY_ID` and `AWS_SECRET_ACCESS_KEY`, respectively.
+
+4. Define them in a `~/.aws/credentials` file, according to the [AWS documentation](https://docs.aws.amazon.com/sdk-for-cpp/v1/developer-guide/credentials.html).
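+
+For example, options 2 and 3 might look like the following; the key values here
+are placeholders, not real credentials:
+
+```r
+# Option 2: pass the credentials directly
+fs <- S3FileSystem$create(access_key = "AKIAIOSFODNN7EXAMPLE", secret_key = "...")
+
+# Option 3: set the standard AWS environment variables, then create the filesystem
+Sys.setenv(
+  AWS_ACCESS_KEY_ID = "AKIAIOSFODNN7EXAMPLE",
+  AWS_SECRET_ACCESS_KEY = "..."
+)
+fs <- S3FileSystem$create()
+```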
+
+You can also use an [AccessRole](https://docs.aws.amazon.com/STS/latest/APIReference/API_AssumeRole.html)
+for temporary access by passing the `role_arn` identifier to `S3FileSystem$create()`.
+
+## File systems that emulate S3
+
+The `S3FileSystem` machinery enables you to work with any file system that
+provides an S3-compatible interface. For example, [MinIO](https://min.io/) is
+an object-storage server that emulates the S3 API. If you were to
+run `minio server` locally with its default settings, you could connect to
+it with `arrow` using `S3FileSystem` like this:
+
+```r
+minio <- S3FileSystem$create(
+  access_key = "minioadmin",
+  secret_key = "minioadmin",
+  scheme = "http",
+  endpoint_override = "localhost:9000"
+)
+```
+
+or, as a URI, it would be
+
+```
+s3://minioadmin:minioadmin@?scheme=http&endpoint_override=localhost%3A9000
+```
-2. Set them as environment variables named `AWS_ACCESS_KEY_ID` and `AWS_SECRET_ACCESS_KEY`.
+(note the URL escaping of the `:` in `endpoint_override`).
-3. Define them in a `~/.aws/credentials` file, according to the [AWS documentation](https://docs.aws.amazon.com/sdk-for-cpp/v1/developer-guide/credentials.html).
+
+Among other applications, this can be useful for testing out code locally before
+running on a remote S3 bucket.
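+
+For example, assuming a local `minio server` is running as described above, the
+same read/write functions work against it. The bucket name used here is only a
+placeholder that you would create first:
+
+```r
+# Create a "bucket" to write into (the name is just an example)
+minio$CreateDir("test-bucket")
+
+# Write and read back a Parquet file through the MinIO filesystem object
+write_parquet(mtcars, "test-bucket/mtcars.parquet", filesystem = minio)
+df <- read_parquet("test-bucket/mtcars.parquet", filesystem = minio)
+```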