diff --git a/cpp/src/arrow/filesystem/s3fs_test.cc b/cpp/src/arrow/filesystem/s3fs_test.cc index 7216af297a0..1d89e2da711 100644 --- a/cpp/src/arrow/filesystem/s3fs_test.cc +++ b/cpp/src/arrow/filesystem/s3fs_test.cc @@ -322,7 +322,7 @@ TEST_F(S3OptionsTest, FromAssumeRole) { class S3RegionResolutionTest : public AwsTestMixin {}; TEST_F(S3RegionResolutionTest, PublicBucket) { - ASSERT_OK_AND_EQ("us-east-2", ResolveS3BucketRegion("ursa-labs-taxi-data")); + ASSERT_OK_AND_EQ("us-east-2", ResolveS3BucketRegion("voltrondata-labs-datasets")); // Taken from a registry of open S3-hosted datasets // at https://github.com/awslabs/open-data-registry diff --git a/docs/source/python/dataset.rst b/docs/source/python/dataset.rst index 4808457355d..2ac592d8d0c 100644 --- a/docs/source/python/dataset.rst +++ b/docs/source/python/dataset.rst @@ -355,7 +355,7 @@ specifying a S3 path: .. code-block:: python - dataset = ds.dataset("s3://ursa-labs-taxi-data/", partitioning=["year", "month"]) + dataset = ds.dataset("s3://voltrondata-labs-datasets/nyc-taxi/") Typically, you will want to customize the connection parameters, and then a file system object can be created and passed to the ``filesystem`` keyword: @@ -365,8 +365,7 @@ a file system object can be created and passed to the ``filesystem`` keyword: from pyarrow import fs s3 = fs.S3FileSystem(region="us-east-2") - dataset = ds.dataset("ursa-labs-taxi-data/", filesystem=s3, - partitioning=["year", "month"]) + dataset = ds.dataset("voltrondata-labs-datasets/nyc-taxi/", filesystem=s3) The currently available classes are :class:`~pyarrow.fs.S3FileSystem` and :class:`~pyarrow.fs.HadoopFileSystem`. See the :ref:`filesystem` docs for more @@ -387,8 +386,7 @@ useful for testing or benchmarking. # By default, MinIO will listen for unencrypted HTTP traffic. 
minio = fs.S3FileSystem(scheme="http", endpoint_override="localhost:9000") - dataset = ds.dataset("ursa-labs-taxi-data/", filesystem=minio, - partitioning=["year", "month"]) + dataset = ds.dataset("voltrondata-labs-datasets/nyc-taxi/", filesystem=minio) Working with Parquet Datasets diff --git a/python/pyarrow/_s3fs.pyx b/python/pyarrow/_s3fs.pyx index d9335995dc2..f668038e623 100644 --- a/python/pyarrow/_s3fs.pyx +++ b/python/pyarrow/_s3fs.pyx @@ -74,8 +74,8 @@ def resolve_s3_region(bucket): Examples -------- - >>> fs.resolve_s3_region('registry.opendata.aws') - 'us-east-1' + >>> fs.resolve_s3_region('voltrondata-labs-datasets') + 'us-east-2' """ cdef: c_string c_bucket diff --git a/python/pyarrow/tests/test_fs.py b/python/pyarrow/tests/test_fs.py index 41c242ff83b..05ebf4ed4c7 100644 --- a/python/pyarrow/tests/test_fs.py +++ b/python/pyarrow/tests/test_fs.py @@ -1616,15 +1616,17 @@ def test_s3_real_aws(): assert fs.region == default_region fs = S3FileSystem(anonymous=True, region='us-east-2') - entries = fs.get_file_info(FileSelector('ursa-labs-taxi-data')) + entries = fs.get_file_info(FileSelector( + 'voltrondata-labs-datasets/nyc-taxi')) assert len(entries) > 0 - with fs.open_input_stream('ursa-labs-taxi-data/2019/06/data.parquet') as f: + key = 'voltrondata-labs-datasets/nyc-taxi/year=2019/month=6/part-0.parquet' + with fs.open_input_stream(key) as f: md = f.metadata() assert 'Content-Type' in md - assert md['Last-Modified'] == b'2020-01-17T16:26:28Z' + assert md['Last-Modified'] == b'2022-07-12T23:32:00Z' # For some reason, the header value is quoted # (both with AWS and Minio) - assert md['ETag'] == b'"f1efd5d76cb82861e1542117bfa52b90-8"' + assert md['ETag'] == b'"4c6a76826a695c6ac61592bc30cda3df-16"' @pytest.mark.s3 @@ -1653,7 +1655,7 @@ def test_s3_real_aws_region_selection(): @pytest.mark.s3 def test_resolve_s3_region(): from pyarrow.fs import resolve_s3_region - assert resolve_s3_region('ursa-labs-taxi-data') == 'us-east-2' + assert resolve_s3_region('voltrondata-labs-datasets') == 'us-east-2' assert resolve_s3_region('mf-nwp-models') == 'eu-west-1' with pytest.raises(ValueError, match="Not a valid bucket name"): diff --git a/r/.gitignore b/r/.gitignore index 695e42b7593..e607d2662f2 100644 --- a/r/.gitignore +++ b/r/.gitignore @@ -18,6 +18,7 @@ vignettes/nyc-taxi/ arrow_*.tar.gz arrow_*.tgz extra-tests/files +.deps # C++ sources for an offline build. They're copied from the ../cpp directory, so ignore them here. /tools/cpp/ diff --git a/r/NAMESPACE b/r/NAMESPACE index 750a815f9ff..0fa23fd01e9 100644 --- a/r/NAMESPACE +++ b/r/NAMESPACE @@ -301,6 +301,7 @@ export(float) export(float16) export(float32) export(float64) +export(gs_bucket) export(halffloat) export(hive_partition) export(infer_type) diff --git a/r/R/filesystem.R b/r/R/filesystem.R index 3cebbc30c85..2f0b1cfd585 100644 --- a/r/R/filesystem.R +++ b/r/R/filesystem.R @@ -155,6 +155,26 @@ FileSelector$create <- function(base_dir, allow_not_found = FALSE, recursive = F #' - `allow_bucket_deletion`: logical, if TRUE, the filesystem will delete #' buckets if`$DeleteDir()` is called on the bucket level (default `FALSE`). #' +#' `GcsFileSystem$create()` optionally takes arguments: +#' +#' - `anonymous`: logical, default `FALSE`. If true, will not attempt to look up +#' credentials using standard GCS configuration methods. +#' - `access_token`: optional string for authentication. Should be provided along +#' with `expiration` +#' - `expiration`: optional date representing point at which `access_token` will +#' expire. 
+#' - `json_credentials`: optional string for authentication. Point to a JSON
+#' credentials file downloaded from GCS.
+#' - `endpoint_override`: if non-empty, will connect to provided host name / port,
+#' such as "localhost:9001", instead of default GCS ones. This is primarily useful
+#' for testing purposes.
+#' - `scheme`: connection transport (default "https")
+#' - `default_bucket_location`: the default location (or "region") to create new
+#' buckets in.
+#' - `retry_limit_seconds`: the maximum amount of time to spend retrying if
+#' the filesystem encounters errors. Default is 15 seconds.
+#' - `default_metadata`: default metadata to write in new objects.
+#'
#' @section Methods:
#'
#' - `$GetFileInfo(x)`: `x` may be a [FileSelector][FileSelector] or a character
@@ -426,7 +446,7 @@ default_s3_options <- list(
#' relative path. Note that this function's success does not guarantee that you
#' are authorized to access the bucket's contents.
#' @examplesIf FALSE
-#' bucket <- s3_bucket("ursa-labs-taxi-data")
+#' bucket <- s3_bucket("voltrondata-labs-datasets")
#' @export
s3_bucket <- function(bucket, ...) {
  assert_that(is.string(bucket))
@@ -448,6 +468,28 @@ s3_bucket <- function(bucket, ...) {
  SubTreeFileSystem$create(fs_and_path$path, fs)
}

+#' Connect to a Google Cloud Storage (GCS) bucket
+#'
+#' `gs_bucket()` is a convenience function to create a `GcsFileSystem` object
+#' that holds onto its relative path
+#'
+#' @param bucket string GCS bucket name or path
+#' @param ... Additional connection options, passed to `GcsFileSystem$create()`
+#' @return A `SubTreeFileSystem` containing a `GcsFileSystem` and the bucket's
+#' relative path. Note that this function's success does not guarantee that you
+#' are authorized to access the bucket's contents.
+#' @examplesIf FALSE
+#' bucket <- gs_bucket("voltrondata-labs-datasets")
+#' @export
+gs_bucket <- function(bucket, ...) {
+  assert_that(is.string(bucket))
+  args <- list2(...)
+
+  fs <- exec(GcsFileSystem$create, !!!args)
+
+  SubTreeFileSystem$create(bucket, fs)
+}
+
#' @usage NULL
#' @format NULL
#' @rdname FileSystem
diff --git a/r/_pkgdown.yml b/r/_pkgdown.yml
index c0f599fb8a5..12ab3ccadcf 100644
--- a/r/_pkgdown.yml
+++ b/r/_pkgdown.yml
@@ -90,7 +90,7 @@ navbar:
      href: articles/install.html
    - text: Working with Arrow Datasets and dplyr
      href: articles/dataset.html
-    - text: Working with Cloud Storage (S3)
+    - text: Working with Cloud Storage (S3, GCS)
      href: articles/fs.html
    - text: Apache Arrow in Python and R with reticulate
      href: articles/python.html
@@ -198,6 +198,7 @@ reference:
- title: File systems
  contents:
  - s3_bucket
+ - gs_bucket
  - FileSystem
  - FileInfo
  - FileSelector
diff --git a/r/man/FileSystem.Rd b/r/man/FileSystem.Rd
index 41d9e925140..f4f6cb57ffc 100644
--- a/r/man/FileSystem.Rd
+++ b/r/man/FileSystem.Rd
@@ -56,6 +56,27 @@ buckets if \verb{$CreateDir()} is called on the bucket level (default \code{FALS
\item \code{allow_bucket_deletion}: logical, if TRUE, the filesystem will delete
buckets if\verb{$DeleteDir()} is called on the bucket level (default \code{FALSE}).
}
+
+\code{GcsFileSystem$create()} optionally takes arguments:
+\itemize{
+\item \code{anonymous}: logical, default \code{FALSE}. If true, will not attempt to look up
+credentials using standard GCS configuration methods.
+\item \code{access_token}: optional string for authentication. Should be provided along
+with \code{expiration}
+\item \code{expiration}: optional date representing point at which \code{access_token} will
+expire.
+\item \code{json_credentials}: optional string for authentication. Point to a JSON
+credentials file downloaded from GCS.
+\item \code{endpoint_override}: if non-empty, will connect to provided host name / port,
+such as "localhost:9001", instead of default GCS ones. This is primarily useful
+for testing purposes.
+\item \code{scheme}: connection transport (default "https")
+\item \code{default_bucket_location}: the default location (or "region") to create new
+buckets in.
+\item \code{retry_limit_seconds}: the maximum amount of time to spend retrying if
+the filesystem encounters errors. Default is 15 seconds.
+\item \code{default_metadata}: default metadata to write in new objects.
+}
}

\section{Methods}{

diff --git a/r/man/gs_bucket.Rd b/r/man/gs_bucket.Rd
new file mode 100644
index 00000000000..7dc39a42c3d
--- /dev/null
+++ b/r/man/gs_bucket.Rd
@@ -0,0 +1,27 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/filesystem.R
+\name{gs_bucket}
+\alias{gs_bucket}
+\title{Connect to a Google Cloud Storage (GCS) bucket}
+\usage{
+gs_bucket(bucket, ...)
+}
+\arguments{
+\item{bucket}{string GCS bucket name or path}
+
+\item{...}{Additional connection options, passed to \code{GcsFileSystem$create()}}
+}
+\value{
+A \code{SubTreeFileSystem} containing a \code{GcsFileSystem} and the bucket's
+relative path. Note that this function's success does not guarantee that you
+are authorized to access the bucket's contents.
+}
+\description{
+\code{gs_bucket()} is a convenience function to create a \code{GcsFileSystem} object
+that holds onto its relative path
+}
+\examples{
+\dontshow{if (FALSE) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf}
+bucket <- gs_bucket("voltrondata-labs-datasets")
+\dontshow{\}) # examplesIf}
+}
diff --git a/r/man/s3_bucket.Rd b/r/man/s3_bucket.Rd
index 7baeb49b698..2ab7d4962ed 100644
--- a/r/man/s3_bucket.Rd
+++ b/r/man/s3_bucket.Rd
@@ -23,6 +23,6 @@ relative path.
} \examples{ \dontshow{if (FALSE) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} -bucket <- s3_bucket("ursa-labs-taxi-data") +bucket <- s3_bucket("voltrondata-labs-datasets") \dontshow{\}) # examplesIf} } diff --git a/r/tests/testthat/test-filesystem.R b/r/tests/testthat/test-filesystem.R index 1852634ac99..7957743a2aa 100644 --- a/r/tests/testthat/test-filesystem.R +++ b/r/tests/testthat/test-filesystem.R @@ -147,7 +147,7 @@ test_that("FileSystem$from_uri", { skip_on_cran() skip_if_not_available("s3") skip_if_offline() - fs_and_path <- FileSystem$from_uri("s3://ursa-labs-taxi-data") + fs_and_path <- FileSystem$from_uri("s3://voltrondata-labs-datasets") expect_r6_class(fs_and_path$fs, "S3FileSystem") expect_identical(fs_and_path$fs$region, "us-east-2") }) @@ -156,11 +156,11 @@ test_that("SubTreeFileSystem$create() with URI", { skip_on_cran() skip_if_not_available("s3") skip_if_offline() - fs <- SubTreeFileSystem$create("s3://ursa-labs-taxi-data") + fs <- SubTreeFileSystem$create("s3://voltrondata-labs-datasets") expect_r6_class(fs, "SubTreeFileSystem") expect_identical( capture.output(print(fs)), - "SubTreeFileSystem: s3://ursa-labs-taxi-data/" + "SubTreeFileSystem: s3://voltrondata-labs-datasets/" ) }) @@ -187,6 +187,19 @@ test_that("s3_bucket", { capture.output(print(bucket)), "SubTreeFileSystem: s3://ursa-labs-r-test/" ) - skip_on_os("windows") # FIXME expect_identical(bucket$base_path, "ursa-labs-r-test/") }) + +test_that("gs_bucket", { + skip_on_cran() + skip_if_not_available("gcs") + skip_if_offline() + bucket <- gs_bucket("voltrondata-labs-datasets") + expect_r6_class(bucket, "SubTreeFileSystem") + expect_r6_class(bucket$base_fs, "GcsFileSystem") + expect_identical( + capture.output(print(bucket)), + "SubTreeFileSystem: gs://voltrondata-labs-datasets/" + ) + expect_identical(bucket$base_path, "voltrondata-labs-datasets/") +}) diff --git a/r/vignettes/dataset.Rmd b/r/vignettes/dataset.Rmd index 5c430c4be0d..1a969f979c6 100644 --- a/r/vignettes/dataset.Rmd +++ b/r/vignettes/dataset.Rmd @@ -44,7 +44,9 @@ directory. If your arrow build has S3 support, you can sync the data locally with: ```{r, eval = FALSE} -arrow::copy_files("s3://ursa-labs-taxi-data", "nyc-taxi") +arrow::copy_files("s3://voltrondata-labs-datasets/nyc-taxi", "nyc-taxi") +# Alternatively, with GCS: +arrow::copy_files("gs://voltrondata-labs-datasets/nyc-taxi", "nyc-taxi") ``` If your arrow build doesn't have S3 support, you can download the files @@ -53,7 +55,7 @@ you may need to increase R's download timeout from the default of 60 seconds, e. `options(timeout = 300)`. ```{r, eval = FALSE} -bucket <- "https://ursa-labs-taxi-data.s3.us-east-2.amazonaws.com" +bucket <- "https://voltrondata-labs-datasets.s3.us-east-2.amazonaws.com" for (year in 2009:2019) { if (year == 2019) { # We only have through June 2019 there @@ -64,8 +66,8 @@ for (year in 2009:2019) { for (month in sprintf("%02d", months)) { dir.create(file.path("nyc-taxi", year, month), recursive = TRUE) try(download.file( - paste(bucket, year, month, "data.parquet", sep = "/"), - file.path("nyc-taxi", year, month, "data.parquet"), + paste(bucket, "nyc-taxi", paste0("year=", year), paste0("month=", month), "data.parquet", sep = "/"), + file.path("nyc-taxi", paste0("year=", year), paste0("month=", month), "data.parquet"), mode = "wb" ), silent = TRUE) } @@ -99,7 +101,7 @@ library(dplyr, warn.conflicts = FALSE) The first step is to create a Dataset object, pointing at the directory of data. 
```{r, eval = file.exists("nyc-taxi")} -ds <- open_dataset("nyc-taxi", partitioning = c("year", "month")) +ds <- open_dataset("nyc-taxi") ``` The file format for `open_dataset()` is controlled by the `format` parameter, @@ -122,9 +124,18 @@ For text files, you can pass the following parsing options to `open_dataset()`: For more information on the usage of these parameters, see `?read_delim_arrow()`. -The `partitioning` argument lets you specify how the file paths provide information -about how the dataset is chunked into different files. The files in this example -have file paths like +`open_dataset()` was able to automatically infer column values for `year` and `month` +--which are not present in the data files--based on the directory structure. The +Hive-style partitioning structure is self-describing, with file paths like + +``` +year=2009/month=1/data.parquet +year=2009/month=2/data.parquet +... +``` + +But sometimes the directory partitioning isn't self describing; that is, it doesn't +contain field names. For example, if instead we had file paths like ``` 2009/01/data.parquet @@ -132,12 +143,13 @@ have file paths like ... ``` -By providing `c("year", "month")` to the `partitioning` argument, you're saying that the first -path segment gives the value for `year`, and the second segment is `month`. -Every row in `2009/01/data.parquet` has a value of 2009 for `year` +then `open_dataset()` would need some hints as to how to use the file paths. In this +case, you could provide `c("year", "month")` to the `partitioning` argument, +saying that the first path segment gives the value for `year`, and the second +segment is `month`. Every row in `2009/01/data.parquet` has a value of 2009 for `year` and 1 for `month`, even though those columns may not be present in the file. -Indeed, when you look at the dataset, you can see that in addition to the columns present +In either case, when you look at the dataset, you can see that in addition to the columns present in every file, there are also columns `year` and `month` even though they are not present in the files themselves. 
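+As a minimal sketch of that explicit-`partitioning` case (not evaluated here, since
+the copy of the data downloaded above uses Hive-style paths):
+
+```{r, eval = FALSE}
+# Hypothetical non-Hive layout like 2009/01/data.parquet:
+# name the path segments yourself with the `partitioning` argument
+ds_manual <- open_dataset("nyc-taxi", partitioning = c("year", "month"))
+```
+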
```{r, eval = file.exists("nyc-taxi")} @@ -145,29 +157,31 @@ ds ``` ```{r, echo = FALSE, eval = !file.exists("nyc-taxi")} cat(" -FileSystemDataset with 125 Parquet files -vendor_id: string -pickup_at: timestamp[us] -dropoff_at: timestamp[us] -passenger_count: int8 -trip_distance: float -pickup_longitude: float -pickup_latitude: float -rate_code_id: null -store_and_fwd_flag: string -dropoff_longitude: float -dropoff_latitude: float +FileSystemDataset with 158 Parquet files +vendor_name: string +pickup_datetime: timestamp[ms] +dropoff_datetime: timestamp[ms] +passenger_count: int64 +trip_distance: double +pickup_longitude: double +pickup_latitude: double +rate_code: string +store_and_fwd: string +dropoff_longitude: double +dropoff_latitude: double payment_type: string -fare_amount: float -extra: float -mta_tax: float -tip_amount: float -tolls_amount: float -total_amount: float +fare_amount: double +extra: double +mta_tax: double +tip_amount: double +tolls_amount: double +total_amount: double +improvement_surcharge: double +congestion_surcharge: double +pickup_location_id: int64 +dropoff_location_id: int64 year: int32 month: int32 - -See $metadata for additional Schema metadata ") ``` @@ -271,7 +285,7 @@ ds %>% ```{r, echo = FALSE, eval = !file.exists("nyc-taxi")} cat(" FileSystemDataset (query) -passenger_count: int8 +passenger_count: int64 median_tip_pct: double n: int32 @@ -312,19 +326,20 @@ percentage of rows from each batch: sampled_data <- ds %>% filter(year == 2015) %>% select(tip_amount, total_amount, passenger_count) %>% - map_batches(~ sample_frac(as.data.frame(.), 1e-4)) %>% - mutate(tip_pct = tip_amount / total_amount) + map_batches(~ as_record_batch(sample_frac(as.data.frame(.), 1e-4))) %>% + mutate(tip_pct = tip_amount / total_amount) %>% + collect() str(sampled_data) ``` ```{r, echo = FALSE, eval = !file.exists("nyc-taxi")} cat(" -'data.frame': 15603 obs. of 4 variables: - $ tip_amount : num 0 0 1.55 1.45 5.2 ... - $ total_amount : num 5.8 16.3 7.85 8.75 26 ... - $ passenger_count: int 1 1 1 1 1 6 5 1 2 1 ... - $ tip_pct : num 0 0 0.197 0.166 0.2 ... +tibble [10,918 × 4] (S3: tbl_df/tbl/data.frame) + $ tip_amount : num [1:10918] 3 0 4 1 1 6 0 1.35 0 5.9 ... + $ total_amount : num [1:10918] 18.8 13.3 20.3 15.8 13.3 ... + $ passenger_count: int [1:10918] 3 2 1 1 1 1 1 1 1 3 ... + $ tip_pct : num [1:10918] 0.1596 0 0.197 0.0633 0.0752 ... ") ``` @@ -345,7 +360,8 @@ ds %>% as.data.frame() %>% mutate(pred_tip_pct = predict(model, newdata = .)) %>% filter(!is.nan(tip_pct)) %>% - summarize(sse_partial = sum((pred_tip_pct - tip_pct)^2), n_partial = n()) + summarize(sse_partial = sum((pred_tip_pct - tip_pct)^2), n_partial = n()) %>% + as_record_batch() }) %>% summarize(mse = sum(sse_partial) / sum(n_partial)) %>% pull(mse) diff --git a/r/vignettes/fs.Rmd b/r/vignettes/fs.Rmd index a0c92bb6be2..6fb7e2d1af9 100644 --- a/r/vignettes/fs.Rmd +++ b/r/vignettes/fs.Rmd @@ -1,8 +1,8 @@ --- -title: "Working with Cloud Storage (S3)" +title: "Working with Cloud Storage (S3, GCS)" output: rmarkdown::html_vignette vignette: > - %\VignetteIndexEntry{Working with Cloud Storage (S3)} + %\VignetteIndexEntry{Working with Cloud Storage (S3, GCS)} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} --- @@ -10,91 +10,152 @@ vignette: > The Arrow C++ library includes a generic filesystem interface and specific implementations for some cloud storage systems. This setup allows various parts of the project to be able to read and write data with different storage -backends. 
In the `arrow` R package, support has been enabled for AWS S3.
-This vignette provides an overview of working with S3 data using Arrow.
+backends. In the `arrow` R package, support has been enabled for AWS S3 and
+Google Cloud Storage (GCS). This vignette provides an overview of working with
+S3 and GCS data using Arrow.

-> In Windows and macOS binary packages, S3 support is included. On Linux when
-installing from source, S3 support is not enabled by default, and it has
+> In Windows and macOS binary packages, S3 and GCS support are included. On Linux when
+installing from source, S3 and GCS support is not always enabled by default, and it has
additional system requirements. See `vignette("install", package = "arrow")`
for details.

+## Creating a FileSystem object
+
+One way of working with filesystems is to create `?FileSystem` objects.
+`?S3FileSystem` objects can be created with the `s3_bucket()` function, which
+automatically detects the bucket's AWS region. Similarly, `?GcsFileSystem` objects
+can be created with the `gs_bucket()` function. The resulting
+`FileSystem` will consider paths relative to the bucket's path (so for example
+you don't need to prefix the bucket path when listing a directory).
+
+With a `FileSystem` object, you can point to specific files in it with the `$path()` method
+and pass the result to file readers and writers (`read_parquet()`, `write_feather()`, et al.).
+For example, to read a parquet file from the example NYC taxi data
+(used in `vignette("dataset", package = "arrow")`):
+
+```r
+bucket <- s3_bucket("voltrondata-labs-datasets")
+# Or in GCS (anonymous = TRUE is required if credentials are not configured):
+bucket <- gs_bucket("voltrondata-labs-datasets", anonymous = TRUE)
+df <- read_parquet(bucket$path("nyc-taxi/year=2019/month=6/data.parquet"))
+```
+
+Note that this will be slower to read than if the file were local,
+though if you're running on a machine in the same AWS region as the file in S3,
+the cost of reading the data over the network should be much lower.
+
+You can list the files and/or directories in a bucket or subdirectory using
+the `$ls()` method:
+
+```r
+bucket$ls("nyc-taxi")
+# Or recursive:
+bucket$ls("nyc-taxi", recursive = TRUE)
+```
+
+**NOTE**: in GCS, you *should always* use `recursive = TRUE` as directories often don't appear in
+`$ls()` results.
+
+See `help(FileSystem)` for a list of options that `s3_bucket()`/`S3FileSystem$create()`
+and `gs_bucket()`/`GcsFileSystem$create()` can take.
+
+The object that `s3_bucket()` and `gs_bucket()` return is technically a `SubTreeFileSystem`,
+which holds a path and a file system to which it corresponds. `SubTreeFileSystem`s can be
+useful for holding a reference to a subdirectory somewhere (on S3, GCS, or elsewhere).
+
+One way to get a subtree is to call the `$cd()` method on a `FileSystem`
+
+```r
+june2019 <- bucket$cd("nyc-taxi/year=2019/month=6")
+df <- read_parquet(june2019$path("data.parquet"))
+```
+
+`SubTreeFileSystem` can also be made from a URI:
+
+```r
+june2019 <- SubTreeFileSystem$create("s3://voltrondata-labs-datasets/nyc-taxi/year=2019/month=6")
+```
+
## URIs

-File readers and writers (`read_parquet()`, `write_feather()`, et al.)
-accept an S3 URI as the source or destination file,
-as do `open_dataset()` and `write_dataset()`.
+File readers and writers (`read_parquet()`, `write_feather()`, et al.) also
+accept a URI as the source or destination file, as do `open_dataset()` and `write_dataset()`.
An S3 URI looks like:

```
s3://[access_key:secret_key@]bucket/path[?region=]
```

+A GCS URI looks like:
+
+```
+gs://[access_key:secret_key@]bucket/path
+gs://anonymous@bucket/path
+```
+
For example, one of the NYC taxi data files used in `vignette("dataset", package = "arrow")` is found at

```
-s3://ursa-labs-taxi-data/2019/06/data.parquet
+s3://voltrondata-labs-datasets/nyc-taxi/year=2019/month=6/data.parquet
+# Or in GCS (anonymous required on public buckets):
+gs://anonymous@voltrondata-labs-datasets/nyc-taxi/year=2019/month=6/data.parquet
```

Given this URI, you can pass it to `read_parquet()` just as if it were a local file path:

```r
-df <- read_parquet("s3://ursa-labs-taxi-data/2019/06/data.parquet")
+df <- read_parquet("s3://voltrondata-labs-datasets/nyc-taxi/year=2019/month=6/data.parquet")
+# Or in GCS:
+df <- read_parquet("gs://anonymous@voltrondata-labs-datasets/nyc-taxi/year=2019/month=6/data.parquet")
```

-Note that this will be slower to read than if the file were local,
-though if you're running on a machine in the same AWS region as the file in S3,
-the cost of reading the data over the network should be much lower.
-
-## Creating a FileSystem object
-
-Another way to connect to S3 is to create a `FileSystem` object once and pass
-that to the read/write functions.
-`S3FileSystem` objects can be created with the `s3_bucket()` function, which
-automatically detects the bucket's AWS region. Additionally, the resulting
-`FileSystem` will consider paths relative to the bucket's path (so for example
-you don't need to prefix the bucket path when listing a directory).
-This may be convenient when dealing with
-long URIs, and it's necessary for some options and authentication methods
-that aren't supported in the URI format.
+### URI options

-With a `FileSystem` object, you can point to specific files in it with the `$path()` method.
-In the previous example, this would look like:
+URIs accept additional options in the query parameters (the part after the `?`)
+that are passed down to configure the underlying file system. They are separated
+by `&`. For example,

-```r
-bucket <- s3_bucket("ursa-labs-taxi-data")
-df <- read_parquet(bucket$path("2019/06/data.parquet"))
+```
+s3://voltrondata-labs-datasets/?endpoint_override=https%3A%2F%2Fstorage.googleapis.com&allow_bucket_creation=true
```

-You can list the files and/or directories in an S3 bucket or subdirectory using
-the `$ls()` method:
+is equivalent to:

```r
-bucket$ls()
+fs <- S3FileSystem$create(
+ endpoint_override="https://storage.googleapis.com",
+ allow_bucket_creation=TRUE
+)
+fs$path("voltrondata-labs-datasets/")
```

-See `help(FileSystem)` for a list of options that `s3_bucket()` and `S3FileSystem$create()`
-can take. `region`, `scheme`, and `endpoint_override` can be encoded as query
-parameters in the URI (though `region` will be auto-detected in `s3_bucket()` or from the URI if omitted).
-`access_key` and `secret_key` can also be included,
-but other options are not supported in the URI.
+Both tell the `S3FileSystem` that it should allow the creation of new buckets and to
+talk to Google Storage instead of S3. The latter works because GCS implements an
+S3-compatible API--see [File systems that emulate S3](#file-systems-that-emulate-s3)
+below--but for better support for GCS use the `GcsFileSystem` with `gs://`. Also note
+that parameters in the URI need to be
+[percent encoded](https://en.wikipedia.org/wiki/Percent-encoding), which is why
+`://` is written as `%3A%2F%2F`.
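+
+If you are building such a URI programmatically, base R's `utils::URLencode()` can do
+the percent encoding (a minimal sketch; the endpoint value is just the Google Storage
+endpoint from the example above):
+
+```r
+endpoint <- utils::URLencode("https://storage.googleapis.com", reserved = TRUE)
+# "https%3A%2F%2Fstorage.googleapis.com"
+uri <- paste0("s3://voltrondata-labs-datasets/?endpoint_override=", endpoint)
+```
+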
-The object that `s3_bucket()` returns is technically a `SubTreeFileSystem`, which holds a path and a file system to which it corresponds. `SubTreeFileSystem`s can be useful for holding a reference to a subdirectory somewhere (on S3 or elsewhere).
+For S3, only the following options can be included in the URI as query parameters:
+`region`, `scheme`, `endpoint_override`, `access_key`, `secret_key`, `allow_bucket_creation`,
+and `allow_bucket_deletion`. For GCS, the supported parameters are `scheme`, `endpoint_override`,
+and `retry_limit_seconds`.

-One way to get a subtree is to call the `$cd()` method on a `FileSystem`
+In GCS, a useful option is `retry_limit_seconds`, which sets the number of seconds
+a request may spend retrying before returning an error. The current default is
+15 minutes, so in many interactive contexts it's nice to set a lower value:

-```r
-june2019 <- bucket$cd("2019/06")
-df <- read_parquet(june2019$path("data.parquet"))
+```
-
-`SubTreeFileSystem` can also be made from a URI:
-
-```r
-june2019 <- SubTreeFileSystem$create("s3://ursa-labs-taxi-data/2019/06")
+gs://anonymous@voltrondata-labs-datasets/nyc-taxi/?retry_limit_seconds=10
```

## Authentication

+### S3 Authentication
+
To access private S3 buckets, you typically need two secret parameters:
an `access_key`, which is like a user id, and `secret_key`, which is like a token
or password. There are a few options for passing these credentials:
@@ -110,6 +171,31 @@ or password. There are a few options for passing these credentials:
- Use an [AccessRole](https://docs.aws.amazon.com/STS/latest/APIReference/API_AssumeRole.html)
for temporary access by passing the `role_arn` identifier to `S3FileSystem$create()`
or `s3_bucket()`.

+### GCS Authentication
+
+The simplest way to authenticate with GCS is to run the [gcloud](https://cloud.google.com/sdk/docs/)
+command to set up application default credentials:
+
+```
+gcloud auth application-default login
+```
+
+To manually configure credentials, you can pass either `access_token` and `expiration`, for using
+temporary tokens generated elsewhere, or `json_credentials`, to reference a downloaded
+credentials file.
+
+If you haven't configured credentials, then to access *public* buckets, you
+must pass `anonymous = TRUE` or `anonymous` as the user in a URI:
+
+```r
+bucket <- gs_bucket("voltrondata-labs-datasets", anonymous = TRUE)
+fs <- GcsFileSystem$create(anonymous = TRUE)
+df <- read_parquet("gs://anonymous@voltrondata-labs-datasets/nyc-taxi/year=2019/month=6/data.parquet")
+```
+

## Using a proxy server

If you need to use a proxy server to connect to an S3 bucket, you can provide
a URI in the form `http://user:password@host:port` to `proxy_options`. For
example, a local proxy server running on port 1316 can be used like this:

```r
-bucket <- s3_bucket("ursa-labs-taxi-data", proxy_options = "http://localhost:1316")
+bucket <- s3_bucket("voltrondata-labs-datasets", proxy_options = "http://localhost:1316")
```

## File systems that emulate S3
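+
+For example (a minimal sketch, assuming a local MinIO server listening for unencrypted
+HTTP traffic on port 9000, as in the Python example earlier in this change; `my-bucket`
+is a placeholder bucket name):
+
+```r
+# MinIO speaks the S3 API, so the S3 filesystem options can point Arrow at it
+minio <- S3FileSystem$create(scheme = "http", endpoint_override = "localhost:9000")
+df <- read_parquet(minio$path("my-bucket/data.parquet"))
+```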