From 94e6166f92498b85cbbd68abe4839732dcbc83a5 Mon Sep 17 00:00:00 2001
From: Will Jones
Date: Fri, 20 May 2022 09:24:14 -0700
Subject: [PATCH 01/18] Add option to not create buckets

---
 cpp/src/arrow/filesystem/s3fs.cc      | 28 +++++++++++++++++++++------
 cpp/src/arrow/filesystem/s3fs.h       |  3 +++
 cpp/src/arrow/filesystem/s3fs_test.cc |  1 +
 3 files changed, 26 insertions(+), 6 deletions(-)

diff --git a/cpp/src/arrow/filesystem/s3fs.cc b/cpp/src/arrow/filesystem/s3fs.cc
index 4336b783307..9c845a0dce1 100644
--- a/cpp/src/arrow/filesystem/s3fs.cc
+++ b/cpp/src/arrow/filesystem/s3fs.cc
@@ -1677,8 +1677,24 @@ class S3FileSystem::Impl : public std::enable_shared_from_this<S3FileSystem::Impl> {

   // Create a bucket. Successful if bucket already exists.
   Status CreateBucket(const std::string& bucket) {
+    S3Model::HeadBucketRequest req;
+    req.SetBucket(ToAwsString(bucket));
+    auto outcome = client_->HeadBucket(req);
+
+    if (outcome.IsSuccess()) {
+      return Status::OK();
+    } else if (!IsNotFound(outcome.GetError())) {
+      return ErrorToStatus(std::forward_as_tuple("When creating bucket '", bucket, "': "),
+                           outcome.GetError());
+    }
+
+    if (!options().allow_create_buckets) {
+      return Status::IOError("S3 bucket '", bucket, "' does not exist and client not allowed to create buckets.");
+    }
+
     S3Model::CreateBucketConfiguration config;
-    S3Model::CreateBucketRequest req;
+    S3Model::CreateBucketRequest req2;
     auto _region = region();
     // AWS S3 treats the us-east-1 differently than other regions
     // https://docs.aws.amazon.com/cli/latest/reference/s3api/create-bucket.html
@@ -1687,13 +1703,13 @@ class S3FileSystem::Impl : public std::enable_shared_from_this<S3FileSystem::Impl> {
               ToAwsString(_region)));
     }
-    req.SetBucket(ToAwsString(bucket));
-    auto outcome = client_->CreateBucket(req);
-    if (!outcome.IsSuccess() && !IsAlreadyExists(outcome.GetError())) {
+    req2.SetBucket(ToAwsString(bucket));
+    auto outcome2 = client_->CreateBucket(req2);
+    if (!outcome2.IsSuccess() && !IsAlreadyExists(outcome2.GetError())) {
       return ErrorToStatus(std::forward_as_tuple("When creating bucket '", bucket, "': "),
-                           outcome.GetError());
+                           outcome2.GetError());
     }
     return Status::OK();
   }

diff --git a/cpp/src/arrow/filesystem/s3fs.h b/cpp/src/arrow/filesystem/s3fs.h
index c2c99e964e6..bc4f8e46473 100644
--- a/cpp/src/arrow/filesystem/s3fs.h
+++ b/cpp/src/arrow/filesystem/s3fs.h
@@ -130,6 +130,9 @@ struct ARROW_EXPORT S3Options {
   /// Whether OutputStream writes will be issued in the background, without blocking.
   bool background_writes = true;

+  /// Whether to allow creation of new buckets
+  bool allow_create_buckets = false;
+
   /// \brief Default metadata for OpenOutputStream.
   ///
   /// This will be ignored if non-empty metadata is passed to OpenOutputStream.
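The `allow_create_buckets` field added above is consumed by `CreateBucket`, which now probes the bucket with `HeadBucket` and returns `IOError` when the bucket is missing and the option is off. Below is a minimal sketch (not part of the patch) of how a caller would opt in; it assumes only the existing Arrow C++ entry points `EnsureS3Initialized`, `S3Options::Defaults`, and `S3FileSystem::Make`, and the function name is illustrative:

```cpp
#include "arrow/filesystem/s3fs.h"
#include "arrow/result.h"
#include "arrow/status.h"

// Sketch only: opt in to bucket creation, then let CreateDir on a
// top-level path create the bucket.
arrow::Status MakeBucketIfAllowed() {
  ARROW_RETURN_NOT_OK(arrow::fs::EnsureS3Initialized());
  auto options = arrow::fs::S3Options::Defaults();
  options.allow_create_buckets = true;  // default remains false
  ARROW_ASSIGN_OR_RAISE(auto fs, arrow::fs::S3FileSystem::Make(options));
  // With allow_create_buckets = false this would now return IOError
  // instead of attempting CreateBucket.
  return fs->CreateDir("my-new-bucket");
}
```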
diff --git a/cpp/src/arrow/filesystem/s3fs_test.cc b/cpp/src/arrow/filesystem/s3fs_test.cc
index b44bc7f50cd..2b9d838e405 100644
--- a/cpp/src/arrow/filesystem/s3fs_test.cc
+++ b/cpp/src/arrow/filesystem/s3fs_test.cc
@@ -442,6 +442,7 @@ class TestS3FS : public S3TestMixin {
     if (!options_.retry_strategy) {
       options_.retry_strategy = std::make_shared();
     }
+    options_.allow_create_buckets = true;
     ASSERT_OK_AND_ASSIGN(fs_, S3FileSystem::Make(options_));
   }

From 1e65e1d243af1fd0e05bee8ef1849288f3ffb935 Mon Sep 17 00:00:00 2001
From: Will Jones
Date: Fri, 20 May 2022 11:03:32 -0700
Subject: [PATCH 02/18] Test deletion control

---
 cpp/src/arrow/filesystem/s3fs.cc      | 45 ++++++++++++++++-----------
 cpp/src/arrow/filesystem/s3fs_test.cc | 13 +++++++-
 2 files changed, 39 insertions(+), 19 deletions(-)

diff --git a/cpp/src/arrow/filesystem/s3fs.cc b/cpp/src/arrow/filesystem/s3fs.cc
index 9c845a0dce1..1648d468e91 100644
--- a/cpp/src/arrow/filesystem/s3fs.cc
+++ b/cpp/src/arrow/filesystem/s3fs.cc
@@ -1678,23 +1678,28 @@ class S3FileSystem::Impl : public std::enable_shared_from_this<S3FileSystem::Impl> {
   Status CreateBucket(const std::string& bucket) {
-    S3Model::HeadBucketRequest req;
-    req.SetBucket(ToAwsString(bucket));
-    auto outcome = client_->HeadBucket(req);
+    {
+      S3Model::HeadBucketRequest req;
+      req.SetBucket(ToAwsString(bucket));
+      auto outcome = client_->HeadBucket(req);

-    if (outcome.IsSuccess()) {
-      return Status::OK();
-    } else if (!IsNotFound(outcome.GetError())) {
-      return ErrorToStatus(std::forward_as_tuple("When creating bucket '", bucket, "': "),
-                           outcome.GetError());
-    }
+      if (outcome.IsSuccess()) {
+        return Status::OK();
+      } else if (!IsNotFound(outcome.GetError())) {
+        return ErrorToStatus(
+            std::forward_as_tuple("When creating bucket '", bucket, "': "),
+            outcome.GetError());
+      }

-    if (!options().allow_create_buckets) {
-      return Status::IOError("S3 bucket '", bucket, "' does not exist and client not allowed to create buckets.");
+      if (!options().allow_create_buckets) {
+        return Status::IOError(
+            "Permission denied: create bucket '", bucket, "'. ",
+            "To create buckets, enable allow_create_buckets option on filesystem.");
+      }
     }

     S3Model::CreateBucketConfiguration config;
-    S3Model::CreateBucketRequest req2;
+    S3Model::CreateBucketRequest req;
     auto _region = region();
     // AWS S3 treats the us-east-1 differently than other regions
     // https://docs.aws.amazon.com/cli/latest/reference/s3api/create-bucket.html
@@ -1703,13 +1708,13 @@ class S3FileSystem::Impl : public std::enable_shared_from_this<S3FileSystem::Impl> {
               ToAwsString(_region)));
     }
-    req2.SetBucket(ToAwsString(bucket));
-    auto outcome2 = client_->CreateBucket(req2);
-    if (!outcome2.IsSuccess() && !IsAlreadyExists(outcome2.GetError())) {
+    req.SetBucket(ToAwsString(bucket));
+    auto outcome = client_->CreateBucket(req);
+    if (!outcome.IsSuccess() && !IsAlreadyExists(outcome.GetError())) {
       return ErrorToStatus(std::forward_as_tuple("When creating bucket '", bucket, "': "),
-                           outcome2.GetError());
+                           outcome.GetError());
     }
     return Status::OK();
   }
@@ -2389,13 +2394,17 @@ Status S3FileSystem::DeleteDir(const std::string& s) {
     return Status::NotImplemented("Cannot delete all S3 buckets");
   }
   RETURN_NOT_OK(impl_->DeleteDirContentsAsync(path.bucket, path.key).status());
-  if (path.key.empty()) {
+  if (path.key.empty() && options().allow_create_buckets) {
     // Delete bucket
     S3Model::DeleteBucketRequest req;
     req.SetBucket(ToAwsString(path.bucket));
     return OutcomeToStatus(
         std::forward_as_tuple("When deleting bucket '", path.bucket, "': "),
         impl_->client_->DeleteBucket(req));
+  } else if (path.key.empty()) {
+    return Status::IOError(
+        "Permission denied: delete bucket '", path.bucket, "'. ",
+        "To delete buckets, enable allow_create_buckets option on filesystem.");
   } else {
     // Delete "directory"
     RETURN_NOT_OK(impl_->DeleteObject(path.bucket, path.key + kSep));

diff --git a/cpp/src/arrow/filesystem/s3fs_test.cc b/cpp/src/arrow/filesystem/s3fs_test.cc
index 2b9d838e405..4c715451974 100644
--- a/cpp/src/arrow/filesystem/s3fs_test.cc
+++ b/cpp/src/arrow/filesystem/s3fs_test.cc
@@ -405,6 +405,8 @@ class TestS3FS : public S3TestMixin {
  public:
   void SetUp() override {
     S3TestMixin::SetUp();
+    // Most tests will create buckets
+    options_.allow_create_buckets = true;
     MakeFileSystem();
     // Set up test bucket
     {
@@ -442,7 +444,6 @@ class TestS3FS : public S3TestMixin {
     if (!options_.retry_strategy) {
       options_.retry_strategy = std::make_shared();
     }
-    options_.allow_create_buckets = true;
     ASSERT_OK_AND_ASSIGN(fs_, S3FileSystem::Make(options_));
   }

@@ -1125,6 +1126,16 @@ TEST_F(TestS3FS, FileSystemFromUri) {
   AssertFileInfo(fs.get(), path, FileType::File, 8);
 }

+TEST_F(TestS3FS, NoCreateDeleteBucket) {
+  // Create a bucket to try deleting
+  ASSERT_OK(fs_->CreateDir("test-no-delete"));
+
+  options_.allow_create_buckets = false;
+  MakeFileSystem();
+  ASSERT_RAISES(IOError, fs_->CreateDir("test-no-create"));
+  ASSERT_RAISES(IOError, fs_->DeleteDir("test-no-delete"));
+}
+
 // Simple retry strategy that records errors encountered and its emitted retry delays
 class TestRetryStrategy : public S3RetryStrategy {
  public:

From 300c9156e34a135173b661f09b1d4833f94ba187 Mon Sep 17 00:00:00 2001
From: Will Jones
Date: Fri, 20 May 2022 12:01:04 -0700
Subject: [PATCH 03/18] Add support for option in Python interface

---
 python/pyarrow/_s3fs.pyx                |  7 ++++++-
 python/pyarrow/includes/libarrow_fs.pxd |  1 +
 python/pyarrow/tests/test_dataset.py    | 25 +++++++++++++++++------
 python/pyarrow/tests/test_fs.py         | 11 +++++------
 4 files changed, 31 insertions(+), 13 deletions(-)

diff --git a/python/pyarrow/_s3fs.pyx b/python/pyarrow/_s3fs.pyx
index 4d919c8b9d6..ced0877bde3 100644
--- a/python/pyarrow/_s3fs.pyx
+++ b/python/pyarrow/_s3fs.pyx
@@ -150,6 +150,8 @@ cdef class S3FileSystem(FileSystem):
         S3FileSystem(proxy_options={'scheme': 'http', 'host': 'localhost',
                                     'port': 8020, 'username': 'username',
                                     'password': 'password'})
+    allow_create_buckets : bool, default False
+        If True, allows creating and deleting buckets.
""" cdef: @@ -159,7 +161,8 @@ cdef class S3FileSystem(FileSystem): bint anonymous=False, region=None, scheme=None, endpoint_override=None, bint background_writes=True, default_metadata=None, role_arn=None, session_name=None, - external_id=None, load_frequency=900, proxy_options=None): + external_id=None, load_frequency=900, proxy_options=None, + allow_create_buckets=False): cdef: CS3Options options shared_ptr[CS3FileSystem] wrapped @@ -253,6 +256,8 @@ cdef class S3FileSystem(FileSystem): "'proxy_options': expected 'dict' or 'str', " f"got {type(proxy_options)} instead.") + options.allow_create_buckets = allow_create_buckets + with nogil: wrapped = GetResultValue(CS3FileSystem.Make(options)) diff --git a/python/pyarrow/includes/libarrow_fs.pxd b/python/pyarrow/includes/libarrow_fs.pxd index e491233e88f..399738ea23d 100644 --- a/python/pyarrow/includes/libarrow_fs.pxd +++ b/python/pyarrow/includes/libarrow_fs.pxd @@ -155,6 +155,7 @@ cdef extern from "arrow/filesystem/api.h" namespace "arrow::fs" nogil: c_string endpoint_override c_string scheme c_bool background_writes + c_bool allow_create_buckets shared_ptr[const CKeyValueMetadata] default_metadata c_string role_arn c_string session_name diff --git a/python/pyarrow/tests/test_dataset.py b/python/pyarrow/tests/test_dataset.py index d2210c4b6c1..40f22045b6a 100644 --- a/python/pyarrow/tests/test_dataset.py +++ b/python/pyarrow/tests/test_dataset.py @@ -2490,7 +2490,7 @@ def test_dataset_partitioned_dictionary_type_reconstruct(tempdir): @pytest.fixture @pytest.mark.parquet def s3_example_simple(s3_server): - from pyarrow.fs import FileSystem + from pyarrow.fs import S3FileSystem host, port, access_key, secret_key = s3_server['connection'] uri = ( @@ -2498,7 +2498,14 @@ def s3_example_simple(s3_server): .format(access_key, secret_key, host, port) ) - fs, path = FileSystem.from_uri(uri) + fs = S3FileSystem( + access_key=access_key, + secret_key=secret_key, + scheme="http", + endpoint_override="{}:{}".format(host, port), + allow_create_buckets=True, + ) + path = "mybucket/data.parquet" fs.create_dir("mybucket") table = pa.table({'a': [1, 2, 3]}) @@ -2552,7 +2559,7 @@ def test_open_dataset_from_uri_s3_fsspec(s3_example_simple): @pytest.mark.parquet @pytest.mark.s3 def test_open_dataset_from_s3_with_filesystem_uri(s3_server): - from pyarrow.fs import FileSystem + from pyarrow.fs import S3FileSystem host, port, access_key, secret_key = s3_server['connection'] bucket = 'theirbucket' @@ -2561,11 +2568,17 @@ def test_open_dataset_from_s3_with_filesystem_uri(s3_server): access_key, secret_key, bucket, path, host, port ) - fs, path = FileSystem.from_uri(uri) - assert path == 'theirbucket/nested/folder/data.parquet' - + # At first need a fs that can create buckets + fs = S3FileSystem( + access_key=access_key, + secret_key=secret_key, + scheme="http", + endpoint_override="{}:{}".format(host, port), + allow_create_buckets=True, + ) fs.create_dir(bucket) + path = bucket + '/' + path table = pa.table({'a': [1, 2, 3]}) with fs.open_output_stream(path) as out: pq.write_table(table, out) diff --git a/python/pyarrow/tests/test_fs.py b/python/pyarrow/tests/test_fs.py index 4fd72704a71..0b0e967c4b3 100644 --- a/python/pyarrow/tests/test_fs.py +++ b/python/pyarrow/tests/test_fs.py @@ -212,7 +212,8 @@ def s3fs(request, s3_server): access_key=access_key, secret_key=secret_key, endpoint_override='{}:{}'.format(host, port), - scheme='http' + scheme='http', + allow_create_buckets=True ) fs.create_dir(bucket) @@ -443,6 +444,9 @@ def 
test_s3fs_limited_permissions_create_bucket(s3_server): ) fs.create_dir('existing-bucket/test') + with pytest.raises(pa.ArrowIOError): + fs.create_dir('new-bucket') + def test_file_info_constructor(): dt = datetime.fromtimestamp(1568799826, timezone.utc) @@ -1315,11 +1319,6 @@ def test_filesystem_from_uri_s3(s3_server): assert isinstance(fs, S3FileSystem) assert path == "mybucket/foo/bar" - fs.create_dir(path) - [info] = fs.get_file_info([path]) - assert info.path == path - assert info.type == FileType.Directory - def test_py_filesystem(): handler = DummyHandler() From 9a8486c677c99bf44b406e6fa96281aeb82dfb20 Mon Sep 17 00:00:00 2001 From: Will Jones Date: Fri, 20 May 2022 12:31:03 -0700 Subject: [PATCH 04/18] Add R changes --- r/R/arrowExports.R | 4 ++-- r/R/filesystem.R | 8 ++++++-- r/man/FileSystem.Rd | 3 +++ r/src/arrowExports.cpp | 11 ++++++----- r/src/filesystem.cpp | 5 ++++- r/tests/testthat/test-s3-minio.R | 3 ++- 6 files changed, 23 insertions(+), 11 deletions(-) diff --git a/r/R/arrowExports.R b/r/R/arrowExports.R index 8ad56f227fa..12a3d3c4e8e 100644 --- a/r/R/arrowExports.R +++ b/r/R/arrowExports.R @@ -1280,8 +1280,8 @@ fs___CopyFiles <- function(source_fs, source_sel, destination_fs, destination_ba invisible(.Call(`_arrow_fs___CopyFiles`, source_fs, source_sel, destination_fs, destination_base_dir, chunk_size, use_threads)) } -fs___S3FileSystem__create <- function(anonymous, access_key, secret_key, session_token, role_arn, session_name, external_id, load_frequency, region, endpoint_override, scheme, proxy_options, background_writes) { - .Call(`_arrow_fs___S3FileSystem__create`, anonymous, access_key, secret_key, session_token, role_arn, session_name, external_id, load_frequency, region, endpoint_override, scheme, proxy_options, background_writes) +fs___S3FileSystem__create <- function(anonymous, access_key, secret_key, session_token, role_arn, session_name, external_id, load_frequency, region, endpoint_override, scheme, proxy_options, background_writes, allow_create_buckets) { + .Call(`_arrow_fs___S3FileSystem__create`, anonymous, access_key, secret_key, session_token, role_arn, session_name, external_id, load_frequency, region, endpoint_override, scheme, proxy_options, background_writes, allow_create_buckets) } fs___S3FileSystem__region <- function(fs) { diff --git a/r/R/filesystem.R b/r/R/filesystem.R index a6d845d4c91..3af95cc013a 100644 --- a/r/R/filesystem.R +++ b/r/R/filesystem.R @@ -150,6 +150,9 @@ FileSelector$create <- function(base_dir, allow_not_found = FALSE, recursive = F #' - `scheme`: S3 connection transport (default "https") #' - `background_writes`: logical, whether `OutputStream` writes will be issued #' in the background, without blocking (default `TRUE`) +#' - `allow_create_buckets`: logical, if TRUE, the filesystem will create or +#' delete buckets if `$CreateDir()` or `$DeleteDir()` is called on the bucket +#' level (default `FALSE`). #' #' @section Methods: #' @@ -338,7 +341,7 @@ S3FileSystem$create <- function(anonymous = FALSE, ...) 
{ invalid_args <- intersect( c( "access_key", "secret_key", "session_token", "role_arn", "session_name", - "external_id", "load_frequency" + "external_id", "load_frequency", "allow_create_buckets" ), names(args) ) @@ -383,7 +386,8 @@ default_s3_options <- list( endpoint_override = "", scheme = "", proxy_options = "", - background_writes = TRUE + background_writes = TRUE, + allow_create_buckets = FALSE ) #' Connect to an AWS S3 bucket diff --git a/r/man/FileSystem.Rd b/r/man/FileSystem.Rd index 2f3dcff670b..8f867fc794c 100644 --- a/r/man/FileSystem.Rd +++ b/r/man/FileSystem.Rd @@ -50,6 +50,9 @@ that emulate S3. \item \code{scheme}: S3 connection transport (default "https") \item \code{background_writes}: logical, whether \code{OutputStream} writes will be issued in the background, without blocking (default \code{TRUE}) +\item \code{allow_create_buckets}: logical, if TRUE, the filesystem will create or +delete buckets if \verb{$CreateDir()} or \verb{$DeleteDir()} is called on the bucket +level (default \code{FALSE}). } } diff --git a/r/src/arrowExports.cpp b/r/src/arrowExports.cpp index 2e3cf0f6055..79eab9cd2f4 100644 --- a/r/src/arrowExports.cpp +++ b/r/src/arrowExports.cpp @@ -3180,8 +3180,8 @@ END_CPP11 } // filesystem.cpp #if defined(ARROW_R_WITH_S3) -std::shared_ptr fs___S3FileSystem__create(bool anonymous, std::string access_key, std::string secret_key, std::string session_token, std::string role_arn, std::string session_name, std::string external_id, int load_frequency, std::string region, std::string endpoint_override, std::string scheme, std::string proxy_options, bool background_writes); -extern "C" SEXP _arrow_fs___S3FileSystem__create(SEXP anonymous_sexp, SEXP access_key_sexp, SEXP secret_key_sexp, SEXP session_token_sexp, SEXP role_arn_sexp, SEXP session_name_sexp, SEXP external_id_sexp, SEXP load_frequency_sexp, SEXP region_sexp, SEXP endpoint_override_sexp, SEXP scheme_sexp, SEXP proxy_options_sexp, SEXP background_writes_sexp){ +std::shared_ptr fs___S3FileSystem__create(bool anonymous, std::string access_key, std::string secret_key, std::string session_token, std::string role_arn, std::string session_name, std::string external_id, int load_frequency, std::string region, std::string endpoint_override, std::string scheme, std::string proxy_options, bool background_writes, bool allow_create_buckets); +extern "C" SEXP _arrow_fs___S3FileSystem__create(SEXP anonymous_sexp, SEXP access_key_sexp, SEXP secret_key_sexp, SEXP session_token_sexp, SEXP role_arn_sexp, SEXP session_name_sexp, SEXP external_id_sexp, SEXP load_frequency_sexp, SEXP region_sexp, SEXP endpoint_override_sexp, SEXP scheme_sexp, SEXP proxy_options_sexp, SEXP background_writes_sexp, SEXP allow_create_buckets_sexp){ BEGIN_CPP11 arrow::r::Input::type anonymous(anonymous_sexp); arrow::r::Input::type access_key(access_key_sexp); @@ -3196,11 +3196,12 @@ BEGIN_CPP11 arrow::r::Input::type scheme(scheme_sexp); arrow::r::Input::type proxy_options(proxy_options_sexp); arrow::r::Input::type background_writes(background_writes_sexp); - return cpp11::as_sexp(fs___S3FileSystem__create(anonymous, access_key, secret_key, session_token, role_arn, session_name, external_id, load_frequency, region, endpoint_override, scheme, proxy_options, background_writes)); + arrow::r::Input::type allow_create_buckets(allow_create_buckets_sexp); + return cpp11::as_sexp(fs___S3FileSystem__create(anonymous, access_key, secret_key, session_token, role_arn, session_name, external_id, load_frequency, region, endpoint_override, scheme, proxy_options, 
background_writes, allow_create_buckets)); END_CPP11 } #else -extern "C" SEXP _arrow_fs___S3FileSystem__create(SEXP anonymous_sexp, SEXP access_key_sexp, SEXP secret_key_sexp, SEXP session_token_sexp, SEXP role_arn_sexp, SEXP session_name_sexp, SEXP external_id_sexp, SEXP load_frequency_sexp, SEXP region_sexp, SEXP endpoint_override_sexp, SEXP scheme_sexp, SEXP proxy_options_sexp, SEXP background_writes_sexp){ +extern "C" SEXP _arrow_fs___S3FileSystem__create(SEXP anonymous_sexp, SEXP access_key_sexp, SEXP secret_key_sexp, SEXP session_token_sexp, SEXP role_arn_sexp, SEXP session_name_sexp, SEXP external_id_sexp, SEXP load_frequency_sexp, SEXP region_sexp, SEXP endpoint_override_sexp, SEXP scheme_sexp, SEXP proxy_options_sexp, SEXP background_writes_sexp, SEXP allow_create_buckets_sexp){ Rf_error("Cannot call fs___S3FileSystem__create(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. "); } #endif @@ -5432,7 +5433,7 @@ static const R_CallMethodDef CallEntries[] = { { "_arrow_fs___SubTreeFileSystem__base_path", (DL_FUNC) &_arrow_fs___SubTreeFileSystem__base_path, 1}, { "_arrow_fs___FileSystemFromUri", (DL_FUNC) &_arrow_fs___FileSystemFromUri, 1}, { "_arrow_fs___CopyFiles", (DL_FUNC) &_arrow_fs___CopyFiles, 6}, - { "_arrow_fs___S3FileSystem__create", (DL_FUNC) &_arrow_fs___S3FileSystem__create, 13}, + { "_arrow_fs___S3FileSystem__create", (DL_FUNC) &_arrow_fs___S3FileSystem__create, 14}, { "_arrow_fs___S3FileSystem__region", (DL_FUNC) &_arrow_fs___S3FileSystem__region, 1}, { "_arrow_io___Readable__Read", (DL_FUNC) &_arrow_io___Readable__Read, 2}, { "_arrow_io___InputStream__Close", (DL_FUNC) &_arrow_io___InputStream__Close, 1}, diff --git a/r/src/filesystem.cpp b/r/src/filesystem.cpp index c28f52631a2..bc9658b0f47 100644 --- a/r/src/filesystem.cpp +++ b/r/src/filesystem.cpp @@ -282,7 +282,8 @@ std::shared_ptr fs___S3FileSystem__create( std::string session_token = "", std::string role_arn = "", std::string session_name = "", std::string external_id = "", int load_frequency = 900, std::string region = "", std::string endpoint_override = "", std::string scheme = "", - std::string proxy_options = "", bool background_writes = true) { + std::string proxy_options = "", bool background_writes = true, + bool allow_create_buckets = false) { // We need to ensure that S3 is initialized before we start messing with the // options StopIfNotOk(fs::EnsureS3Initialized()); @@ -321,6 +322,8 @@ std::shared_ptr fs___S3FileSystem__create( /// default true s3_opts.background_writes = background_writes; + s3_opts.allow_create_buckets = allow_create_buckets; + auto io_context = arrow::io::IOContext(gc_memory_pool()); return ValueOrStop(fs::S3FileSystem::Make(s3_opts, io_context)); } diff --git a/r/tests/testthat/test-s3-minio.R b/r/tests/testthat/test-s3-minio.R index 51db355783b..356b6754dc4 100644 --- a/r/tests/testthat/test-s3-minio.R +++ b/r/tests/testthat/test-s3-minio.R @@ -34,7 +34,8 @@ if (arrow_with_s3() && process_is_running("minio server")) { access_key = minio_key, secret_key = minio_secret, scheme = "http", - endpoint_override = paste0("localhost:", minio_port) + endpoint_override = paste0("localhost:", minio_port), + allow_create_buckets = TRUE ) now <- as.character(as.numeric(Sys.time())) fs$CreateDir(now) From cd979a1358773cfae1575c05629b258fb12836f0 Mon Sep 17 00:00:00 2001 From: Will Jones Date: Fri, 20 May 2022 14:12:37 -0700 Subject: [PATCH 05/18] Make it easier to change the create bucket behavior --- cpp/src/arrow/filesystem/s3fs.cc | 20 
++++++++++++++---- cpp/src/arrow/filesystem/s3fs.h | 2 ++ python/pyarrow/_s3fs.pyx | 8 ++++++++ python/pyarrow/includes/libarrow_fs.pxd | 1 + python/pyarrow/tests/test_dataset.py | 27 ++++++++----------------- python/pyarrow/tests/test_fs.py | 6 ++++++ r/R/arrowExports.R | 4 ++++ r/R/filesystem.R | 3 +++ r/src/arrowExports.cpp | 18 +++++++++++++++++ r/src/filesystem.cpp | 5 +++++ r/tests/testthat/test-s3-minio.R | 16 +++++++++++++++ 11 files changed, 87 insertions(+), 23 deletions(-) diff --git a/cpp/src/arrow/filesystem/s3fs.cc b/cpp/src/arrow/filesystem/s3fs.cc index 1648d468e91..54b80616c54 100644 --- a/cpp/src/arrow/filesystem/s3fs.cc +++ b/cpp/src/arrow/filesystem/s3fs.cc @@ -763,6 +763,10 @@ class ClientBuilder { const S3Options& options() const { return options_; } + void allow_create_buckets(bool allow) { + options_.allow_create_buckets = allow; + } + protected: S3Options options_; Aws::Client::ClientConfiguration client_config_; @@ -1651,6 +1655,10 @@ class S3FileSystem::Impl : public std::enable_shared_from_this void SaveBackend(const Aws::Client::AWSError& error) { if (!backend_ || *backend_ == S3Backend::Other) { @@ -1693,8 +1701,8 @@ class S3FileSystem::Impl : public std::enable_shared_from_thisoptions(); } std::string S3FileSystem::region() const { return impl_->region(); } +void S3FileSystem::allow_create_buckets(bool allow) { + impl_->allow_create_buckets(allow); +} + Result S3FileSystem::GetFileInfo(const std::string& s) { ARROW_ASSIGN_OR_RAISE(auto path, S3Path::FromString(s)); FileInfo info; @@ -2403,8 +2415,8 @@ Status S3FileSystem::DeleteDir(const std::string& s) { impl_->client_->DeleteBucket(req)); } else if (path.key.empty()) { return Status::IOError( - "Permission denied: delete bucket '", path.bucket, "'. ", - "To delete buckets, enable allow_create_buckets option on filesystem."); + "Would delete bucket: '", path.bucket, "'. ", + "To delete buckets, enable the allow_create_buckets option."); } else { // Delete "directory" RETURN_NOT_OK(impl_->DeleteObject(path.bucket, path.key + kSep)); diff --git a/cpp/src/arrow/filesystem/s3fs.h b/cpp/src/arrow/filesystem/s3fs.h index bc4f8e46473..aa2dbafeaad 100644 --- a/cpp/src/arrow/filesystem/s3fs.h +++ b/cpp/src/arrow/filesystem/s3fs.h @@ -221,6 +221,8 @@ class ARROW_EXPORT S3FileSystem : public FileSystem { /// Return the actual region this filesystem connects to std::string region() const; + void allow_create_buckets(bool allow); + bool Equals(const FileSystem& other) const override; /// \cond FALSE diff --git a/python/pyarrow/_s3fs.pyx b/python/pyarrow/_s3fs.pyx index ced0877bde3..77f5d5ae36f 100644 --- a/python/pyarrow/_s3fs.pyx +++ b/python/pyarrow/_s3fs.pyx @@ -317,3 +317,11 @@ cdef class S3FileSystem(FileSystem): The AWS region this filesystem connects to. 
""" return frombytes(self.s3fs.region()) + + @property + def allow_create_buckets(self): + return self.s3fs.options().allow_create_buckets + + @allow_create_buckets.setter + def allow_create_buckets(self, value): + self.s3fs.allow_create_buckets(value) diff --git a/python/pyarrow/includes/libarrow_fs.pxd b/python/pyarrow/includes/libarrow_fs.pxd index 399738ea23d..89716e07801 100644 --- a/python/pyarrow/includes/libarrow_fs.pxd +++ b/python/pyarrow/includes/libarrow_fs.pxd @@ -194,6 +194,7 @@ cdef extern from "arrow/filesystem/api.h" namespace "arrow::fs" nogil: CResult[shared_ptr[CS3FileSystem]] Make(const CS3Options& options) CS3Options options() c_string region() + void allow_create_buckets(c_bool allow) cdef CStatus CInitializeS3 "arrow::fs::InitializeS3"( const CS3GlobalOptions& options) diff --git a/python/pyarrow/tests/test_dataset.py b/python/pyarrow/tests/test_dataset.py index 40f22045b6a..72db1b703e9 100644 --- a/python/pyarrow/tests/test_dataset.py +++ b/python/pyarrow/tests/test_dataset.py @@ -2490,7 +2490,7 @@ def test_dataset_partitioned_dictionary_type_reconstruct(tempdir): @pytest.fixture @pytest.mark.parquet def s3_example_simple(s3_server): - from pyarrow.fs import S3FileSystem + from pyarrow.fs import FileSystem host, port, access_key, secret_key = s3_server['connection'] uri = ( @@ -2498,14 +2498,8 @@ def s3_example_simple(s3_server): .format(access_key, secret_key, host, port) ) - fs = S3FileSystem( - access_key=access_key, - secret_key=secret_key, - scheme="http", - endpoint_override="{}:{}".format(host, port), - allow_create_buckets=True, - ) - path = "mybucket/data.parquet" + fs, path = FileSystem.from_uri(uri) + fs.allow_create_buckets = True fs.create_dir("mybucket") table = pa.table({'a': [1, 2, 3]}) @@ -2559,7 +2553,7 @@ def test_open_dataset_from_uri_s3_fsspec(s3_example_simple): @pytest.mark.parquet @pytest.mark.s3 def test_open_dataset_from_s3_with_filesystem_uri(s3_server): - from pyarrow.fs import S3FileSystem + from pyarrow.fs import FileSystem host, port, access_key, secret_key = s3_server['connection'] bucket = 'theirbucket' @@ -2568,17 +2562,12 @@ def test_open_dataset_from_s3_with_filesystem_uri(s3_server): access_key, secret_key, bucket, path, host, port ) - # At first need a fs that can create buckets - fs = S3FileSystem( - access_key=access_key, - secret_key=secret_key, - scheme="http", - endpoint_override="{}:{}".format(host, port), - allow_create_buckets=True, - ) + fs, path = FileSystem.from_uri(uri) + assert path == 'theirbucket/nested/folder/data.parquet' + + fs.allow_create_buckets = True fs.create_dir(bucket) - path = bucket + '/' + path table = pa.table({'a': [1, 2, 3]}) with fs.open_output_stream(path) as out: pq.write_table(table, out) diff --git a/python/pyarrow/tests/test_fs.py b/python/pyarrow/tests/test_fs.py index 0b0e967c4b3..110766facf4 100644 --- a/python/pyarrow/tests/test_fs.py +++ b/python/pyarrow/tests/test_fs.py @@ -1319,6 +1319,12 @@ def test_filesystem_from_uri_s3(s3_server): assert isinstance(fs, S3FileSystem) assert path == "mybucket/foo/bar" + fs.allow_create_buckets = True + fs.create_dir(path) + [info] = fs.get_file_info([path]) + assert info.path == path + assert info.type == FileType.Directory + def test_py_filesystem(): handler = DummyHandler() diff --git a/r/R/arrowExports.R b/r/R/arrowExports.R index 12a3d3c4e8e..63d6e03bc31 100644 --- a/r/R/arrowExports.R +++ b/r/R/arrowExports.R @@ -1288,6 +1288,10 @@ fs___S3FileSystem__region <- function(fs) { .Call(`_arrow_fs___S3FileSystem__region`, fs) } 
+fs__S3FileSystem__allow_create_buckets <- function(fs, allow) { + invisible(.Call(`_arrow_fs__S3FileSystem__allow_create_buckets`, fs, allow)) +} + io___Readable__Read <- function(x, nbytes) { .Call(`_arrow_io___Readable__Read`, x, nbytes) } diff --git a/r/R/filesystem.R b/r/R/filesystem.R index 3af95cc013a..1dedfc2a334 100644 --- a/r/R/filesystem.R +++ b/r/R/filesystem.R @@ -333,6 +333,9 @@ S3FileSystem <- R6Class("S3FileSystem", inherit = FileSystem, active = list( region = function() fs___S3FileSystem__region(self) + ), + public = list( + allow_create_buckets = function(allow) fs__S3FileSystem__allow_create_buckets(self, allow) ) ) S3FileSystem$create <- function(anonymous = FALSE, ...) { diff --git a/r/src/arrowExports.cpp b/r/src/arrowExports.cpp index 79eab9cd2f4..a016cda4a8f 100644 --- a/r/src/arrowExports.cpp +++ b/r/src/arrowExports.cpp @@ -3221,6 +3221,23 @@ extern "C" SEXP _arrow_fs___S3FileSystem__region(SEXP fs_sexp){ } #endif +// filesystem.cpp +#if defined(ARROW_R_WITH_S3) +void fs__S3FileSystem__allow_create_buckets(const std::shared_ptr& fs, bool allow); +extern "C" SEXP _arrow_fs__S3FileSystem__allow_create_buckets(SEXP fs_sexp, SEXP allow_sexp){ +BEGIN_CPP11 + arrow::r::Input&>::type fs(fs_sexp); + arrow::r::Input::type allow(allow_sexp); + fs__S3FileSystem__allow_create_buckets(fs, allow); + return R_NilValue; +END_CPP11 +} +#else +extern "C" SEXP _arrow_fs__S3FileSystem__allow_create_buckets(SEXP fs_sexp, SEXP allow_sexp){ + Rf_error("Cannot call fs__S3FileSystem__allow_create_buckets(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. "); +} +#endif + // io.cpp std::shared_ptr io___Readable__Read(const std::shared_ptr& x, int64_t nbytes); extern "C" SEXP _arrow_io___Readable__Read(SEXP x_sexp, SEXP nbytes_sexp){ @@ -5435,6 +5452,7 @@ static const R_CallMethodDef CallEntries[] = { { "_arrow_fs___CopyFiles", (DL_FUNC) &_arrow_fs___CopyFiles, 6}, { "_arrow_fs___S3FileSystem__create", (DL_FUNC) &_arrow_fs___S3FileSystem__create, 14}, { "_arrow_fs___S3FileSystem__region", (DL_FUNC) &_arrow_fs___S3FileSystem__region, 1}, + { "_arrow_fs__S3FileSystem__allow_create_buckets", (DL_FUNC) &_arrow_fs__S3FileSystem__allow_create_buckets, 2}, { "_arrow_io___Readable__Read", (DL_FUNC) &_arrow_io___Readable__Read, 2}, { "_arrow_io___InputStream__Close", (DL_FUNC) &_arrow_io___InputStream__Close, 1}, { "_arrow_io___OutputStream__Close", (DL_FUNC) &_arrow_io___OutputStream__Close, 1}, diff --git a/r/src/filesystem.cpp b/r/src/filesystem.cpp index bc9658b0f47..08bea1c5ca8 100644 --- a/r/src/filesystem.cpp +++ b/r/src/filesystem.cpp @@ -333,4 +333,9 @@ std::string fs___S3FileSystem__region(const std::shared_ptr& f return fs->region(); } +// [[s3::export]] +void fs__S3FileSystem__allow_create_buckets(const std::shared_ptr& fs, bool allow) { + fs->allow_create_buckets(allow); +} + #endif diff --git a/r/tests/testthat/test-s3-minio.R b/r/tests/testthat/test-s3-minio.R index 356b6754dc4..62bac24309c 100644 --- a/r/tests/testthat/test-s3-minio.R +++ b/r/tests/testthat/test-s3-minio.R @@ -181,6 +181,22 @@ if (arrow_with_s3() && process_is_running("minio server")) { expect_length(fs$ls(minio_path("new_dataset_dir")), 1) }) + test_that("CreateDir fails on bucket if allow_create_buckets=False", { + now_tmp <- paste0(now, "-test-fail-delete") + fs$CreateDir(now_tmp) + + fs$allow_create_buckets(FALSE) + expect_error( + fs$CreateDir("should-fail"), + regexp = "Bucket does not exist" + ) + expect_error( + fs$DeleteDir(now_tmp), + regexp = "Would 
delete bucket" + ) + fs$allow_create_buckets(TRUE) + }) + test_that("Let's test copy_files too", { td <- make_temp_dir() copy_files(minio_uri("hive_dir"), td) From 3e319e20e9f534fa594dd3dcbd52df4192ef2507 Mon Sep 17 00:00:00 2001 From: Will Jones Date: Fri, 20 May 2022 14:15:42 -0700 Subject: [PATCH 06/18] Document methods --- python/pyarrow/_s3fs.pyx | 3 +++ r/R/filesystem.R | 4 ++++ 2 files changed, 7 insertions(+) diff --git a/python/pyarrow/_s3fs.pyx b/python/pyarrow/_s3fs.pyx index 77f5d5ae36f..a812787dc3e 100644 --- a/python/pyarrow/_s3fs.pyx +++ b/python/pyarrow/_s3fs.pyx @@ -320,6 +320,9 @@ cdef class S3FileSystem(FileSystem): @property def allow_create_buckets(self): + """ + Whether to allow CreateDir and DeleteDir at the bucket-level. + """ return self.s3fs.options().allow_create_buckets @allow_create_buckets.setter diff --git a/r/R/filesystem.R b/r/R/filesystem.R index 1dedfc2a334..d47f817db4e 100644 --- a/r/R/filesystem.R +++ b/r/R/filesystem.R @@ -182,6 +182,10 @@ FileSelector$create <- function(base_dir, allow_not_found = FALSE, recursive = F #' sequential writing. #' - `$OpenAppendStream(path)`: Open an [output stream][OutputStream] for #' appending. +#' +#' `S3FileSystem` also has a method: +#' +#' - `$allow_create_buckets()` which can set the corresponding option above. #' #' @section Active bindings: #' From 0a65bb82bd823e0cb6f04b4d8ea9d99da681af1e Mon Sep 17 00:00:00 2001 From: Will Jones Date: Fri, 20 May 2022 14:42:30 -0700 Subject: [PATCH 07/18] Format --- cpp/src/arrow/filesystem/s3fs.cc | 13 ++++--------- r/R/filesystem.R | 6 +++--- r/src/filesystem.cpp | 3 ++- 3 files changed, 9 insertions(+), 13 deletions(-) diff --git a/cpp/src/arrow/filesystem/s3fs.cc b/cpp/src/arrow/filesystem/s3fs.cc index 54b80616c54..9765858287f 100644 --- a/cpp/src/arrow/filesystem/s3fs.cc +++ b/cpp/src/arrow/filesystem/s3fs.cc @@ -763,9 +763,7 @@ class ClientBuilder { const S3Options& options() const { return options_; } - void allow_create_buckets(bool allow) { - options_.allow_create_buckets = allow; - } + void allow_create_buckets(bool allow) { options_.allow_create_buckets = allow; } protected: S3Options options_; @@ -1655,9 +1653,7 @@ class S3FileSystem::Impl : public std::enable_shared_from_this void SaveBackend(const Aws::Client::AWSError& error) { @@ -2414,9 +2410,8 @@ Status S3FileSystem::DeleteDir(const std::string& s) { std::forward_as_tuple("When deleting bucket '", path.bucket, "': "), impl_->client_->DeleteBucket(req)); } else if (path.key.empty()) { - return Status::IOError( - "Would delete bucket: '", path.bucket, "'. ", - "To delete buckets, enable the allow_create_buckets option."); + return Status::IOError("Would delete bucket: '", path.bucket, "'. ", + "To delete buckets, enable the allow_create_buckets option."); } else { // Delete "directory" RETURN_NOT_OK(impl_->DeleteObject(path.bucket, path.key + kSep)); diff --git a/r/R/filesystem.R b/r/R/filesystem.R index d47f817db4e..d51a1d8b1e8 100644 --- a/r/R/filesystem.R +++ b/r/R/filesystem.R @@ -182,10 +182,10 @@ FileSelector$create <- function(base_dir, allow_not_found = FALSE, recursive = F #' sequential writing. #' - `$OpenAppendStream(path)`: Open an [output stream][OutputStream] for #' appending. -#' +#' #' `S3FileSystem` also has a method: -#' -#' - `$allow_create_buckets()` which can set the corresponding option above. +#' +#' - `$allow_create_buckets()` which can set the corresponding option above. 
#' #' @section Active bindings: #' diff --git a/r/src/filesystem.cpp b/r/src/filesystem.cpp index 08bea1c5ca8..687f4c8b883 100644 --- a/r/src/filesystem.cpp +++ b/r/src/filesystem.cpp @@ -334,7 +334,8 @@ std::string fs___S3FileSystem__region(const std::shared_ptr& f } // [[s3::export]] -void fs__S3FileSystem__allow_create_buckets(const std::shared_ptr& fs, bool allow) { +void fs__S3FileSystem__allow_create_buckets(const std::shared_ptr& fs, + bool allow) { fs->allow_create_buckets(allow); } From c52568fc85b9c56e3b1fe942a0513104a9f8f742 Mon Sep 17 00:00:00 2001 From: Will Jones Date: Fri, 20 May 2022 15:01:50 -0700 Subject: [PATCH 08/18] Adjust test expectations --- python/pyarrow/tests/test_dataset.py | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/python/pyarrow/tests/test_dataset.py b/python/pyarrow/tests/test_dataset.py index 72db1b703e9..445f256d457 100644 --- a/python/pyarrow/tests/test_dataset.py +++ b/python/pyarrow/tests/test_dataset.py @@ -4531,9 +4531,21 @@ def test_write_dataset_s3_put_only(s3_server): ).to_table() assert result.equals(table) - with pytest.raises(OSError, match="Access Denied"): + # Passing create_dir is fine if the bucket already exists + ds.write_dataset( + table, "existing-bucket", filesystem=fs, + format="feather", create_dir=True, partitioning=part, + existing_data_behavior='overwrite_or_ignore' + ) + # check roundtrip + result = ds.dataset( + "existing-bucket", filesystem=fs, format="ipc", partitioning="hive" + ).to_table() + assert result.equals(table) + + with pytest.raises(OSError, match="Bucket does not exist"): ds.write_dataset( - table, "existing-bucket", filesystem=fs, + table, "non-existing-bucket", filesystem=fs, format="feather", create_dir=True, existing_data_behavior='overwrite_or_ignore' ) From 25adf4aaeaf0a6ddc5ec615c07e91225abe37ac4 Mon Sep 17 00:00:00 2001 From: Will Jones Date: Mon, 23 May 2022 09:54:54 -0700 Subject: [PATCH 09/18] feat: allow_create_buckets supported in S3Options::FromURI() --- cpp/src/arrow/filesystem/s3fs.cc | 4 ++++ python/pyarrow/tests/test_dataset.py | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/cpp/src/arrow/filesystem/s3fs.cc b/cpp/src/arrow/filesystem/s3fs.cc index 9765858287f..7332cf93287 100644 --- a/cpp/src/arrow/filesystem/s3fs.cc +++ b/cpp/src/arrow/filesystem/s3fs.cc @@ -91,6 +91,7 @@ #include "arrow/util/key_value_metadata.h" #include "arrow/util/logging.h" #include "arrow/util/optional.h" +#include "arrow/util/string.h" #include "arrow/util/task_group.h" #include "arrow/util/thread_pool.h" @@ -353,6 +354,9 @@ Result S3Options::FromUri(const Uri& uri, std::string* out_path) { options.scheme = kv.second; } else if (kv.first == "endpoint_override") { options.endpoint_override = kv.second; + } else if (kv.first == "allow_create_buckets") { + options.allow_create_buckets = + ::arrow::internal::AsciiEqualsCaseInsensitive(kv.second, "true"); } else { return Status::Invalid("Unexpected query parameter in S3 URI: '", kv.first, "'"); } diff --git a/python/pyarrow/tests/test_dataset.py b/python/pyarrow/tests/test_dataset.py index 445f256d457..88612406271 100644 --- a/python/pyarrow/tests/test_dataset.py +++ b/python/pyarrow/tests/test_dataset.py @@ -2495,11 +2495,11 @@ def s3_example_simple(s3_server): host, port, access_key, secret_key = s3_server['connection'] uri = ( "s3://{}:{}@mybucket/data.parquet?scheme=http&endpoint_override={}:{}" + "&allow_create_buckets=True" .format(access_key, secret_key, host, port) ) fs, path = FileSystem.from_uri(uri) - 
fs.allow_create_buckets = True fs.create_dir("mybucket") table = pa.table({'a': [1, 2, 3]}) From b961c2635e935c3d2a1008ee6e4a36fdc403d466 Mon Sep 17 00:00:00 2001 From: Will Jones Date: Mon, 23 May 2022 10:40:44 -0700 Subject: [PATCH 10/18] doc: Guide users to create buckets some other way --- cpp/src/arrow/filesystem/s3fs.h | 7 ++++++- python/pyarrow/_s3fs.pyx | 9 ++++++++- r/R/filesystem.R | 9 +++++++++ r/man/FileSystem.Rd | 16 ++++++++++++++++ 4 files changed, 39 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/filesystem/s3fs.h b/cpp/src/arrow/filesystem/s3fs.h index aa2dbafeaad..28df1554c39 100644 --- a/cpp/src/arrow/filesystem/s3fs.h +++ b/cpp/src/arrow/filesystem/s3fs.h @@ -130,7 +130,12 @@ struct ARROW_EXPORT S3Options { /// Whether OutputStream writes will be issued in the background, without blocking. bool background_writes = true; - /// Whether to allow creation of new buckets + /// Whether to allow creation or deletion of buckets + /// + /// When S3FileSystem creates new buckets, it does not pass any non-default settings. + /// In AWS S3, the bucket and all objects will be not publicly visible, and there + /// will be no bucket policies and no resource tags. To have more control over how + /// buckets are created, use a different API to create them. bool allow_create_buckets = false; /// \brief Default metadata for OpenOutputStream. diff --git a/python/pyarrow/_s3fs.pyx b/python/pyarrow/_s3fs.pyx index a812787dc3e..75c63fff2ab 100644 --- a/python/pyarrow/_s3fs.pyx +++ b/python/pyarrow/_s3fs.pyx @@ -99,6 +99,12 @@ cdef class S3FileSystem(FileSystem): Note: S3 buckets are special and the operations available on them may be limited or more expensive than desired. + When S3FileSystem creates new buckets (assuming allow_create_buckets is True), + it does not pass any non-default settings. In AWS S3, the bucket and all + objects will be not publicly visible, and will have no bucket policies + and no resource tags. To have more control over how buckets are created, + use a different API to create them. + Parameters ---------- access_key : str, default None @@ -151,7 +157,8 @@ cdef class S3FileSystem(FileSystem): 'port': 8020, 'username': 'username', 'password': 'password'}) allow_create_buckets : bool, default False - If True, allows creating and deleting buckets. + If True, allows creating and deleting buckets. This option may also be + passed in a URI query parameter. """ cdef: diff --git a/r/R/filesystem.R b/r/R/filesystem.R index d51a1d8b1e8..38e42d69cae 100644 --- a/r/R/filesystem.R +++ b/r/R/filesystem.R @@ -196,6 +196,15 @@ FileSelector$create <- function(base_dir, allow_not_found = FALSE, recursive = F #' - `$base_path`: for `SubTreeFileSystem`, the path in `$base_fs` which is considered #' root in this `SubTreeFileSystem`. #' +#' @section Notes: +#' +#' On S3FileSystem, `$CreateDir()` on a top-level directory creates a new bucket. +#' When S3FileSystem creates new buckets (assuming allow_create_buckets is TRUE), +#' it does not pass any non-default settings. In AWS S3, the bucket and all +#' objects will be not publicly visible, and will have no bucket policies +#' and no resource tags. To have more control over how buckets are created, +#' use a different API to create them. +#' #' @usage NULL #' @format NULL #' @docType class diff --git a/r/man/FileSystem.Rd b/r/man/FileSystem.Rd index 8f867fc794c..67f69a8bb1c 100644 --- a/r/man/FileSystem.Rd +++ b/r/man/FileSystem.Rd @@ -86,6 +86,11 @@ sequential writing. 
\item \verb{$OpenAppendStream(path)}: Open an \link[=OutputStream]{output stream} for appending. } + +\code{S3FileSystem} also has a method: +\itemize{ +\item \verb{$allow_create_buckets()} which can set the corresponding option above. +} } \section{Active bindings}{ @@ -100,3 +105,14 @@ root in this \code{SubTreeFileSystem}. } } +\section{Notes}{ + + +On S3FileSystem, \verb{$CreateDir()} on a top-level directory creates a new bucket. +When S3FileSystem creates new buckets (assuming allow_create_buckets is TRUE), +it does not pass any non-default settings. In AWS S3, the bucket and all +objects will be not publicly visible, and will have no bucket policies +and no resource tags. To have more control over how buckets are created, +use a different API to create them. +} + From b1bb3bc09ce301d23e5d32ecd392a7b091679c9b Mon Sep 17 00:00:00 2001 From: Will Jones Date: Wed, 25 May 2022 10:18:47 -0700 Subject: [PATCH 11/18] Add back original test --- python/pyarrow/tests/test_dataset.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/python/pyarrow/tests/test_dataset.py b/python/pyarrow/tests/test_dataset.py index 88612406271..72e9edf6b9c 100644 --- a/python/pyarrow/tests/test_dataset.py +++ b/python/pyarrow/tests/test_dataset.py @@ -4543,6 +4543,7 @@ def test_write_dataset_s3_put_only(s3_server): ).to_table() assert result.equals(table) + # Error enforced by filesystem with pytest.raises(OSError, match="Bucket does not exist"): ds.write_dataset( table, "non-existing-bucket", filesystem=fs, @@ -4550,6 +4551,15 @@ def test_write_dataset_s3_put_only(s3_server): existing_data_behavior='overwrite_or_ignore' ) + # Error enforced by minio / S3 service + fs.allow_create_buckets = True + with pytest.raises(OSError, match="Access Denied"): + ds.write_dataset( + table, "non-existing-bucket", filesystem=fs, + format="feather", create_dir=True, + existing_data_behavior='overwrite_or_ignore' + ) + @pytest.mark.parquet def test_dataset_null_to_dictionary_cast(tempdir, dataset_reader): From 749300b16a4a6461ed3b42b9d0b5c47a96542700 Mon Sep 17 00:00:00 2001 From: Will Jones Date: Tue, 31 May 2022 15:47:21 -0700 Subject: [PATCH 12/18] chore: change to two separate settings --- cpp/src/arrow/filesystem/s3fs.cc | 29 +++++++++++------ cpp/src/arrow/filesystem/s3fs.h | 11 +++++-- cpp/src/arrow/filesystem/s3fs_test.cc | 6 ++-- python/pyarrow/_s3fs.pyx | 43 +++++++++++++++++-------- python/pyarrow/includes/libarrow_fs.pxd | 6 ++-- python/pyarrow/tests/test_dataset.py | 6 ++-- python/pyarrow/tests/test_fs.py | 8 +++-- r/R/arrowExports.R | 12 ++++--- r/R/filesystem.R | 22 +++++++------ r/man/FileSystem.Rd | 14 ++++---- r/src/arrowExports.cpp | 43 ++++++++++++++++++------- r/src/filesystem.cpp | 17 +++++++--- r/tests/testthat/test-s3-minio.R | 11 ++++--- 13 files changed, 153 insertions(+), 75 deletions(-) diff --git a/cpp/src/arrow/filesystem/s3fs.cc b/cpp/src/arrow/filesystem/s3fs.cc index 7332cf93287..43eca71c2d5 100644 --- a/cpp/src/arrow/filesystem/s3fs.cc +++ b/cpp/src/arrow/filesystem/s3fs.cc @@ -354,8 +354,11 @@ Result S3Options::FromUri(const Uri& uri, std::string* out_path) { options.scheme = kv.second; } else if (kv.first == "endpoint_override") { options.endpoint_override = kv.second; - } else if (kv.first == "allow_create_buckets") { - options.allow_create_buckets = + } else if (kv.first == "allow_bucket_creation") { + options.allow_bucket_creation = + ::arrow::internal::AsciiEqualsCaseInsensitive(kv.second, "true"); + } else if (kv.first == "allow_bucket_deletion") { + 
options.allow_bucket_deletion = ::arrow::internal::AsciiEqualsCaseInsensitive(kv.second, "true"); } else { return Status::Invalid("Unexpected query parameter in S3 URI: '", kv.first, "'"); @@ -767,7 +770,8 @@ class ClientBuilder { const S3Options& options() const { return options_; } - void allow_create_buckets(bool allow) { options_.allow_create_buckets = allow; } + void allow_bucket_creation(bool allow) { options_.allow_bucket_creation = allow; } + void allow_bucket_deletion(bool allow) { options_.allow_bucket_deletion = allow; } protected: S3Options options_; @@ -1657,7 +1661,8 @@ class S3FileSystem::Impl : public std::enable_shared_from_this void SaveBackend(const Aws::Client::AWSError& error) { @@ -1699,10 +1704,10 @@ class S3FileSystem::Impl : public std::enable_shared_from_thisoptions(); } std::string S3FileSystem::region() const { return impl_->region(); } -void S3FileSystem::allow_create_buckets(bool allow) { - impl_->allow_create_buckets(allow); +void S3FileSystem::allow_bucket_creation(bool allow) { + impl_->allow_bucket_creation(allow); +} + +void S3FileSystem::allow_bucket_deletion(bool allow) { + impl_->allow_bucket_deletion(allow); } Result S3FileSystem::GetFileInfo(const std::string& s) { @@ -2406,7 +2415,7 @@ Status S3FileSystem::DeleteDir(const std::string& s) { return Status::NotImplemented("Cannot delete all S3 buckets"); } RETURN_NOT_OK(impl_->DeleteDirContentsAsync(path.bucket, path.key).status()); - if (path.key.empty() && options().allow_create_buckets) { + if (path.key.empty() && options().allow_bucket_deletion) { // Delete bucket S3Model::DeleteBucketRequest req; req.SetBucket(ToAwsString(path.bucket)); @@ -2415,7 +2424,7 @@ Status S3FileSystem::DeleteDir(const std::string& s) { impl_->client_->DeleteBucket(req)); } else if (path.key.empty()) { return Status::IOError("Would delete bucket: '", path.bucket, "'. ", - "To delete buckets, enable the allow_create_buckets option."); + "To delete buckets, enable the allow_bucket_deletion option."); } else { // Delete "directory" RETURN_NOT_OK(impl_->DeleteObject(path.bucket, path.key + kSep)); diff --git a/cpp/src/arrow/filesystem/s3fs.h b/cpp/src/arrow/filesystem/s3fs.h index 28df1554c39..ffca2c04a50 100644 --- a/cpp/src/arrow/filesystem/s3fs.h +++ b/cpp/src/arrow/filesystem/s3fs.h @@ -136,7 +136,10 @@ struct ARROW_EXPORT S3Options { /// In AWS S3, the bucket and all objects will be not publicly visible, and there /// will be no bucket policies and no resource tags. To have more control over how /// buckets are created, use a different API to create them. - bool allow_create_buckets = false; + bool allow_bucket_creation = false; + + /// Whether to allow deletion of buckets + bool allow_bucket_deletion = false; /// \brief Default metadata for OpenOutputStream. 
/// @@ -226,7 +229,11 @@ class ARROW_EXPORT S3FileSystem : public FileSystem { /// Return the actual region this filesystem connects to std::string region() const; - void allow_create_buckets(bool allow); + /// Set create_buckets property of options + void allow_bucket_creation(bool allow); + + /// Set delete_buckets property of options + void allow_bucket_deletion(bool allow); bool Equals(const FileSystem& other) const override; diff --git a/cpp/src/arrow/filesystem/s3fs_test.cc b/cpp/src/arrow/filesystem/s3fs_test.cc index 4c715451974..fd628234bfd 100644 --- a/cpp/src/arrow/filesystem/s3fs_test.cc +++ b/cpp/src/arrow/filesystem/s3fs_test.cc @@ -406,7 +406,8 @@ class TestS3FS : public S3TestMixin { void SetUp() override { S3TestMixin::SetUp(); // Most tests will create buckets - options_.allow_create_buckets = true; + options_.allow_bucket_creation = true; + options_.allow_bucket_deletion = true; MakeFileSystem(); // Set up test bucket { @@ -1130,7 +1131,8 @@ TEST_F(TestS3FS, NoCreateDeleteBucket) { // Create a bucket to try deleting ASSERT_OK(fs_->CreateDir("test-no-delete")); - options_.allow_create_buckets = false; + options_.allow_bucket_creation = false; + options_.allow_bucket_deletion = false; MakeFileSystem(); ASSERT_RAISES(IOError, fs_->CreateDir("test-no-create")); ASSERT_RAISES(IOError, fs_->DeleteDir("test-no-delete")); diff --git a/python/pyarrow/_s3fs.pyx b/python/pyarrow/_s3fs.pyx index 75c63fff2ab..d16610bc171 100644 --- a/python/pyarrow/_s3fs.pyx +++ b/python/pyarrow/_s3fs.pyx @@ -99,9 +99,9 @@ cdef class S3FileSystem(FileSystem): Note: S3 buckets are special and the operations available on them may be limited or more expensive than desired. - When S3FileSystem creates new buckets (assuming allow_create_buckets is True), - it does not pass any non-default settings. In AWS S3, the bucket and all - objects will be not publicly visible, and will have no bucket policies + When S3FileSystem creates new buckets (assuming allow_bucket_creation is + True), it does not pass any non-default settings. In AWS S3, the bucket and + all objects will be not publicly visible, and will have no bucket policies and no resource tags. To have more control over how buckets are created, use a different API to create them. @@ -156,9 +156,12 @@ cdef class S3FileSystem(FileSystem): S3FileSystem(proxy_options={'scheme': 'http', 'host': 'localhost', 'port': 8020, 'username': 'username', 'password': 'password'}) - allow_create_buckets : bool, default False - If True, allows creating and deleting buckets. This option may also be - passed in a URI query parameter. + allow_bucket_creation : bool, default False + If True, allows creating buckets. This option may also be passed in a + URI query parameter. + allow_bucket_deletion : bool, default False + If True, allows deleting buckets. This option may also be passed in a + URI query parameter. 
""" cdef: @@ -169,7 +172,7 @@ cdef class S3FileSystem(FileSystem): endpoint_override=None, bint background_writes=True, default_metadata=None, role_arn=None, session_name=None, external_id=None, load_frequency=900, proxy_options=None, - allow_create_buckets=False): + allow_bucket_creation=False, allow_bucket_deletion=False): cdef: CS3Options options shared_ptr[CS3FileSystem] wrapped @@ -263,7 +266,8 @@ cdef class S3FileSystem(FileSystem): "'proxy_options': expected 'dict' or 'str', " f"got {type(proxy_options)} instead.") - options.allow_create_buckets = allow_create_buckets + options.allow_bucket_creation = allow_bucket_creation + options.allow_bucket_deletion = allow_bucket_deletion with nogil: wrapped = GetResultValue(CS3FileSystem.Make(options)) @@ -326,12 +330,23 @@ cdef class S3FileSystem(FileSystem): return frombytes(self.s3fs.region()) @property - def allow_create_buckets(self): + def allow_bucket_creation(self): """ - Whether to allow CreateDir and DeleteDir at the bucket-level. + Whether to allow CreateDir at the bucket-level. """ - return self.s3fs.options().allow_create_buckets + return self.s3fs.options().allow_bucket_creation - @allow_create_buckets.setter - def allow_create_buckets(self, value): - self.s3fs.allow_create_buckets(value) + @allow_bucket_creation.setter + def allow_bucket_creation(self, value): + self.s3fs.allow_bucket_creation(value) + + @property + def allow_bucket_deletion(self): + """ + Whether to allow DeleteDir at the bucket-level. + """ + return self.s3fs.options().allow_bucket_deletion + + @allow_bucket_deletion.setter + def allow_bucket_deletion(self, value): + self.s3fs.allow_bucket_deletion(value) diff --git a/python/pyarrow/includes/libarrow_fs.pxd b/python/pyarrow/includes/libarrow_fs.pxd index 89716e07801..313f3af06db 100644 --- a/python/pyarrow/includes/libarrow_fs.pxd +++ b/python/pyarrow/includes/libarrow_fs.pxd @@ -155,7 +155,8 @@ cdef extern from "arrow/filesystem/api.h" namespace "arrow::fs" nogil: c_string endpoint_override c_string scheme c_bool background_writes - c_bool allow_create_buckets + c_bool allow_bucket_creation + c_bool allow_bucket_deletion shared_ptr[const CKeyValueMetadata] default_metadata c_string role_arn c_string session_name @@ -194,7 +195,8 @@ cdef extern from "arrow/filesystem/api.h" namespace "arrow::fs" nogil: CResult[shared_ptr[CS3FileSystem]] Make(const CS3Options& options) CS3Options options() c_string region() - void allow_create_buckets(c_bool allow) + void allow_bucket_creation(c_bool allow) + void allow_bucket_deletion(c_bool allow) cdef CStatus CInitializeS3 "arrow::fs::InitializeS3"( const CS3GlobalOptions& options) diff --git a/python/pyarrow/tests/test_dataset.py b/python/pyarrow/tests/test_dataset.py index 72e9edf6b9c..3299ca2c32e 100644 --- a/python/pyarrow/tests/test_dataset.py +++ b/python/pyarrow/tests/test_dataset.py @@ -2495,7 +2495,7 @@ def s3_example_simple(s3_server): host, port, access_key, secret_key = s3_server['connection'] uri = ( "s3://{}:{}@mybucket/data.parquet?scheme=http&endpoint_override={}:{}" - "&allow_create_buckets=True" + "&allow_bucket_creation=True" .format(access_key, secret_key, host, port) ) @@ -2565,7 +2565,7 @@ def test_open_dataset_from_s3_with_filesystem_uri(s3_server): fs, path = FileSystem.from_uri(uri) assert path == 'theirbucket/nested/folder/data.parquet' - fs.allow_create_buckets = True + fs.allow_bucket_creation = True fs.create_dir(bucket) table = pa.table({'a': [1, 2, 3]}) @@ -4552,7 +4552,7 @@ def test_write_dataset_s3_put_only(s3_server): ) # Error enforced by 
minio / S3 service - fs.allow_create_buckets = True + fs.allow_bucket_creation = True with pytest.raises(OSError, match="Access Denied"): ds.write_dataset( table, "non-existing-bucket", filesystem=fs, diff --git a/python/pyarrow/tests/test_fs.py b/python/pyarrow/tests/test_fs.py index 110766facf4..e486ab4e87d 100644 --- a/python/pyarrow/tests/test_fs.py +++ b/python/pyarrow/tests/test_fs.py @@ -213,7 +213,8 @@ def s3fs(request, s3_server): secret_key=secret_key, endpoint_override='{}:{}'.format(host, port), scheme='http', - allow_create_buckets=True + allow_bucket_creation=True, + allow_bucket_deletion=True ) fs.create_dir(bucket) @@ -446,6 +447,9 @@ def test_s3fs_limited_permissions_create_bucket(s3_server): with pytest.raises(pa.ArrowIOError): fs.create_dir('new-bucket') + + with pytest.raises(pa.ArrowIOError): + fs.delete_dir('existing-bucket') def test_file_info_constructor(): @@ -1319,7 +1323,7 @@ def test_filesystem_from_uri_s3(s3_server): assert isinstance(fs, S3FileSystem) assert path == "mybucket/foo/bar" - fs.allow_create_buckets = True + fs.allow_bucket_creation = True fs.create_dir(path) [info] = fs.get_file_info([path]) assert info.path == path diff --git a/r/R/arrowExports.R b/r/R/arrowExports.R index 63d6e03bc31..179205e60b6 100644 --- a/r/R/arrowExports.R +++ b/r/R/arrowExports.R @@ -1280,16 +1280,20 @@ fs___CopyFiles <- function(source_fs, source_sel, destination_fs, destination_ba invisible(.Call(`_arrow_fs___CopyFiles`, source_fs, source_sel, destination_fs, destination_base_dir, chunk_size, use_threads)) } -fs___S3FileSystem__create <- function(anonymous, access_key, secret_key, session_token, role_arn, session_name, external_id, load_frequency, region, endpoint_override, scheme, proxy_options, background_writes, allow_create_buckets) { - .Call(`_arrow_fs___S3FileSystem__create`, anonymous, access_key, secret_key, session_token, role_arn, session_name, external_id, load_frequency, region, endpoint_override, scheme, proxy_options, background_writes, allow_create_buckets) +fs___S3FileSystem__create <- function(anonymous, access_key, secret_key, session_token, role_arn, session_name, external_id, load_frequency, region, endpoint_override, scheme, proxy_options, background_writes, allow_bucket_creation, allow_bucket_deletion) { + .Call(`_arrow_fs___S3FileSystem__create`, anonymous, access_key, secret_key, session_token, role_arn, session_name, external_id, load_frequency, region, endpoint_override, scheme, proxy_options, background_writes, allow_bucket_creation, allow_bucket_deletion) } fs___S3FileSystem__region <- function(fs) { .Call(`_arrow_fs___S3FileSystem__region`, fs) } -fs__S3FileSystem__allow_create_buckets <- function(fs, allow) { - invisible(.Call(`_arrow_fs__S3FileSystem__allow_create_buckets`, fs, allow)) +fs__S3FileSystem__allow_bucket_creation <- function(fs, allow) { + invisible(.Call(`_arrow_fs__S3FileSystem__allow_bucket_creation`, fs, allow)) +} + +fs__S3FileSystem__allow_bucket_deletion <- function(fs, allow) { + invisible(.Call(`_arrow_fs__S3FileSystem__allow_bucket_deletion`, fs, allow)) } io___Readable__Read <- function(x, nbytes) { diff --git a/r/R/filesystem.R b/r/R/filesystem.R index 38e42d69cae..0c7a19c056b 100644 --- a/r/R/filesystem.R +++ b/r/R/filesystem.R @@ -150,9 +150,10 @@ FileSelector$create <- function(base_dir, allow_not_found = FALSE, recursive = F #' - `scheme`: S3 connection transport (default "https") #' - `background_writes`: logical, whether `OutputStream` writes will be issued #' in the background, without blocking (default 
`TRUE`) -#' - `allow_create_buckets`: logical, if TRUE, the filesystem will create or -#' delete buckets if `$CreateDir()` or `$DeleteDir()` is called on the bucket -#' level (default `FALSE`). +#' - `allow_bucket_creation`: logical, if TRUE, the filesystem will create +#' buckets if `$CreateDir()` is called on the bucket level (default `FALSE`). +#' - `allow_bucket_deletion`: logical, if TRUE, the filesystem will delete +#' buckets if `$DeleteDir()` is called on the bucket level (default `FALSE`). #' #' @section Methods: #' @@ -183,9 +184,10 @@ FileSelector$create <- function(base_dir, allow_not_found = FALSE, recursive = F #' - `$OpenAppendStream(path)`: Open an [output stream][OutputStream] for #' appending. #' -#' `S3FileSystem` also has a method: +#' `S3FileSystem` also has methods: #' -#' - `$allow_create_buckets()` which can set the corresponding option above. +#' - `$allow_bucket_creation()` which can set the corresponding option above. +#' - `$allow_bucket_deletion()` which can set the corresponding option above. #' #' @section Active bindings: #' @@ -199,7 +201,7 @@ FileSelector$create <- function(base_dir, allow_not_found = FALSE, recursive = F #' @section Notes: #' #' On S3FileSystem, `$CreateDir()` on a top-level directory creates a new bucket. -#' When S3FileSystem creates new buckets (assuming allow_create_buckets is TRUE), +#' When S3FileSystem creates new buckets (assuming allow_bucket_creation is TRUE), #' it does not pass any non-default settings. In AWS S3, the bucket and all #' objects will be not publicly visible, and will have no bucket policies #' and no resource tags. To have more control over how buckets are created, @@ -348,7 +350,8 @@ S3FileSystem <- R6Class("S3FileSystem", region = function() fs___S3FileSystem__region(self) ), public = list( - allow_create_buckets = function(allow) fs__S3FileSystem__allow_create_buckets(self, allow) + allow_bucket_creation = function(allow) fs__S3FileSystem__allow_bucket_creation(self, allow), + allow_bucket_deletion = function(allow) fs__S3FileSystem__allow_bucket_deletion(self, allow) ) ) S3FileSystem$create <- function(anonymous = FALSE, ...) { @@ -357,7 +360,7 @@ S3FileSystem$create <- function(anonymous = FALSE, ...) { invalid_args <- intersect( c( "access_key", "secret_key", "session_token", "role_arn", "session_name", - "external_id", "load_frequency", "allow_create_buckets" + "external_id", "load_frequency", "allow_bucket_creation", "allow_bucket_deletion" ), names(args) ) @@ -403,7 +406,8 @@ default_s3_options <- list( scheme = "", proxy_options = "", background_writes = TRUE, - allow_create_buckets = FALSE + allow_bucket_creation = FALSE, + allow_bucket_deletion = FALSE ) #' Connect to an AWS S3 bucket diff --git a/r/man/FileSystem.Rd b/r/man/FileSystem.Rd index 67f69a8bb1c..844646b6f8e 100644 --- a/r/man/FileSystem.Rd +++ b/r/man/FileSystem.Rd @@ -50,9 +50,10 @@ that emulate S3. \item \code{scheme}: S3 connection transport (default "https") \item \code{background_writes}: logical, whether \code{OutputStream} writes will be issued in the background, without blocking (default \code{TRUE}) -\item \code{allow_create_buckets}: logical, if TRUE, the filesystem will create or -delete buckets if \verb{$CreateDir()} or \verb{$DeleteDir()} is called on the bucket -level (default \code{FALSE}). +\item \code{allow_bucket_creation}: logical, if TRUE, the filesystem will create +buckets if \verb{$CreateDir()} is called on the bucket level (default \code{FALSE}). 
+\item \code{allow_bucket_deletion}: logical, if TRUE, the filesystem will delete +buckets if \verb{$DeleteDir()} is called on the bucket level (default \code{FALSE}). } } @@ -87,9 +88,10 @@ sequential writing. appending. } -\code{S3FileSystem} also has a method: +\code{S3FileSystem} also has methods: \itemize{ -\item \verb{$allow_create_buckets()} which can set the corresponding option above. +\item \verb{$allow_bucket_creation()} which can set the corresponding option above. +\item \verb{$allow_bucket_deletion()} which can set the corresponding option above. } } @@ -109,7 +111,7 @@ root in this \code{SubTreeFileSystem}. On S3FileSystem, \verb{$CreateDir()} on a top-level directory creates a new bucket. -When S3FileSystem creates new buckets (assuming allow_create_buckets is TRUE), +When S3FileSystem creates new buckets (assuming allow_bucket_creation is TRUE), it does not pass any non-default settings. In AWS S3, the bucket and all objects will be not publicly visible, and will have no bucket policies and no resource tags. To have more control over how buckets are created, diff --git a/r/src/arrowExports.cpp b/r/src/arrowExports.cpp index a016cda4a8f..5631185ee52 100644 --- a/r/src/arrowExports.cpp +++ b/r/src/arrowExports.cpp @@ -3180,8 +3180,8 @@ END_CPP11 } // filesystem.cpp #if defined(ARROW_R_WITH_S3) -std::shared_ptr fs___S3FileSystem__create(bool anonymous, std::string access_key, std::string secret_key, std::string session_token, std::string role_arn, std::string session_name, std::string external_id, int load_frequency, std::string region, std::string endpoint_override, std::string scheme, std::string proxy_options, bool background_writes, bool allow_create_buckets); -extern "C" SEXP _arrow_fs___S3FileSystem__create(SEXP anonymous_sexp, SEXP access_key_sexp, SEXP secret_key_sexp, SEXP session_token_sexp, SEXP role_arn_sexp, SEXP session_name_sexp, SEXP external_id_sexp, SEXP load_frequency_sexp, SEXP region_sexp, SEXP endpoint_override_sexp, SEXP scheme_sexp, SEXP proxy_options_sexp, SEXP background_writes_sexp, SEXP allow_create_buckets_sexp){ +std::shared_ptr fs___S3FileSystem__create(bool anonymous, std::string access_key, std::string secret_key, std::string session_token, std::string role_arn, std::string session_name, std::string external_id, int load_frequency, std::string region, std::string endpoint_override, std::string scheme, std::string proxy_options, bool background_writes, bool allow_bucket_creation, bool allow_bucket_deletion); +extern "C" SEXP _arrow_fs___S3FileSystem__create(SEXP anonymous_sexp, SEXP access_key_sexp, SEXP secret_key_sexp, SEXP session_token_sexp, SEXP role_arn_sexp, SEXP session_name_sexp, SEXP external_id_sexp, SEXP load_frequency_sexp, SEXP region_sexp, SEXP endpoint_override_sexp, SEXP scheme_sexp, SEXP proxy_options_sexp, SEXP background_writes_sexp, SEXP allow_bucket_creation_sexp, SEXP allow_bucket_deletion_sexp){ BEGIN_CPP11 arrow::r::Input::type anonymous(anonymous_sexp); arrow::r::Input::type access_key(access_key_sexp); @@ -3196,12 +3196,13 @@ BEGIN_CPP11 arrow::r::Input::type scheme(scheme_sexp); arrow::r::Input::type proxy_options(proxy_options_sexp); arrow::r::Input::type background_writes(background_writes_sexp); - arrow::r::Input::type allow_create_buckets(allow_create_buckets_sexp); - return cpp11::as_sexp(fs___S3FileSystem__create(anonymous, access_key, secret_key, session_token, role_arn, session_name, external_id, load_frequency, region, endpoint_override, scheme, proxy_options, background_writes, allow_create_buckets)); + 
arrow::r::Input::type allow_bucket_creation(allow_bucket_creation_sexp); + arrow::r::Input::type allow_bucket_deletion(allow_bucket_deletion_sexp); + return cpp11::as_sexp(fs___S3FileSystem__create(anonymous, access_key, secret_key, session_token, role_arn, session_name, external_id, load_frequency, region, endpoint_override, scheme, proxy_options, background_writes, allow_bucket_creation, allow_bucket_deletion)); END_CPP11 } #else -extern "C" SEXP _arrow_fs___S3FileSystem__create(SEXP anonymous_sexp, SEXP access_key_sexp, SEXP secret_key_sexp, SEXP session_token_sexp, SEXP role_arn_sexp, SEXP session_name_sexp, SEXP external_id_sexp, SEXP load_frequency_sexp, SEXP region_sexp, SEXP endpoint_override_sexp, SEXP scheme_sexp, SEXP proxy_options_sexp, SEXP background_writes_sexp, SEXP allow_create_buckets_sexp){ +extern "C" SEXP _arrow_fs___S3FileSystem__create(SEXP anonymous_sexp, SEXP access_key_sexp, SEXP secret_key_sexp, SEXP session_token_sexp, SEXP role_arn_sexp, SEXP session_name_sexp, SEXP external_id_sexp, SEXP load_frequency_sexp, SEXP region_sexp, SEXP endpoint_override_sexp, SEXP scheme_sexp, SEXP proxy_options_sexp, SEXP background_writes_sexp, SEXP allow_bucket_creation_sexp, SEXP allow_bucket_deletion_sexp){ Rf_error("Cannot call fs___S3FileSystem__create(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. "); } #endif @@ -3223,18 +3224,35 @@ extern "C" SEXP _arrow_fs___S3FileSystem__region(SEXP fs_sexp){ // filesystem.cpp #if defined(ARROW_R_WITH_S3) -void fs__S3FileSystem__allow_create_buckets(const std::shared_ptr& fs, bool allow); -extern "C" SEXP _arrow_fs__S3FileSystem__allow_create_buckets(SEXP fs_sexp, SEXP allow_sexp){ +void fs__S3FileSystem__allow_bucket_creation(const std::shared_ptr& fs, bool allow); +extern "C" SEXP _arrow_fs__S3FileSystem__allow_bucket_creation(SEXP fs_sexp, SEXP allow_sexp){ BEGIN_CPP11 arrow::r::Input&>::type fs(fs_sexp); arrow::r::Input::type allow(allow_sexp); - fs__S3FileSystem__allow_create_buckets(fs, allow); + fs__S3FileSystem__allow_bucket_creation(fs, allow); return R_NilValue; END_CPP11 } #else -extern "C" SEXP _arrow_fs__S3FileSystem__allow_create_buckets(SEXP fs_sexp, SEXP allow_sexp){ - Rf_error("Cannot call fs__S3FileSystem__allow_create_buckets(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. "); +extern "C" SEXP _arrow_fs__S3FileSystem__allow_bucket_creation(SEXP fs_sexp, SEXP allow_sexp){ + Rf_error("Cannot call fs__S3FileSystem__allow_bucket_creation(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. "); +} +#endif + +// filesystem.cpp +#if defined(ARROW_R_WITH_S3) +void fs__S3FileSystem__allow_bucket_deletion(const std::shared_ptr& fs, bool allow); +extern "C" SEXP _arrow_fs__S3FileSystem__allow_bucket_deletion(SEXP fs_sexp, SEXP allow_sexp){ +BEGIN_CPP11 + arrow::r::Input&>::type fs(fs_sexp); + arrow::r::Input::type allow(allow_sexp); + fs__S3FileSystem__allow_bucket_deletion(fs, allow); + return R_NilValue; +END_CPP11 +} +#else +extern "C" SEXP _arrow_fs__S3FileSystem__allow_bucket_deletion(SEXP fs_sexp, SEXP allow_sexp){ + Rf_error("Cannot call fs__S3FileSystem__allow_bucket_deletion(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. 
"); } #endif @@ -5450,9 +5468,10 @@ static const R_CallMethodDef CallEntries[] = { { "_arrow_fs___SubTreeFileSystem__base_path", (DL_FUNC) &_arrow_fs___SubTreeFileSystem__base_path, 1}, { "_arrow_fs___FileSystemFromUri", (DL_FUNC) &_arrow_fs___FileSystemFromUri, 1}, { "_arrow_fs___CopyFiles", (DL_FUNC) &_arrow_fs___CopyFiles, 6}, - { "_arrow_fs___S3FileSystem__create", (DL_FUNC) &_arrow_fs___S3FileSystem__create, 14}, + { "_arrow_fs___S3FileSystem__create", (DL_FUNC) &_arrow_fs___S3FileSystem__create, 15}, { "_arrow_fs___S3FileSystem__region", (DL_FUNC) &_arrow_fs___S3FileSystem__region, 1}, - { "_arrow_fs__S3FileSystem__allow_create_buckets", (DL_FUNC) &_arrow_fs__S3FileSystem__allow_create_buckets, 2}, + { "_arrow_fs__S3FileSystem__allow_bucket_creation", (DL_FUNC) &_arrow_fs__S3FileSystem__allow_bucket_creation, 2}, + { "_arrow_fs__S3FileSystem__allow_bucket_deletion", (DL_FUNC) &_arrow_fs__S3FileSystem__allow_bucket_deletion, 2}, { "_arrow_io___Readable__Read", (DL_FUNC) &_arrow_io___Readable__Read, 2}, { "_arrow_io___InputStream__Close", (DL_FUNC) &_arrow_io___InputStream__Close, 1}, { "_arrow_io___OutputStream__Close", (DL_FUNC) &_arrow_io___OutputStream__Close, 1}, diff --git a/r/src/filesystem.cpp b/r/src/filesystem.cpp index 687f4c8b883..2e58b188b5e 100644 --- a/r/src/filesystem.cpp +++ b/r/src/filesystem.cpp @@ -283,7 +283,7 @@ std::shared_ptr fs___S3FileSystem__create( std::string session_name = "", std::string external_id = "", int load_frequency = 900, std::string region = "", std::string endpoint_override = "", std::string scheme = "", std::string proxy_options = "", bool background_writes = true, - bool allow_create_buckets = false) { + bool allow_bucket_creation = false, bool allow_bucket_deletion = false) { // We need to ensure that S3 is initialized before we start messing with the // options StopIfNotOk(fs::EnsureS3Initialized()); @@ -322,7 +322,8 @@ std::shared_ptr fs___S3FileSystem__create( /// default true s3_opts.background_writes = background_writes; - s3_opts.allow_create_buckets = allow_create_buckets; + s3_opts.allow_bucket_creation = allow_bucket_creation; + s3_opts.allow_bucket_deletion = allow_bucket_deletion; auto io_context = arrow::io::IOContext(gc_memory_pool()); return ValueOrStop(fs::S3FileSystem::Make(s3_opts, io_context)); @@ -334,9 +335,15 @@ std::string fs___S3FileSystem__region(const std::shared_ptr& f } // [[s3::export]] -void fs__S3FileSystem__allow_create_buckets(const std::shared_ptr& fs, - bool allow) { - fs->allow_create_buckets(allow); +void fs__S3FileSystem__allow_bucket_creation(const std::shared_ptr& fs, + bool allow) { + fs->allow_bucket_creation(allow); +} + +// [[s3::export]] +void fs__S3FileSystem__allow_bucket_deletion(const std::shared_ptr& fs, + bool allow) { + fs->allow_bucket_deletion(allow); } #endif diff --git a/r/tests/testthat/test-s3-minio.R b/r/tests/testthat/test-s3-minio.R index 62bac24309c..d0f7b323cfc 100644 --- a/r/tests/testthat/test-s3-minio.R +++ b/r/tests/testthat/test-s3-minio.R @@ -35,7 +35,8 @@ if (arrow_with_s3() && process_is_running("minio server")) { secret_key = minio_secret, scheme = "http", endpoint_override = paste0("localhost:", minio_port), - allow_create_buckets = TRUE + allow_bucket_creation = TRUE, + allow_bucket_deletion = TRUE ) now <- as.character(as.numeric(Sys.time())) fs$CreateDir(now) @@ -181,11 +182,12 @@ if (arrow_with_s3() && process_is_running("minio server")) { expect_length(fs$ls(minio_path("new_dataset_dir")), 1) }) - test_that("CreateDir fails on bucket if allow_create_buckets=False", 
{ + test_that("CreateDir fails on bucket if allow_bucket_creation=False", { now_tmp <- paste0(now, "-test-fail-delete") fs$CreateDir(now_tmp) - fs$allow_create_buckets(FALSE) + fs$allow_bucket_creation(FALSE) + fs$allow_bucket_deletion(FALSE) expect_error( fs$CreateDir("should-fail"), regexp = "Bucket does not exist" @@ -194,7 +196,8 @@ if (arrow_with_s3() && process_is_running("minio server")) { fs$DeleteDir(now_tmp), regexp = "Would delete bucket" ) - fs$allow_create_buckets(TRUE) + fs$allow_bucket_creation(TRUE) + fs$allow_bucket_deletion(TRUE) }) test_that("Let's test copy_files too", { From 31abe7cf860ce9e7a87799d5adf5f009b06ba3ca Mon Sep 17 00:00:00 2001 From: Will Jones Date: Wed, 1 Jun 2022 15:12:23 -0700 Subject: [PATCH 13/18] feat: Make boolean parsing stricter --- cpp/src/arrow/filesystem/s3fs.cc | 8 ++++---- cpp/src/arrow/util/string.cc | 10 ++++++++++ cpp/src/arrow/util/string.h | 8 ++++++++ python/pyarrow/tests/test_fs.py | 2 +- 4 files changed, 23 insertions(+), 5 deletions(-) diff --git a/cpp/src/arrow/filesystem/s3fs.cc b/cpp/src/arrow/filesystem/s3fs.cc index 43eca71c2d5..ee9131d85f3 100644 --- a/cpp/src/arrow/filesystem/s3fs.cc +++ b/cpp/src/arrow/filesystem/s3fs.cc @@ -355,11 +355,11 @@ Result S3Options::FromUri(const Uri& uri, std::string* out_path) { } else if (kv.first == "endpoint_override") { options.endpoint_override = kv.second; } else if (kv.first == "allow_bucket_creation") { - options.allow_bucket_creation = - ::arrow::internal::AsciiEqualsCaseInsensitive(kv.second, "true"); + ARROW_ASSIGN_OR_RAISE(options.allow_bucket_creation, + ::arrow::internal::ParseBoolean(kv.second)); } else if (kv.first == "allow_bucket_deletion") { - options.allow_bucket_deletion = - ::arrow::internal::AsciiEqualsCaseInsensitive(kv.second, "true"); + ARROW_ASSIGN_OR_RAISE(options.allow_bucket_deletion, + ::arrow::internal::ParseBoolean(kv.second)); } else { return Status::Invalid("Unexpected query parameter in S3 URI: '", kv.first, "'"); } diff --git a/cpp/src/arrow/util/string.cc b/cpp/src/arrow/util/string.cc index d922311df1c..3a158600552 100644 --- a/cpp/src/arrow/util/string.cc +++ b/cpp/src/arrow/util/string.cc @@ -187,5 +187,15 @@ util::optional Replace(util::string_view s, util::string_view token s.substr(token_start + token.size()).to_string(); } +Result ParseBoolean(util::string_view value) { + if (AsciiEqualsCaseInsensitive(value, "true") || value == "1") { + return true; + } else if (AsciiEqualsCaseInsensitive(value, "false") || value == "0") { + return false; + } else { + return Status::Invalid("String is not a valid boolean value: '", value, "'."); + } +} + } // namespace internal } // namespace arrow diff --git a/cpp/src/arrow/util/string.h b/cpp/src/arrow/util/string.h index 68b8a54e313..d2c8ac38eec 100644 --- a/cpp/src/arrow/util/string.h +++ b/cpp/src/arrow/util/string.h @@ -20,6 +20,7 @@ #include #include +#include "arrow/result.h" #include "arrow/util/optional.h" #include "arrow/util/string_view.h" #include "arrow/util/visibility.h" @@ -75,5 +76,12 @@ ARROW_EXPORT util::optional Replace(util::string_view s, util::string_view token, util::string_view replacement); +/// \brief Get boolean value from string +/// +/// If "1", "true" (case-insensitive), returns true +/// If "0", "false" (case-insensitive), returns false +/// Otherwise, returns Status::Invalid +ARROW_EXPORT +arrow::Result ParseBoolean(util::string_view value); } // namespace internal } // namespace arrow diff --git a/python/pyarrow/tests/test_fs.py b/python/pyarrow/tests/test_fs.py index 
e486ab4e87d..56248cfe599 100644 --- a/python/pyarrow/tests/test_fs.py +++ b/python/pyarrow/tests/test_fs.py @@ -447,7 +447,7 @@ def test_s3fs_limited_permissions_create_bucket(s3_server): with pytest.raises(pa.ArrowIOError): fs.create_dir('new-bucket') - + with pytest.raises(pa.ArrowIOError): fs.delete_dir('existing-bucket') From 02a1db0a853fa7a24edb6760cc5f8ecc41279323 Mon Sep 17 00:00:00 2001 From: Will Jones Date: Thu, 2 Jun 2022 10:44:24 -0700 Subject: [PATCH 14/18] feat: Just allow configuration before construction --- cpp/src/arrow/filesystem/s3fs.cc | 14 ---------- cpp/src/arrow/filesystem/s3fs.h | 6 ----- python/pyarrow/_s3fs.pyx | 30 +++------------------ python/pyarrow/includes/libarrow_fs.pxd | 2 -- python/pyarrow/tests/test_dataset.py | 16 +++++++---- python/pyarrow/tests/test_fs.py | 6 ++--- r/R/arrowExports.R | 8 ------ r/R/filesystem.R | 9 ------- r/man/FileSystem.Rd | 6 ----- r/src/arrowExports.cpp | 36 ------------------------- r/src/filesystem.cpp | 12 --------- r/tests/testthat/test-s3-minio.R | 16 ++++++----- 12 files changed, 28 insertions(+), 133 deletions(-) diff --git a/cpp/src/arrow/filesystem/s3fs.cc b/cpp/src/arrow/filesystem/s3fs.cc index ee9131d85f3..3a1563ee5d6 100644 --- a/cpp/src/arrow/filesystem/s3fs.cc +++ b/cpp/src/arrow/filesystem/s3fs.cc @@ -770,9 +770,6 @@ class ClientBuilder { const S3Options& options() const { return options_; } - void allow_bucket_creation(bool allow) { options_.allow_bucket_creation = allow; } - void allow_bucket_deletion(bool allow) { options_.allow_bucket_deletion = allow; } - protected: S3Options options_; Aws::Client::ClientConfiguration client_config_; @@ -1661,9 +1658,6 @@ class S3FileSystem::Impl : public std::enable_shared_from_this void SaveBackend(const Aws::Client::AWSError& error) { if (!backend_ || *backend_ == S3Backend::Other) { @@ -2223,14 +2217,6 @@ S3Options S3FileSystem::options() const { return impl_->options(); } std::string S3FileSystem::region() const { return impl_->region(); } -void S3FileSystem::allow_bucket_creation(bool allow) { - impl_->allow_bucket_creation(allow); -} - -void S3FileSystem::allow_bucket_deletion(bool allow) { - impl_->allow_bucket_deletion(allow); -} - Result S3FileSystem::GetFileInfo(const std::string& s) { ARROW_ASSIGN_OR_RAISE(auto path, S3Path::FromString(s)); FileInfo info; diff --git a/cpp/src/arrow/filesystem/s3fs.h b/cpp/src/arrow/filesystem/s3fs.h index ffca2c04a50..6015b6dc82b 100644 --- a/cpp/src/arrow/filesystem/s3fs.h +++ b/cpp/src/arrow/filesystem/s3fs.h @@ -229,12 +229,6 @@ class ARROW_EXPORT S3FileSystem : public FileSystem { /// Return the actual region this filesystem connects to std::string region() const; - /// Set create_buckets property of options - void allow_bucket_creation(bool allow); - - /// Set delete_buckets property of options - void allow_bucket_deletion(bool allow); - bool Equals(const FileSystem& other) const override; /// \cond FALSE diff --git a/python/pyarrow/_s3fs.pyx b/python/pyarrow/_s3fs.pyx index d16610bc171..8a72b706830 100644 --- a/python/pyarrow/_s3fs.pyx +++ b/python/pyarrow/_s3fs.pyx @@ -157,11 +157,11 @@ cdef class S3FileSystem(FileSystem): 'port': 8020, 'username': 'username', 'password': 'password'}) allow_bucket_creation : bool, default False - If True, allows creating buckets. This option may also be passed in a - URI query parameter. + Whether to allow CreateDir at the bucket-level. This option may also be + passed in a URI query parameter. allow_bucket_deletion : bool, default False - If True, allows deleting buckets. 
This option may also be passed in a - URI query parameter. + Whether to allow DeleteDir at the bucket-level. This option may also be + passed in a URI query parameter. """ cdef: @@ -328,25 +328,3 @@ cdef class S3FileSystem(FileSystem): The AWS region this filesystem connects to. """ return frombytes(self.s3fs.region()) - - @property - def allow_bucket_creation(self): - """ - Whether to allow CreateDir at the bucket-level. - """ - return self.s3fs.options().allow_bucket_creation - - @allow_bucket_creation.setter - def allow_bucket_creation(self, value): - self.s3fs.allow_bucket_creation(value) - - @property - def allow_bucket_deletion(self): - """ - Whether to allow DeleteDir at the bucket-level. - """ - return self.s3fs.options().allow_bucket_deletion - - @allow_bucket_deletion.setter - def allow_bucket_deletion(self, value): - self.s3fs.allow_bucket_deletion(value) diff --git a/python/pyarrow/includes/libarrow_fs.pxd b/python/pyarrow/includes/libarrow_fs.pxd index 313f3af06db..1498e413094 100644 --- a/python/pyarrow/includes/libarrow_fs.pxd +++ b/python/pyarrow/includes/libarrow_fs.pxd @@ -195,8 +195,6 @@ cdef extern from "arrow/filesystem/api.h" namespace "arrow::fs" nogil: CResult[shared_ptr[CS3FileSystem]] Make(const CS3Options& options) CS3Options options() c_string region() - void allow_bucket_creation(c_bool allow) - void allow_bucket_deletion(c_bool allow) cdef CStatus CInitializeS3 "arrow::fs::InitializeS3"( const CS3GlobalOptions& options) diff --git a/python/pyarrow/tests/test_dataset.py b/python/pyarrow/tests/test_dataset.py index 3299ca2c32e..07cf63d1220 100644 --- a/python/pyarrow/tests/test_dataset.py +++ b/python/pyarrow/tests/test_dataset.py @@ -2558,14 +2558,14 @@ def test_open_dataset_from_s3_with_filesystem_uri(s3_server): host, port, access_key, secret_key = s3_server['connection'] bucket = 'theirbucket' path = 'nested/folder/data.parquet' - uri = "s3://{}:{}@{}/{}?scheme=http&endpoint_override={}:{}".format( - access_key, secret_key, bucket, path, host, port - ) + uri = "s3://{}:{}@{}/{}?scheme=http&endpoint_override={}:{}"\ + "&allow_bucket_creation=true".format( + access_key, secret_key, bucket, path, host, port + ) fs, path = FileSystem.from_uri(uri) assert path == 'theirbucket/nested/folder/data.parquet' - fs.allow_bucket_creation = True fs.create_dir(bucket) table = pa.table({'a': [1, 2, 3]}) @@ -4552,7 +4552,13 @@ def test_write_dataset_s3_put_only(s3_server): ) # Error enforced by minio / S3 service - fs.allow_bucket_creation = True + fs = S3FileSystem( + access_key='limited', + secret_key='limited123', + endpoint_override='{}:{}'.format(host, port), + scheme='http', + allow_bucket_creation=True, + ) with pytest.raises(OSError, match="Access Denied"): ds.write_dataset( table, "non-existing-bucket", filesystem=fs, diff --git a/python/pyarrow/tests/test_fs.py b/python/pyarrow/tests/test_fs.py index 56248cfe599..ea4b3fc79bb 100644 --- a/python/pyarrow/tests/test_fs.py +++ b/python/pyarrow/tests/test_fs.py @@ -1316,14 +1316,14 @@ def test_filesystem_from_uri_s3(s3_server): host, port, access_key, secret_key = s3_server['connection'] - uri = "s3://{}:{}@mybucket/foo/bar?scheme=http&endpoint_override={}:{}" \ - .format(access_key, secret_key, host, port) + uri = "s3://{}:{}@mybucket/foo/bar?scheme=http&endpoint_override={}:{}"\ + "&allow_bucket_creation=True" \ + .format(access_key, secret_key, host, port) fs, path = FileSystem.from_uri(uri) assert isinstance(fs, S3FileSystem) assert path == "mybucket/foo/bar" - fs.allow_bucket_creation = True fs.create_dir(path) 
[info] = fs.get_file_info([path]) assert info.path == path diff --git a/r/R/arrowExports.R b/r/R/arrowExports.R index 179205e60b6..4c579840e49 100644 --- a/r/R/arrowExports.R +++ b/r/R/arrowExports.R @@ -1288,14 +1288,6 @@ fs___S3FileSystem__region <- function(fs) { .Call(`_arrow_fs___S3FileSystem__region`, fs) } -fs__S3FileSystem__allow_bucket_creation <- function(fs, allow) { - invisible(.Call(`_arrow_fs__S3FileSystem__allow_bucket_creation`, fs, allow)) -} - -fs__S3FileSystem__allow_bucket_deletion <- function(fs, allow) { - invisible(.Call(`_arrow_fs__S3FileSystem__allow_bucket_deletion`, fs, allow)) -} - io___Readable__Read <- function(x, nbytes) { .Call(`_arrow_io___Readable__Read`, x, nbytes) } diff --git a/r/R/filesystem.R b/r/R/filesystem.R index 0c7a19c056b..b035430ff65 100644 --- a/r/R/filesystem.R +++ b/r/R/filesystem.R @@ -184,11 +184,6 @@ FileSelector$create <- function(base_dir, allow_not_found = FALSE, recursive = F #' - `$OpenAppendStream(path)`: Open an [output stream][OutputStream] for #' appending. #' -#' `S3FileSystem` also has methods: -#' -#' - `$allow_bucket_creation()` which can set the corresponding option above. -#' - `$allow_bucket_deletion()` which can set the corresponding option above. -#' #' @section Active bindings: #' #' - `$type_name`: string filesystem type name, such as "local", "s3", etc. @@ -348,10 +343,6 @@ S3FileSystem <- R6Class("S3FileSystem", inherit = FileSystem, active = list( region = function() fs___S3FileSystem__region(self) - ), - public = list( - allow_bucket_creation = function(allow) fs__S3FileSystem__allow_bucket_creation(self, allow), - allow_bucket_deletion = function(allow) fs__S3FileSystem__allow_bucket_deletion(self, allow) ) ) S3FileSystem$create <- function(anonymous = FALSE, ...) { diff --git a/r/man/FileSystem.Rd b/r/man/FileSystem.Rd index 844646b6f8e..1ed01644650 100644 --- a/r/man/FileSystem.Rd +++ b/r/man/FileSystem.Rd @@ -87,12 +87,6 @@ sequential writing. \item \verb{$OpenAppendStream(path)}: Open an \link[=OutputStream]{output stream} for appending. } - -\code{S3FileSystem} also has methods: -\itemize{ -\item \verb{$allow_bucket_creation()} which can set the corresponding option above. -\item \verb{$allow_bucket_deletion()} which can set the corresponding option above. -} } \section{Active bindings}{ diff --git a/r/src/arrowExports.cpp b/r/src/arrowExports.cpp index 5631185ee52..887327d48f9 100644 --- a/r/src/arrowExports.cpp +++ b/r/src/arrowExports.cpp @@ -3222,40 +3222,6 @@ extern "C" SEXP _arrow_fs___S3FileSystem__region(SEXP fs_sexp){ } #endif -// filesystem.cpp -#if defined(ARROW_R_WITH_S3) -void fs__S3FileSystem__allow_bucket_creation(const std::shared_ptr& fs, bool allow); -extern "C" SEXP _arrow_fs__S3FileSystem__allow_bucket_creation(SEXP fs_sexp, SEXP allow_sexp){ -BEGIN_CPP11 - arrow::r::Input&>::type fs(fs_sexp); - arrow::r::Input::type allow(allow_sexp); - fs__S3FileSystem__allow_bucket_creation(fs, allow); - return R_NilValue; -END_CPP11 -} -#else -extern "C" SEXP _arrow_fs__S3FileSystem__allow_bucket_creation(SEXP fs_sexp, SEXP allow_sexp){ - Rf_error("Cannot call fs__S3FileSystem__allow_bucket_creation(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. 
"); -} -#endif - -// filesystem.cpp -#if defined(ARROW_R_WITH_S3) -void fs__S3FileSystem__allow_bucket_deletion(const std::shared_ptr& fs, bool allow); -extern "C" SEXP _arrow_fs__S3FileSystem__allow_bucket_deletion(SEXP fs_sexp, SEXP allow_sexp){ -BEGIN_CPP11 - arrow::r::Input&>::type fs(fs_sexp); - arrow::r::Input::type allow(allow_sexp); - fs__S3FileSystem__allow_bucket_deletion(fs, allow); - return R_NilValue; -END_CPP11 -} -#else -extern "C" SEXP _arrow_fs__S3FileSystem__allow_bucket_deletion(SEXP fs_sexp, SEXP allow_sexp){ - Rf_error("Cannot call fs__S3FileSystem__allow_bucket_deletion(). See https://arrow.apache.org/docs/r/articles/install.html for help installing Arrow C++ libraries. "); -} -#endif - // io.cpp std::shared_ptr io___Readable__Read(const std::shared_ptr& x, int64_t nbytes); extern "C" SEXP _arrow_io___Readable__Read(SEXP x_sexp, SEXP nbytes_sexp){ @@ -5470,8 +5436,6 @@ static const R_CallMethodDef CallEntries[] = { { "_arrow_fs___CopyFiles", (DL_FUNC) &_arrow_fs___CopyFiles, 6}, { "_arrow_fs___S3FileSystem__create", (DL_FUNC) &_arrow_fs___S3FileSystem__create, 15}, { "_arrow_fs___S3FileSystem__region", (DL_FUNC) &_arrow_fs___S3FileSystem__region, 1}, - { "_arrow_fs__S3FileSystem__allow_bucket_creation", (DL_FUNC) &_arrow_fs__S3FileSystem__allow_bucket_creation, 2}, - { "_arrow_fs__S3FileSystem__allow_bucket_deletion", (DL_FUNC) &_arrow_fs__S3FileSystem__allow_bucket_deletion, 2}, { "_arrow_io___Readable__Read", (DL_FUNC) &_arrow_io___Readable__Read, 2}, { "_arrow_io___InputStream__Close", (DL_FUNC) &_arrow_io___InputStream__Close, 1}, { "_arrow_io___OutputStream__Close", (DL_FUNC) &_arrow_io___OutputStream__Close, 1}, diff --git a/r/src/filesystem.cpp b/r/src/filesystem.cpp index 2e58b188b5e..bcafef34e41 100644 --- a/r/src/filesystem.cpp +++ b/r/src/filesystem.cpp @@ -334,16 +334,4 @@ std::string fs___S3FileSystem__region(const std::shared_ptr& f return fs->region(); } -// [[s3::export]] -void fs__S3FileSystem__allow_bucket_creation(const std::shared_ptr& fs, - bool allow) { - fs->allow_bucket_creation(allow); -} - -// [[s3::export]] -void fs__S3FileSystem__allow_bucket_deletion(const std::shared_ptr& fs, - bool allow) { - fs->allow_bucket_deletion(allow); -} - #endif diff --git a/r/tests/testthat/test-s3-minio.R b/r/tests/testthat/test-s3-minio.R index d0f7b323cfc..ac5d4e9490a 100644 --- a/r/tests/testthat/test-s3-minio.R +++ b/r/tests/testthat/test-s3-minio.R @@ -38,6 +38,14 @@ if (arrow_with_s3() && process_is_running("minio server")) { allow_bucket_creation = TRUE, allow_bucket_deletion = TRUE ) + limited_fs <- S3FileSystem$create( + access_key = minio_key, + secret_key = minio_secret, + scheme = "http", + endpoint_override = paste0("localhost:", minio_port), + allow_bucket_creation = FALSE, + allow_bucket_deletion = FALSE + ) now <- as.character(as.numeric(Sys.time())) fs$CreateDir(now) # Clean up when we're all done @@ -186,18 +194,14 @@ if (arrow_with_s3() && process_is_running("minio server")) { now_tmp <- paste0(now, "-test-fail-delete") fs$CreateDir(now_tmp) - fs$allow_bucket_creation(FALSE) - fs$allow_bucket_deletion(FALSE) expect_error( - fs$CreateDir("should-fail"), + limited_fs$CreateDir("should-fail"), regexp = "Bucket does not exist" ) expect_error( - fs$DeleteDir(now_tmp), + limited_fs$DeleteDir(now_tmp), regexp = "Would delete bucket" ) - fs$allow_bucket_creation(TRUE) - fs$allow_bucket_deletion(TRUE) }) test_that("Let's test copy_files too", { From 76addc52865a01dcf3bdee397106808beeaec84c Mon Sep 17 00:00:00 2001 From: Will Jones Date: Fri, 3 Jun 
2022 10:31:30 -0700 Subject: [PATCH 15/18] fix: remove mention of deletion on creation option --- cpp/src/arrow/filesystem/s3fs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/arrow/filesystem/s3fs.h b/cpp/src/arrow/filesystem/s3fs.h index 6015b6dc82b..fa1fdb96716 100644 --- a/cpp/src/arrow/filesystem/s3fs.h +++ b/cpp/src/arrow/filesystem/s3fs.h @@ -130,7 +130,7 @@ struct ARROW_EXPORT S3Options { /// Whether OutputStream writes will be issued in the background, without blocking. bool background_writes = true; - /// Whether to allow creation or deletion of buckets + /// Whether to allow creation of buckets /// /// When S3FileSystem creates new buckets, it does not pass any non-default settings. /// In AWS S3, the bucket and all objects will be not publicly visible, and there From 367efd394e46044a87a7d9c3b861adae55fa3694 Mon Sep 17 00:00:00 2001 From: Will Jones Date: Mon, 6 Jun 2022 08:41:42 -0700 Subject: [PATCH 16/18] fix: Make sure new options can be pickled --- cpp/src/arrow/filesystem/s3fs.cc | 2 ++ python/pyarrow/_s3fs.pyx | 4 +++- python/pyarrow/tests/test_fs.py | 8 ++++++-- 3 files changed, 11 insertions(+), 3 deletions(-) diff --git a/cpp/src/arrow/filesystem/s3fs.cc b/cpp/src/arrow/filesystem/s3fs.cc index 3a1563ee5d6..9ed3389493f 100644 --- a/cpp/src/arrow/filesystem/s3fs.cc +++ b/cpp/src/arrow/filesystem/s3fs.cc @@ -383,6 +383,8 @@ Result S3Options::FromUri(const std::string& uri_string, bool S3Options::Equals(const S3Options& other) const { return (region == other.region && endpoint_override == other.endpoint_override && scheme == other.scheme && background_writes == other.background_writes && + allow_bucket_creation == other.allow_bucket_creation && + allow_bucket_deletion == other.allow_bucket_deletion && credentials_kind == other.credentials_kind && proxy_options.Equals(other.proxy_options) && GetAccessKey() == other.GetAccessKey() && diff --git a/python/pyarrow/_s3fs.pyx b/python/pyarrow/_s3fs.pyx index 8a72b706830..c71acc47840 100644 --- a/python/pyarrow/_s3fs.pyx +++ b/python/pyarrow/_s3fs.pyx @@ -311,6 +311,8 @@ cdef class S3FileSystem(FileSystem): external_id=frombytes(opts.external_id), load_frequency=opts.load_frequency, background_writes=opts.background_writes, + allow_bucket_creation=opts.allow_bucket_creation, + allow_bucket_deletion=opts.allow_bucket_deletion, default_metadata=pyarrow_wrap_metadata(opts.default_metadata), proxy_options={'scheme': frombytes(opts.proxy_options.scheme), 'host': frombytes(opts.proxy_options.host), @@ -318,7 +320,7 @@ cdef class S3FileSystem(FileSystem): 'username': frombytes( opts.proxy_options.username), 'password': frombytes( - opts.proxy_options.password)} + opts.proxy_options.password)}, ),) ) diff --git a/python/pyarrow/tests/test_fs.py b/python/pyarrow/tests/test_fs.py index ea4b3fc79bb..4bf8a1469c6 100644 --- a/python/pyarrow/tests/test_fs.py +++ b/python/pyarrow/tests/test_fs.py @@ -445,10 +445,10 @@ def test_s3fs_limited_permissions_create_bucket(s3_server): ) fs.create_dir('existing-bucket/test') - with pytest.raises(pa.ArrowIOError): + with pytest.raises(pa.ArrowIOError, match="Bucket does not exist"): fs.create_dir('new-bucket') - with pytest.raises(pa.ArrowIOError): + with pytest.raises(pa.ArrowIOError, match="Would delete bucket"): fs.delete_dir('existing-bucket') @@ -1044,6 +1044,10 @@ def test_s3_options(): assert isinstance(fs, S3FileSystem) assert pickle.loads(pickle.dumps(fs)) == fs + fs = S3FileSystem(allow_bucket_creation=True, allow_bucket_deletion=True) + assert isinstance(fs, 
S3FileSystem) + assert pickle.loads(pickle.dumps(fs)) == fs + with pytest.raises(ValueError): S3FileSystem(access_key='access') with pytest.raises(ValueError): From 4164e11730373e0a960c6b02590f60125019ed3a Mon Sep 17 00:00:00 2001 From: Will Jones Date: Mon, 6 Jun 2022 14:12:46 -0700 Subject: [PATCH 17/18] chore: push error msg test down to C++ --- cpp/src/arrow/filesystem/s3fs.cc | 4 ++-- cpp/src/arrow/filesystem/s3fs_test.cc | 12 ++++++++++-- python/pyarrow/tests/test_fs.py | 2 +- r/tests/testthat/test-s3-minio.R | 10 ++-------- 4 files changed, 15 insertions(+), 13 deletions(-) diff --git a/cpp/src/arrow/filesystem/s3fs.cc b/cpp/src/arrow/filesystem/s3fs.cc index 9ed3389493f..70b66938fe6 100644 --- a/cpp/src/arrow/filesystem/s3fs.cc +++ b/cpp/src/arrow/filesystem/s3fs.cc @@ -1702,7 +1702,7 @@ class S3FileSystem::Impl : public std::enable_shared_from_thisclient_->DeleteBucket(req)); } else if (path.key.empty()) { - return Status::IOError("Would delete bucket: '", path.bucket, "'. ", + return Status::IOError("Would delete bucket '", path.bucket, "'. ", "To delete buckets, enable the allow_bucket_deletion option."); } else { // Delete "directory" diff --git a/cpp/src/arrow/filesystem/s3fs_test.cc b/cpp/src/arrow/filesystem/s3fs_test.cc index fd628234bfd..7216af297a0 100644 --- a/cpp/src/arrow/filesystem/s3fs_test.cc +++ b/cpp/src/arrow/filesystem/s3fs_test.cc @@ -1134,8 +1134,16 @@ TEST_F(TestS3FS, NoCreateDeleteBucket) { options_.allow_bucket_creation = false; options_.allow_bucket_deletion = false; MakeFileSystem(); - ASSERT_RAISES(IOError, fs_->CreateDir("test-no-create")); - ASSERT_RAISES(IOError, fs_->DeleteDir("test-no-delete")); + + auto maybe_create_dir = fs_->CreateDir("test-no-create"); + ASSERT_RAISES(IOError, maybe_create_dir); + ASSERT_THAT(maybe_create_dir.message(), + ::testing::HasSubstr("Bucket 'test-no-create' not found")); + + auto maybe_delete_dir = fs_->DeleteDir("test-no-delete"); + ASSERT_RAISES(IOError, maybe_delete_dir); + ASSERT_THAT(maybe_delete_dir.message(), + ::testing::HasSubstr("Would delete bucket 'test-no-delete'")); } // Simple retry strategy that records errors encountered and its emitted retry delays diff --git a/python/pyarrow/tests/test_fs.py b/python/pyarrow/tests/test_fs.py index 4bf8a1469c6..eb0617d350c 100644 --- a/python/pyarrow/tests/test_fs.py +++ b/python/pyarrow/tests/test_fs.py @@ -445,7 +445,7 @@ def test_s3fs_limited_permissions_create_bucket(s3_server): ) fs.create_dir('existing-bucket/test') - with pytest.raises(pa.ArrowIOError, match="Bucket does not exist"): + with pytest.raises(pa.ArrowIOError, match="Bucket 'new-bucket' not found"): fs.create_dir('new-bucket') with pytest.raises(pa.ArrowIOError, match="Would delete bucket"): diff --git a/r/tests/testthat/test-s3-minio.R b/r/tests/testthat/test-s3-minio.R index ac5d4e9490a..ad11d04d5e9 100644 --- a/r/tests/testthat/test-s3-minio.R +++ b/r/tests/testthat/test-s3-minio.R @@ -194,14 +194,8 @@ if (arrow_with_s3() && process_is_running("minio server")) { now_tmp <- paste0(now, "-test-fail-delete") fs$CreateDir(now_tmp) - expect_error( - limited_fs$CreateDir("should-fail"), - regexp = "Bucket does not exist" - ) - expect_error( - limited_fs$DeleteDir(now_tmp), - regexp = "Would delete bucket" - ) + expect_error(limited_fs$CreateDir("should-fail")) + expect_error(limited_fs$DeleteDir(now_tmp)) }) test_that("Let's test copy_files too", { From deb99f322454c49e03b3cef35b40475dc3b25050 Mon Sep 17 00:00:00 2001 From: Will Jones Date: Mon, 6 Jun 2022 14:28:14 -0700 Subject: [PATCH 18/18] fix: 
correct error match statement --- python/pyarrow/tests/test_dataset.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python/pyarrow/tests/test_dataset.py b/python/pyarrow/tests/test_dataset.py index 07cf63d1220..8873918c012 100644 --- a/python/pyarrow/tests/test_dataset.py +++ b/python/pyarrow/tests/test_dataset.py @@ -4544,7 +4544,8 @@ def test_write_dataset_s3_put_only(s3_server): assert result.equals(table) # Error enforced by filesystem - with pytest.raises(OSError, match="Bucket does not exist"): + with pytest.raises(OSError, + match="Bucket 'non-existing-bucket' not found"): ds.write_dataset( table, "non-existing-bucket", filesystem=fs, format="feather", create_dir=True,
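Taken together, the patches above converge on a construction-time API: the two options are plain fields on S3Options, parsed strictly from URI query parameters via ParseBoolean, compared in S3Options::Equals (so filesystems pickle and round-trip correctly), and no longer mutable on a live filesystem. The Python sketch below illustrates the resulting usage pattern; it is illustrative only, assuming a pyarrow build that includes this series, and the endpoint, credentials, and bucket name are hypothetical placeholders rather than values taken from the patches.

from pyarrow.fs import FileSystem, S3FileSystem

# Options are fixed at construction time (the mutating setters were removed
# in patch 14), so permissions are chosen when the filesystem is made.
fs = S3FileSystem(
    access_key="minioadmin",             # hypothetical local MinIO credentials
    secret_key="minioadmin",
    endpoint_override="localhost:9000",  # hypothetical endpoint
    scheme="http",
    allow_bucket_creation=True,
    allow_bucket_deletion=False,
)
fs.create_dir("example-bucket")   # top-level CreateDir creates the bucket
# fs.delete_dir("example-bucket") would raise OSError ("Would delete bucket ...")

# The same options parse from a URI query string; per ParseBoolean (patch 13),
# only "1"/"true"/"0"/"false" (case-insensitive) are accepted as values.
fs2, path = FileSystem.from_uri(
    "s3://minioadmin:minioadmin@example-bucket/key"
    "?scheme=http&endpoint_override=localhost:9000"
    "&allow_bucket_deletion=true"
)
fs2.delete_dir("example-bucket")  # permitted on this instance, not on fs

Making the flags immutable after Make() means a filesystem handed to dataset-writing code cannot silently escalate its own permissions, which is the situation the limited_fs fixture in test-s3-minio.R and the limited-credentials Python tests exercise.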