From 7f60418985aa93f17eaf7762510386dde7d4c88a Mon Sep 17 00:00:00 2001 From: Micah Kornfield Date: Fri, 25 Mar 2022 23:05:02 +0000 Subject: [PATCH 01/85] parent a23c6dd5b404cfcabf70762d693720b849d9501b author Micah Kornfield 1648249502 +0000 committer Micah Kornfield 1650336086 +0000 parent a23c6dd5b404cfcabf70762d693720b849d9501b author Micah Kornfield 1648249502 +0000 committer Micah Kornfield 1650335999 +0000 wip --- .github/workflows/cpp.yml | 2 +- .github/workflows/python.yml | 2 + appveyor.yml | 1 + ci/docker/linux-apt-docs.dockerfile | 1 + ci/docker/ubuntu-22.04-cpp.dockerfile | 1 + ci/scripts/python_build.sh | 2 + ci/scripts/python_test.sh | 7 + ci/scripts/python_wheel_macos_build.sh | 3 +- ci/scripts/python_wheel_manylinux_build.sh | 3 +- ci/scripts/python_wheel_unix_test.sh | 5 + cpp/cmake_modules/ThirdpartyToolchain.cmake | 1 + cpp/src/arrow/filesystem/api.h | 7 +- cpp/src/arrow/filesystem/filesystem.cc | 3 +- cpp/src/arrow/filesystem/gcsfs.cc | 148 ++++++++++----- cpp/src/arrow/filesystem/gcsfs.h | 32 +++- cpp/src/arrow/filesystem/gcsfs_test.cc | 79 ++++++-- cpp/src/arrow/util/config.h.cmake | 1 + dev/archery/archery/cli.py | 2 + dev/archery/archery/lang/cpp.py | 6 +- dev/release/verify-release-candidate.sh | 13 +- .../conda-recipes/arrow-cpp/bld-pyarrow.bat | 1 + .../conda-recipes/arrow-cpp/build-pyarrow.sh | 1 + dev/tasks/homebrew-formulae/apache-arrow.rb | 2 +- dev/tasks/tasks.yml | 5 +- python/CMakeLists.txt | 4 + python/asv-build.sh | 2 + python/pyarrow/_fs.pyx | 3 + python/pyarrow/_gcsfs.pyx | 173 ++++++++++++++++++ python/pyarrow/conftest.py | 2 + python/pyarrow/fs.py | 5 + python/pyarrow/includes/libarrow_fs.pxd | 33 ++++ python/pyarrow/tests/conftest.py | 21 +++ python/pyarrow/tests/test_fs.py | 91 ++++++++- python/setup.py | 8 + 34 files changed, 592 insertions(+), 78 deletions(-) create mode 100644 python/pyarrow/_gcsfs.pyx diff --git a/.github/workflows/cpp.yml b/.github/workflows/cpp.yml index 82e99160c3d..075b27e629e 100644 --- 
a/.github/workflows/cpp.yml +++ b/.github/workflows/cpp.yml @@ -277,7 +277,7 @@ jobs: ARROW_GANDIVA: ON # google-could-cpp uses _dupenv_s() but it can't be used with msvcrt. # We need to use ucrt to use _dupenv_s(). - # ARROW_GCS: ON + ARROW_GCS: OFF ARROW_HDFS: OFF ARROW_HOME: /mingw${{ matrix.mingw-n-bits }} ARROW_JEMALLOC: OFF diff --git a/.github/workflows/python.yml b/.github/workflows/python.yml index b14559d12a1..b14d0db3220 100644 --- a/.github/workflows/python.yml +++ b/.github/workflows/python.yml @@ -122,6 +122,8 @@ jobs: ARROW_DATASET: ON ARROW_FLIGHT: ON ARROW_GANDIVA: ON + # TODO(ARROW-16102): Turn this on once we can build client successfully. + ARROW_GCS: OFF ARROW_HDFS: ON ARROW_JEMALLOC: ON ARROW_ORC: ON diff --git a/appveyor.yml b/appveyor.yml index 8342dbf6cb3..2699e479b79 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -57,6 +57,7 @@ environment: # (as generated by cmake) - JOB: "Toolchain" GENERATOR: Ninja + ARROW_GCS: "ON" ARROW_S3: "ON" ARROW_BUILD_FLIGHT: "ON" ARROW_BUILD_GANDIVA: "ON" diff --git a/ci/docker/linux-apt-docs.dockerfile b/ci/docker/linux-apt-docs.dockerfile index 3a8a9cf8e24..c1ee003f4f3 100644 --- a/ci/docker/linux-apt-docs.dockerfile +++ b/ci/docker/linux-apt-docs.dockerfile @@ -97,6 +97,7 @@ ENV ARROW_BUILD_STATIC=OFF \ ARROW_BUILD_TESTS=OFF \ ARROW_BUILD_UTILITIES=OFF \ ARROW_FLIGHT=ON \ + ARROW_GCS=ON \ ARROW_GLIB_VALA=false \ ARROW_PYTHON=ON \ ARROW_S3=ON \ diff --git a/ci/docker/ubuntu-22.04-cpp.dockerfile b/ci/docker/ubuntu-22.04-cpp.dockerfile index a7cc5ff38ad..2e9c96e0be7 100644 --- a/ci/docker/ubuntu-22.04-cpp.dockerfile +++ b/ci/docker/ubuntu-22.04-cpp.dockerfile @@ -156,6 +156,7 @@ ENV ARROW_BUILD_TESTS=ON \ ARROW_FLIGHT=ON \ ARROW_FLIGHT_SQL=ON \ ARROW_GANDIVA=ON \ + ARROW_GCS=ON \ ARROW_HDFS=ON \ ARROW_HOME=/usr/local \ ARROW_INSTALL_NAME_RPATH=OFF \ diff --git a/ci/scripts/python_build.sh b/ci/scripts/python_build.sh index b90321643c7..b715374b925 100755 --- a/ci/scripts/python_build.sh +++ 
b/ci/scripts/python_build.sh @@ -58,11 +58,13 @@ export PYARROW_WITH_CUDA=${ARROW_CUDA:-OFF} export PYARROW_WITH_DATASET=${ARROW_DATASET:-ON} export PYARROW_WITH_FLIGHT=${ARROW_FLIGHT:-OFF} export PYARROW_WITH_GANDIVA=${ARROW_GANDIVA:-OFF} +export PYARROW_WITH_GCS=${ARROW_GCS:-OFF} export PYARROW_WITH_HDFS=${ARROW_HDFS:-ON} export PYARROW_WITH_ORC=${ARROW_ORC:-OFF} export PYARROW_WITH_PLASMA=${ARROW_PLASMA:-OFF} export PYARROW_WITH_PARQUET=${ARROW_PARQUET:-OFF} export PYARROW_WITH_PARQUET_ENCRYPTION=${PARQUET_REQUIRE_ENCRYPTION:-ON} +export PYARROW_WITH_GCS=${ARROW_GCS:-OFF} export PYARROW_WITH_S3=${ARROW_S3:-OFF} export PYARROW_WITH_SUBSTRAIT=${ARROW_SUBSTRAIT:-OFF} diff --git a/ci/scripts/python_test.sh b/ci/scripts/python_test.sh index e1d06c18727..4d8f1b73c3f 100755 --- a/ci/scripts/python_test.sh +++ b/ci/scripts/python_test.sh @@ -38,6 +38,7 @@ export ARROW_DEBUG_MEMORY_POOL=trap : ${PYARROW_TEST_DATASET:=${ARROW_DATASET:-ON}} : ${PYARROW_TEST_FLIGHT:=${ARROW_FLIGHT:-ON}} : ${PYARROW_TEST_GANDIVA:=${ARROW_GANDIVA:-ON}} +: ${PYARROW_TEST_GCS:=${ARROW_GCS:-ON}} : ${PYARROW_TEST_HDFS:=${ARROW_HDFS:-ON}} : ${PYARROW_TEST_ORC:=${ARROW_ORC:-ON}} : ${PYARROW_TEST_PARQUET:=${ARROW_PARQUET:-ON}} @@ -47,9 +48,15 @@ export PYARROW_TEST_CUDA export PYARROW_TEST_DATASET export PYARROW_TEST_FLIGHT export PYARROW_TEST_GANDIVA +export PYARROW_TEST_GCS export PYARROW_TEST_HDFS export PYARROW_TEST_ORC export PYARROW_TEST_PARQUET export PYARROW_TEST_S3 +# DO NOT SUBMIT +# Without this, the previous install of testbench doesn't seem to be put in a +# place that the python env can find it. Need githash because of recent +# dependency changes. 
+python -m pip install "https://github.com/googleapis/storage-testbench/archive/c80ed7604824428a7b8269504ec948a62b445f5d.tar.gz" pytest -r s -v ${PYTEST_ARGS} --pyargs pyarrow diff --git a/ci/scripts/python_wheel_macos_build.sh b/ci/scripts/python_wheel_macos_build.sh index b3ae912dff6..7fa43a3eaa2 100755 --- a/ci/scripts/python_wheel_macos_build.sh +++ b/ci/scripts/python_wheel_macos_build.sh @@ -64,7 +64,7 @@ echo "=== (${PYTHON_VERSION}) Building Arrow C++ libraries ===" : ${ARROW_DATASET:=ON} : ${ARROW_FLIGHT:=ON} : ${ARROW_GANDIVA:=OFF} -: ${ARROW_GCS:=OFF} +: ${ARROW_GCS:=ON} : ${ARROW_HDFS:=ON} : ${ARROW_JEMALLOC:=ON} : ${ARROW_MIMALLOC:=ON} @@ -148,6 +148,7 @@ export PYARROW_INSTALL_TESTS=1 export PYARROW_WITH_DATASET=${ARROW_DATASET} export PYARROW_WITH_FLIGHT=${ARROW_FLIGHT} export PYARROW_WITH_GANDIVA=${ARROW_GANDIVA} +export PYARROW_WITH_GCS=${ARROW_GCS} export PYARROW_WITH_HDFS=${ARROW_HDFS} export PYARROW_WITH_ORC=${ARROW_ORC} export PYARROW_WITH_PARQUET=${ARROW_PARQUET} diff --git a/ci/scripts/python_wheel_manylinux_build.sh b/ci/scripts/python_wheel_manylinux_build.sh index d242fe657c5..6cfd34d851f 100755 --- a/ci/scripts/python_wheel_manylinux_build.sh +++ b/ci/scripts/python_wheel_manylinux_build.sh @@ -51,7 +51,7 @@ echo "=== (${PYTHON_VERSION}) Building Arrow C++ libraries ===" : ${ARROW_DATASET:=ON} : ${ARROW_FLIGHT:=ON} : ${ARROW_GANDIVA:=OFF} -: ${ARROW_GCS:=OFF} +: ${ARROW_GCS:=ON} : ${ARROW_HDFS:=ON} : ${ARROW_JEMALLOC:=ON} : ${ARROW_MIMALLOC:=ON} @@ -144,6 +144,7 @@ export PYARROW_INSTALL_TESTS=1 export PYARROW_WITH_DATASET=${ARROW_DATASET} export PYARROW_WITH_FLIGHT=${ARROW_FLIGHT} export PYARROW_WITH_GANDIVA=${ARROW_GANDIVA} +export PYARROW_WITH_GCS=${ARROW_GCS} export PYARROW_WITH_HDFS=${ARROW_HDFS} export PYARROW_WITH_ORC=${ARROW_ORC} export PYARROW_WITH_PARQUET=${ARROW_PARQUET} diff --git a/ci/scripts/python_wheel_unix_test.sh b/ci/scripts/python_wheel_unix_test.sh index 99436e0c1fa..889686ad201 100755 --- 
a/ci/scripts/python_wheel_unix_test.sh +++ b/ci/scripts/python_wheel_unix_test.sh @@ -31,6 +31,7 @@ source_dir=${1} : ${ARROW_FLIGHT:=ON} : ${ARROW_SUBSTRAIT:=ON} : ${ARROW_S3:=ON} +: ${ARROW_GCS:=ON} : ${CHECK_IMPORTS:=ON} : ${CHECK_UNITTESTS:=ON} : ${INSTALL_PYARROW:=ON} @@ -39,6 +40,7 @@ export PYARROW_TEST_CYTHON=OFF export PYARROW_TEST_DATASET=ON export PYARROW_TEST_FLIGHT=${ARROW_FLIGHT} export PYARROW_TEST_GANDIVA=OFF +export PYARROW_TEST_GCS=${ARROW_GCS} export PYARROW_TEST_HDFS=ON export PYARROW_TEST_ORC=ON export PYARROW_TEST_PANDAS=ON @@ -69,6 +71,9 @@ import pyarrow.orc import pyarrow.parquet import pyarrow.plasma " + if [ "${PYARROW_TEST_GCS}" == "ON" ]; then + python -c "import pyarrow._gcsfs" + fi if [ "${PYARROW_TEST_S3}" == "ON" ]; then python -c "import pyarrow._s3fs" fi diff --git a/cpp/cmake_modules/ThirdpartyToolchain.cmake b/cpp/cmake_modules/ThirdpartyToolchain.cmake index 4082b7d2ce6..08c056532e2 100644 --- a/cpp/cmake_modules/ThirdpartyToolchain.cmake +++ b/cpp/cmake_modules/ThirdpartyToolchain.cmake @@ -3827,6 +3827,7 @@ if(ARROW_WITH_GRPC) message(STATUS "Forcing gRPC_SOURCE to Protobuf_SOURCE (${Protobuf_SOURCE})") set(gRPC_SOURCE "${Protobuf_SOURCE}") endif() + build_absl_once() resolve_dependency(gRPC HAVE_ALT TRUE diff --git a/cpp/src/arrow/filesystem/api.h b/cpp/src/arrow/filesystem/api.h index 5b0c97d150a..732be5f928f 100644 --- a/cpp/src/arrow/filesystem/api.h +++ b/cpp/src/arrow/filesystem/api.h @@ -21,8 +21,11 @@ #include "arrow/filesystem/filesystem.h" // IWYU pragma: export #include "arrow/filesystem/hdfs.h" // IWYU pragma: export -#include "arrow/filesystem/localfs.h" // IWYU pragma: export -#include "arrow/filesystem/mockfs.h" // IWYU pragma: export +#ifdef ARROW_GCS +#include "arrow/filesystem/gcsfs.h" // IWYU pragma: export +#endif +#include "arrow/filesystem/localfs.h" // IWYU pragma: export +#include "arrow/filesystem/mockfs.h" // IWYU pragma: export #ifdef ARROW_S3 #include "arrow/filesystem/s3fs.h" // IWYU pragma: 
export #endif diff --git a/cpp/src/arrow/filesystem/filesystem.cc b/cpp/src/arrow/filesystem/filesystem.cc index 18b39125f50..48b4646bea0 100644 --- a/cpp/src/arrow/filesystem/filesystem.cc +++ b/cpp/src/arrow/filesystem/filesystem.cc @@ -695,8 +695,7 @@ Result> FileSystemFromUriReal(const Uri& uri, if (scheme == "gs" || scheme == "gcs") { #ifdef ARROW_GCS ARROW_ASSIGN_OR_RAISE(auto options, GcsOptions::FromUri(uri, out_path)); - ARROW_ASSIGN_OR_RAISE(auto gcsfs, GcsFileSystem::Make(options, io_context)); - return gcsfs; + return GcsFileSystem::Make(options, io_context); #else return Status::NotImplemented("Got GCS URI but Arrow compiled without GCS support"); #endif diff --git a/cpp/src/arrow/filesystem/gcsfs.cc b/cpp/src/arrow/filesystem/gcsfs.cc index 9bd1b15b998..becfd8b8e74 100644 --- a/cpp/src/arrow/filesystem/gcsfs.cc +++ b/cpp/src/arrow/filesystem/gcsfs.cc @@ -33,13 +33,23 @@ namespace arrow { namespace fs { -struct GcsCredentials { - explicit GcsCredentials(std::shared_ptr c) +struct GcsCredentialsHolder { + explicit GcsCredentialsHolder(std::shared_ptr c) : credentials(std::move(c)) {} std::shared_ptr credentials; }; +bool GcsCredentials::Equals(const GcsCredentials& other) const { + if (holder_->credentials == other.holder_->credentials) { + return true; + } + return anonymous_ == other.anonymous_ && access_token_ == other.access_token_ && + expiration_ == other.expiration_ && + json_credentials_ == other.json_credentials_ && + target_service_account_ == other.target_service_account_; +} + namespace { namespace gcs = google::cloud::storage; @@ -95,12 +105,10 @@ struct GcsPath { class GcsInputStream : public arrow::io::InputStream { public: explicit GcsInputStream(gcs::ObjectReadStream stream, GcsPath path, - gcs::Generation generation, gcs::ReadFromOffset offset, - gcs::Client client) + gcs::Generation generation, gcs::Client client) : stream_(std::move(stream)), path_(std::move(path)), generation_(generation), - offset_(offset.value_or(0)), 
client_(std::move(client)) {} ~GcsInputStream() override = default; @@ -115,7 +123,7 @@ class GcsInputStream : public arrow::io::InputStream { Result Tell() const override { if (closed()) return Status::Invalid("Cannot use Tell() on a closed stream"); - return stream_.tellg() + offset_; + return stream_.tellg(); } // A gcs::ObjectReadStream can be "born closed". For small objects the stream returns @@ -156,7 +164,6 @@ class GcsInputStream : public arrow::io::InputStream { mutable gcs::ObjectReadStream stream_; GcsPath path_; gcs::Generation generation_; - std::int64_t offset_; gcs::Client client_; bool closed_ = false; }; @@ -297,8 +304,9 @@ google::cloud::Options AsGoogleCloudOptions(const GcsOptions& o) { if (!o.endpoint_override.empty()) { options.set(scheme + "://" + o.endpoint_override); } - if (o.credentials && o.credentials->credentials) { - options.set(o.credentials->credentials); + if (o.credentials.holder() && (o.credentials.holder())->credentials) { + options.set( + o.credentials.holder()->credentials); } return options; } @@ -318,19 +326,43 @@ class GcsFileSystem::Impl { return GetFileInfoBucket(path, std::move(meta).status()); } auto meta = client_.GetObjectMetadata(path.bucket, path.object); - return GetFileInfoObject(path, meta); + Result info = GetFileInfoObject(path, meta); + if (!info.ok() || info->type() != FileType::NotFound) { + return info; + } + // Not found case. It could be this was written to GCS with a different + // "Directory" convention. So it if there are is at least one objec that + // matches the prefix we assume it is a directory. 
+ std::string canonical = internal::EnsureTrailingSlash(path.object); + std::string end = canonical; + end.back() += 1; + auto list_result = + client_.ListObjects(path.bucket, gcs::Prefix(canonical), gcs::EndOffset(end)); + if (list_result.begin() != list_result.end()) { + // If there is at least one result it indicates this is a directory (at + // least one object exists that starts with "path/" + return FileInfo(path.full_path, FileType::Directory); + } + // Return the original not-found info if there was no match. + return info; } Result GetFileInfo(const FileSelector& select) { ARROW_ASSIGN_OR_RAISE(auto p, GcsPath::FromString(select.base_dir)); - // Adding the trailing '/' avoids problems with files named 'a', 'ab', 'ac' where GCS - // would return all of them if the prefix is 'a'. + // Adding the trailing '/' avoids problems with files named 'a', 'ab', 'ac' where + // GCS would return all of them if the prefix is 'a'. const auto canonical = internal::EnsureTrailingSlash(p.object); - const auto max_depth = internal::Depth(canonical) + select.max_recursion; + // Need to add one level when the object is not empty because all + // directories have an extra slash. + const auto max_depth = + internal::Depth(p.object) + select.max_recursion + !p.object.empty(); auto prefix = p.object.empty() ? gcs::Prefix() : gcs::Prefix(canonical); auto delimiter = select.recursive ? gcs::Delimiter() : gcs::Delimiter("/"); + auto include_trailing = select.recursive ? 
gcs::IncludeTrailingDelimiter(false) + : gcs::IncludeTrailingDelimiter(true); FileInfoVector result; - for (auto const& o : client_.ListObjects(p.bucket, prefix, delimiter)) { + for (auto const& o : + client_.ListObjects(p.bucket, prefix, delimiter, include_trailing)) { if (!o.ok()) { if (select.allow_not_found && o.status().code() == google::cloud::StatusCode::kNotFound) { @@ -340,11 +372,13 @@ class GcsFileSystem::Impl { } // Skip the directory itself from the results, and any result that is "too deep" // into the recursion. - if (o->name() == p.object || internal::Depth(o->name()) > max_depth) { + bool has_trailing_slash = !o->name().empty() && o->name().back() == '/'; + if (o->name() == canonical || o->name() == p.object || + internal::Depth(o->name()) > (max_depth + has_trailing_slash)) { continue; } auto path = internal::ConcatAbstractPath(o->bucket(), o->name()); - result.push_back(ToFileInfo(path, *o)); + result.push_back(ToFileInfo(path, *o, /*normalize_directories=*/true)); } // Finding any elements indicates the directory was found. if (!result.empty() || select.allow_not_found) { @@ -365,7 +399,7 @@ class GcsFileSystem::Impl { google::cloud::StatusOr CreateDirMarker(const std::string& bucket, util::string_view name) { // Make the name canonical. - const auto canonical = internal::RemoveTrailingSlash(name).to_string(); + const auto canonical = internal::EnsureTrailingSlash(name); google::cloud::StatusOr object = client_.InsertObject( bucket, canonical, std::string(), gcs::WithObjectMetadata( @@ -398,6 +432,13 @@ class GcsFileSystem::Impl { if (o) { if (IsDirectory(*o)) break; return NotDirectoryError(*o); + } else { + // If we didn't find the raw path, check if there is an entry + // ending in a slash. 
+ o = client_.GetObjectMetadata(bucket, internal::EnsureTrailingSlash(dir)); + if (o) { + break; + } } missing_parents.push_back(dir); } @@ -417,7 +458,7 @@ class GcsFileSystem::Impl { // Note that the list of parents are sorted from deepest to most shallow, this is // convenient because as soon as we find a directory we can stop the iteration. for (auto const& d : missing_parents) { - auto o = CreateDirMarker(bucket, d); + auto o = CreateDirMarker(bucket, internal::EnsureTrailingSlash(d)); if (o) { if (IsDirectory(*o)) continue; // This is probably a race condition, something created a file before we managed @@ -430,15 +471,17 @@ class GcsFileSystem::Impl { Status CreateDir(const GcsPath& p) { if (p.object.empty()) { - return internal::ToArrowStatus( - client_ - .CreateBucket(p.bucket, gcs::BucketMetadata().set_location( - options_.default_bucket_location)) - .status()); + auto metadata = + gcs::BucketMetadata().set_location(options_.default_bucket_location); + return internal::ToArrowStatus(client_.CreateBucket(p.bucket, metadata).status()); } auto parent = p.parent(); if (!parent.object.empty()) { - auto o = client_.GetObjectMetadata(p.bucket, parent.object); + auto o = client_.GetObjectMetadata(p.bucket, + internal::EnsureTrailingSlash(parent.object)); + if (!o.ok()) { + return internal::ToArrowStatus(o.status()); + } if (!IsDirectory(*o)) return NotDirectoryError(*o); } return internal::ToArrowStatus(CreateDirMarker(p.bucket, p.object).status()); @@ -451,7 +494,8 @@ class GcsFileSystem::Impl { Status DeleteDir(const GcsPath& p, const io::IOContext& io_context) { RETURN_NOT_OK(DeleteDirContents(p, /*missing_dir_ok=*/false, io_context)); if (!p.object.empty()) { - return internal::ToArrowStatus(client_.DeleteObject(p.bucket, p.object)); + auto canonical = std::string(internal::EnsureTrailingSlash(p.object)); + return internal::ToArrowStatus(client_.DeleteObject(p.bucket, canonical)); } return internal::ToArrowStatus(client_.DeleteBucket(p.bucket)); } @@ -544,8 
+588,7 @@ class GcsFileSystem::Impl { gcs::ReadFromOffset offset) { auto stream = client_.ReadObject(path.bucket, path.object, generation, offset); ARROW_GCS_RETURN_NOT_OK(stream.status()); - return std::make_shared(std::move(stream), path, gcs::Generation(), - offset, client_); + return std::make_shared(std::move(stream), path, generation, client_); } Result> OpenOutputStream( @@ -604,9 +647,15 @@ class GcsFileSystem::Impl { } static FileInfo ToFileInfo(const std::string& full_path, - const gcs::ObjectMetadata& meta) { - if (IsDirectory(meta)) { - return FileInfo(full_path, FileType::Directory); + const gcs::ObjectMetadata& meta, + bool normalize_directories = false) { + if (IsDirectory(meta) || (!full_path.empty() && full_path.back() == '/')) { + if (normalize_directories) { + auto normalized = std::string(internal::RemoveTrailingSlash(full_path)); + return FileInfo(normalized, FileType::Directory); + } else { + return FileInfo(full_path, FileType::Directory); + } } auto info = FileInfo(full_path, FileType::File); info.set_size(static_cast(meta.size())); @@ -622,32 +671,35 @@ class GcsFileSystem::Impl { }; bool GcsOptions::Equals(const GcsOptions& other) const { - return credentials == other.credentials && + return credentials.Equals(other.credentials) && endpoint_override == other.endpoint_override && scheme == other.scheme && default_bucket_location == other.default_bucket_location; } GcsOptions GcsOptions::Defaults() { GcsOptions options{}; - options.credentials = - std::make_shared(google::cloud::MakeGoogleDefaultCredentials()); + options.credentials.holder_ = std::make_shared( + google::cloud::MakeGoogleDefaultCredentials()); options.scheme = "https"; return options; } GcsOptions GcsOptions::Anonymous() { GcsOptions options{}; - options.credentials = - std::make_shared(google::cloud::MakeInsecureCredentials()); + options.credentials.holder_ = + std::make_shared(google::cloud::MakeInsecureCredentials()); + options.credentials.anonymous_ = true; options.scheme 
= "http"; return options; } GcsOptions GcsOptions::FromAccessToken(const std::string& access_token, - std::chrono::system_clock::time_point expiration) { + TimePoint expiration) { GcsOptions options{}; - options.credentials = std::make_shared( + options.credentials.holder_ = std::make_shared( google::cloud::MakeAccessTokenCredentials(access_token, expiration)); + options.credentials.access_token_ = access_token; + options.credentials.expiration_ = expiration; options.scheme = "https"; return options; } @@ -655,17 +707,20 @@ GcsOptions GcsOptions::FromAccessToken(const std::string& access_token, GcsOptions GcsOptions::FromImpersonatedServiceAccount( const GcsCredentials& base_credentials, const std::string& target_service_account) { GcsOptions options{}; - options.credentials = std::make_shared( + options.credentials = base_credentials; + options.credentials.holder_ = std::make_shared( google::cloud::MakeImpersonateServiceAccountCredentials( - base_credentials.credentials, target_service_account)); + base_credentials.holder_->credentials, target_service_account)); + options.credentials.target_service_account_ = target_service_account; options.scheme = "https"; return options; } GcsOptions GcsOptions::FromServiceAccountCredentials(const std::string& json_object) { GcsOptions options{}; - options.credentials = std::make_shared( + options.credentials.holder_ = std::make_shared( google::cloud::MakeServiceAccountCredentials(json_object)); + options.credentials.json_credentials_ = json_object; options.scheme = "https"; return options; } @@ -698,11 +753,17 @@ Result GcsOptions::FromUri(const arrow::internal::Uri& uri, options_map.emplace(kv.first, kv.second); } - if (!uri.password().empty() || !uri.username().empty()) { - return Status::Invalid("GCS does not accept username or password."); + const std::string& username = uri.username(); + bool anonymous = username == "anonymous"; + if (!uri.password().empty() || (!username.empty() && !anonymous)) { + return 
Status::Invalid( + "GCS does not accept username except \"anonymous\" or password."); } - auto options = GcsOptions::Defaults(); + if (anonymous) { + options = GcsOptions::Anonymous(); + } + for (const auto& kv : options_map) { if (kv.first == "location") { options.default_bucket_location = kv.second; @@ -726,6 +787,7 @@ Result GcsOptions::FromUri(const std::string& uri_string, } std::string GcsFileSystem::type_name() const { return "gcs"; } +const GcsOptions& GcsFileSystem::options() const { return impl_->options(); } bool GcsFileSystem::Equals(const FileSystem& other) const { if (this == &other) { diff --git a/cpp/src/arrow/filesystem/gcsfs.h b/cpp/src/arrow/filesystem/gcsfs.h index c84374cdbb8..099cd40095c 100644 --- a/cpp/src/arrow/filesystem/gcsfs.h +++ b/cpp/src/arrow/filesystem/gcsfs.h @@ -27,11 +27,36 @@ namespace arrow { namespace fs { -struct GcsCredentials; +// Opaque wrapper for GCS's library credentials to avoid exposing in Arrow headers. +struct GcsCredentialsHolder; +class GcsFileSystem; + +/// \brief Container for GCS Credentials and information necessary to recreate them. +class GcsCredentials { + public: + bool Equals(const GcsCredentials& other) const; + bool anonymous() const { return anonymous_; } + const std::string& access_token() { return access_token_; } + TimePoint expiration() const { return expiration_; } + const std::string& target_service_account() { return target_service_account_; } + const std::string& json_credentials() { return json_credentials_; } + const std::shared_ptr& holder() const { return holder_; } + + private: + GcsCredentials() = default; + bool anonymous_ = false; + std::string access_token_; + TimePoint expiration_; + std::string target_service_account_; + std::string json_credentials_; + std::shared_ptr holder_; + friend class GcsFileSystem; + friend struct GcsOptions; +}; /// Options for the GcsFileSystem implementation. 
struct ARROW_EXPORT GcsOptions { - std::shared_ptr credentials; + GcsCredentials credentials; std::string endpoint_override; std::string scheme; @@ -68,7 +93,7 @@ struct ARROW_EXPORT GcsOptions { /// tokens. Note that access tokens are time limited, you will need to manually refresh /// the tokens created by the out-of-band mechanism. static GcsOptions FromAccessToken(const std::string& access_token, - std::chrono::system_clock::time_point expiration); + TimePoint expiration); /// \brief Initialize with service account impersonation /// @@ -141,6 +166,7 @@ class ARROW_EXPORT GcsFileSystem : public FileSystem { ~GcsFileSystem() override = default; std::string type_name() const override; + const GcsOptions& options() const; bool Equals(const FileSystem& other) const override; diff --git a/cpp/src/arrow/filesystem/gcsfs_test.cc b/cpp/src/arrow/filesystem/gcsfs_test.cc index 9eaacb0dc15..9c4b4485cdc 100644 --- a/cpp/src/arrow/filesystem/gcsfs_test.cc +++ b/cpp/src/arrow/filesystem/gcsfs_test.cc @@ -50,6 +50,7 @@ namespace bp = boost::process; namespace gc = google::cloud; namespace gcs = google::cloud::storage; +using ::testing::Eq; using ::testing::HasSubstr; using ::testing::IsEmpty; using ::testing::Not; @@ -291,15 +292,15 @@ TEST(GcsFileSystem, OptionsCompare) { TEST(GcsFileSystem, OptionsAnonymous) { GcsOptions a = GcsOptions::Anonymous(); - EXPECT_THAT(a.credentials, NotNull()); + EXPECT_THAT(a.credentials.holder(), NotNull()); + EXPECT_THAT(a.credentials.anonymous(), true); EXPECT_EQ(a.scheme, "http"); } TEST(GcsFileSystem, OptionsFromUri) { std::string path; - GcsOptions options; - ASSERT_OK_AND_ASSIGN(options, GcsOptions::FromUri("gs://", &path)); + ASSERT_OK_AND_ASSIGN(GcsOptions options, GcsOptions::FromUri("gs://", &path)); EXPECT_EQ(options.default_bucket_location, ""); EXPECT_EQ(options.scheme, "https"); EXPECT_EQ(options.endpoint_override, ""); @@ -342,20 +343,24 @@ TEST(GcsFileSystem, OptionsFromUri) { } TEST(GcsFileSystem, OptionsAccessToken) { - auto 
a = GcsOptions::FromAccessToken( - "invalid-access-token-test-only", - std::chrono::system_clock::now() + std::chrono::minutes(5)); - EXPECT_THAT(a.credentials, NotNull()); + TimePoint expiration = std::chrono::system_clock::now() + std::chrono::minutes(5); + auto a = GcsOptions::FromAccessToken(/*access_token=*/"accesst", expiration); + EXPECT_THAT(a.credentials.holder(), NotNull()); + EXPECT_THAT(a.credentials.access_token(), Eq("accesst")); + EXPECT_THAT(a.credentials.expiration(), Eq(expiration)); EXPECT_EQ(a.scheme, "https"); } TEST(GcsFileSystem, OptionsImpersonateServiceAccount) { - auto base = GcsOptions::FromAccessToken( - "invalid-access-token-test-only", - std::chrono::system_clock::now() + std::chrono::minutes(5)); - auto a = GcsOptions::FromImpersonatedServiceAccount( - *base.credentials, "invalid-sa-test-only@my-project.iam.gserviceaccount.com"); - EXPECT_THAT(a.credentials, NotNull()); + TimePoint expiration = std::chrono::system_clock::now() + std::chrono::minutes(5); + auto base = GcsOptions::FromAccessToken(/*access_token=*/"at", expiration); + std::string account = "invalid-sa-test-only@my-project.iam.gserviceaccount.com"; + auto a = GcsOptions::FromImpersonatedServiceAccount(base.credentials, account); + EXPECT_THAT(a.credentials.holder(), NotNull()); + EXPECT_THAT(a.credentials.access_token(), Eq("at")); + EXPECT_THAT(a.credentials.expiration(), Eq(expiration)); + EXPECT_THAT(a.credentials.target_service_account(), Eq(account)); + EXPECT_EQ(a.scheme, "https"); } @@ -378,7 +383,8 @@ TEST(GcsFileSystem, OptionsServiceAccountCredentials) { })"""; auto a = GcsOptions::FromServiceAccountCredentials(kJsonKeyfileContents); - EXPECT_THAT(a.credentials, NotNull()); + EXPECT_THAT(a.credentials.holder(), NotNull()); + EXPECT_THAT(a.credentials.json_credentials(), kJsonKeyfileContents); EXPECT_EQ(a.scheme, "https"); } @@ -425,13 +431,13 @@ TEST(GcsFileSystem, ToArrowStatus) { } TEST(GcsFileSystem, FileSystemCompare) { - GcsOptions a_options; + GcsOptions 
a_options = GcsOptions::Defaults(); a_options.scheme = "http"; auto a = GcsFileSystem::Make(a_options); EXPECT_THAT(a, NotNull()); EXPECT_TRUE(a->Equals(*a)); - GcsOptions b_options; + GcsOptions b_options = GcsOptions::Defaults(); b_options.scheme = "http"; b_options.endpoint_override = "localhost:1234"; auto b = GcsFileSystem::Make(b_options); @@ -568,7 +574,46 @@ TEST_F(GcsIntegrationTest, GetFileInfoBucket) { ASSERT_RAISES(Invalid, fs->GetFileInfo("gs://" + PreexistingBucketName())); } -TEST_F(GcsIntegrationTest, GetFileInfoObject) { +TEST_F(GcsIntegrationTest, GetFileInfoObjectWithNestedStructure) { + auto fs = GcsFileSystem::Make(TestGcsOptions()); + constexpr auto kObjectName = "test-object-dir/some_other_dir/another_dir/foo"; + ASSERT_OK_AND_ASSIGN( + auto output, + fs->OpenOutputStream(PreexistingBucketPath() + kObjectName, /*metadata=*/{})); + const auto data = std::string(kLoremIpsum); + ASSERT_OK(output->Write(data.data(), data.size())); + ASSERT_OK(output->Close()); + + ASSERT_OK_AND_ASSIGN(output, fs->OpenOutputStream(PreexistingBucketPath() + + "test-object-dir/some_other_dir0", + /*metadata=*/{})); + ASSERT_OK(output->Write(data.data(), data.size())); + ASSERT_OK(output->Close()); + ASSERT_OK_AND_ASSIGN( + output, + fs->OpenOutputStream(PreexistingBucketPath() + kObjectName + "0", /*metadata=*/{})); + ASSERT_OK(output->Write(data.data(), data.size())); + ASSERT_OK(output->Close()); + + AssertFileInfo(fs.get(), PreexistingBucketPath() + kObjectName, FileType::File); + AssertFileInfo(fs.get(), PreexistingBucketPath() + kObjectName + "/", + FileType::NotFound); + AssertFileInfo(fs.get(), PreexistingBucketPath() + "test-object-dir", + FileType::Directory); + AssertFileInfo(fs.get(), PreexistingBucketPath() + "test-object-dir/", + FileType::Directory); + AssertFileInfo(fs.get(), PreexistingBucketPath() + "test-object-dir/some_other_dir", + FileType::Directory); + AssertFileInfo(fs.get(), PreexistingBucketPath() + "test-object-dir/some_other_dir/", + 
FileType::Directory); + + AssertFileInfo(fs.get(), PreexistingBucketPath() + "test-object-di", + FileType::NotFound); + AssertFileInfo(fs.get(), PreexistingBucketPath() + "test-object-dir/some_other_di", + FileType::NotFound); +} + +TEST_F(GcsIntegrationTest, GetFileInfoObjectNoExplicitObject) { auto fs = GcsFileSystem::Make(TestGcsOptions()); auto object = GcsClient().GetObjectMetadata(PreexistingBucketName(), PreexistingObjectName()); diff --git a/cpp/src/arrow/util/config.h.cmake b/cpp/src/arrow/util/config.h.cmake index 55bc2d01005..bd6447a20e0 100644 --- a/cpp/src/arrow/util/config.h.cmake +++ b/cpp/src/arrow/util/config.h.cmake @@ -45,6 +45,7 @@ #cmakedefine ARROW_IPC #cmakedefine ARROW_JSON +#cmakedefine ARROW_GCS #cmakedefine ARROW_S3 #cmakedefine ARROW_USE_NATIVE_INT128 #cmakedefine ARROW_WITH_OPENTELEMETRY diff --git a/dev/archery/archery/cli.py b/dev/archery/archery/cli.py index dcf15afafe1..5f69d9ff91a 100644 --- a/dev/archery/archery/cli.py +++ b/dev/archery/archery/cli.py @@ -183,6 +183,8 @@ def _apply_options(cmd, options): @click.option("--with-r", default=None, type=BOOL, help="Build the Arrow R extensions. 
This is not a CMake option, " "it will toggle required options") +@click.option("--with-gcs", default=None, type=BOOL, + help="Build Arrow with Google Cloud Storage (gcs) support.") @click.option("--with-s3", default=None, type=BOOL, help="Build Arrow with S3 support.") # Compressions diff --git a/dev/archery/archery/lang/cpp.py b/dev/archery/archery/lang/cpp.py index ac3b251f489..e188a1acf42 100644 --- a/dev/archery/archery/lang/cpp.py +++ b/dev/archery/archery/lang/cpp.py @@ -56,7 +56,7 @@ def __init__(self, with_ipc=True, with_json=None, with_jni=None, with_mimalloc=None, with_jemalloc=None, with_parquet=None, with_plasma=None, with_python=True, - with_r=None, with_s3=None, + with_r=None, with_gcs=None, with_s3=None, # Compressions with_brotli=None, with_bz2=None, with_lz4=None, with_snappy=None, with_zlib=None, with_zstd=None, @@ -106,6 +106,7 @@ def __init__(self, self.with_plasma = with_plasma self.with_python = with_python self.with_r = with_r + self.with_gcs = with_gcs self.with_s3 = with_s3 self.with_brotli = with_brotli @@ -218,7 +219,7 @@ def _gen_defs(self): yield ("ARROW_FILESYSTEM", truthifier(self.with_filesystem)) yield ("ARROW_FLIGHT", truthifier(self.with_flight)) yield ("ARROW_GANDIVA", truthifier(self.with_gandiva)) - yield ("ARROW_PARQUET", truthifier(self.with_parquet)) + yield ("ARROW_GCS", truthifier(self.with_gcs)) yield ("ARROW_HDFS", truthifier(self.with_hdfs)) yield ("ARROW_HIVESERVER2", truthifier(self.with_hiveserver2)) yield ("ARROW_IPC", truthifier(self.with_ipc)) @@ -226,6 +227,7 @@ def _gen_defs(self): yield ("ARROW_JNI", truthifier(self.with_jni)) yield ("ARROW_MIMALLOC", truthifier(self.with_mimalloc)) yield ("ARROW_JEMALLOC", truthifier(self.with_jemalloc)) + yield ("ARROW_PARQUET", truthifier(self.with_parquet)) yield ("ARROW_PLASMA", truthifier(self.with_plasma)) yield ("ARROW_PYTHON", truthifier(self.with_python)) yield ("ARROW_S3", truthifier(self.with_s3)) diff --git a/dev/release/verify-release-candidate.sh 
b/dev/release/verify-release-candidate.sh index a512449aea5..cbf3c9c51e3 100755 --- a/dev/release/verify-release-candidate.sh +++ b/dev/release/verify-release-candidate.sh @@ -662,6 +662,9 @@ test_python() { if [ "${ARROW_GANDIVA}" = "ON" ]; then export PYARROW_WITH_GANDIVA=1 fi + if [ "${ARROW_GCS}" = "ON" ]; then + export PYARROW_WITH_GCS=1 + fi if [ "${ARROW_PLASMA}" = "ON" ]; then export PYARROW_WITH_PLASMA=1 fi @@ -694,6 +697,9 @@ import pyarrow.parquet if [ "${ARROW_GANDIVA}" == "ON" ]; then python -c "import pyarrow.gandiva" fi + if [ "${ARROW_GCS}" == "ON" ]; then + python -c "import pyarrow._gcsfs" + fi if [ "${ARROW_PLASMA}" == "ON" ]; then python -c "import pyarrow.plasma" fi @@ -701,6 +707,7 @@ import pyarrow.parquet python -c "import pyarrow._s3fs" fi + # Install test dependencies pip install -r requirements-test.txt @@ -1000,6 +1007,7 @@ test_linux_wheels() { } test_macos_wheels() { + local check_gcs=ON local check_s3=ON local check_flight=ON @@ -1019,6 +1027,7 @@ test_macos_wheels() { for platform in ${platform_tags}; do show_header "Testing Python ${pyver} wheel for platform ${platform}" if [[ "$platform" == *"10_9"* ]]; then + check_gcs=OFF check_s3=OFF fi @@ -1026,7 +1035,7 @@ test_macos_wheels() { VENV_ENV=wheel-${pyver}-${platform} PYTHON_VERSION=${pyver} maybe_setup_virtualenv || continue pip install pyarrow-${VERSION}-cp${pyver/.}-cp${python/.}-${platform}.whl - INSTALL_PYARROW=OFF ARROW_FLIGHT=${check_flight} ARROW_S3=${check_s3} \ + INSTALL_PYARROW=OFF ARROW_FLIGHT=${check_flight} ARROW_GCS=${check_gcs} ARROW_S3=${check_s3} \ ${ARROW_SOURCE_DIR}/ci/scripts/python_wheel_unix_test.sh ${ARROW_SOURCE_DIR} done done @@ -1155,9 +1164,9 @@ fi : ${ARROW_CUDA:=OFF} : ${ARROW_FLIGHT:=ON} : ${ARROW_GANDIVA:=ON} +: ${ARROW_GCS:=OFF} : ${ARROW_PLASMA:=ON} : ${ARROW_S3:=OFF} -: ${ARROW_GCS:=OFF} TEST_SUCCESS=no diff --git a/dev/tasks/conda-recipes/arrow-cpp/bld-pyarrow.bat b/dev/tasks/conda-recipes/arrow-cpp/bld-pyarrow.bat index f0e26f0bc82..a03a37722fa 
100644 --- a/dev/tasks/conda-recipes/arrow-cpp/bld-pyarrow.bat +++ b/dev/tasks/conda-recipes/arrow-cpp/bld-pyarrow.bat @@ -17,6 +17,7 @@ pushd "%SRC_DIR%"\python SET ARROW_HOME=%LIBRARY_PREFIX% SET SETUPTOOLS_SCM_PRETEND_VERSION=%PKG_VERSION% SET PYARROW_BUILD_TYPE=release +SET PYARROW_WITH_GCS=1 SET PYARROW_WITH_S3=1 SET PYARROW_WITH_HDFS=1 SET PYARROW_WITH_DATASET=1 diff --git a/dev/tasks/conda-recipes/arrow-cpp/build-pyarrow.sh b/dev/tasks/conda-recipes/arrow-cpp/build-pyarrow.sh index 826942b62c7..ff43af7a568 100644 --- a/dev/tasks/conda-recipes/arrow-cpp/build-pyarrow.sh +++ b/dev/tasks/conda-recipes/arrow-cpp/build-pyarrow.sh @@ -18,6 +18,7 @@ else export PYARROW_WITH_GANDIVA=1 fi export PYARROW_WITH_HDFS=1 +export PYARROW_WITH_GCS=1 export PYARROW_WITH_ORC=1 export PYARROW_WITH_PARQUET=1 export PYARROW_WITH_PARQUET_ENCRYPTION=1 diff --git a/dev/tasks/homebrew-formulae/apache-arrow.rb b/dev/tasks/homebrew-formulae/apache-arrow.rb index a22b62afc27..94a1a67a1a4 100644 --- a/dev/tasks/homebrew-formulae/apache-arrow.rb +++ b/dev/tasks/homebrew-formulae/apache-arrow.rb @@ -72,6 +72,7 @@ def install args = %W[ -DARROW_FLIGHT=ON -DARROW_GANDIVA=ON + -DARROW_GCS=ON -DARROW_INSTALL_NAME_RPATH=OFF -DARROW_JEMALLOC=ON -DARROW_MIMALLOC=ON @@ -91,7 +92,6 @@ def install -DCMAKE_FIND_PACKAGE_PREFER_CONFIG=TRUE -DPython3_EXECUTABLE=#{Formula["python@3.9"].bin/"python3"} ] - # Re-enable -DARROW_S3=ON and add back aws-sdk-cpp to depends_on in ARROW-6437 mkdir "build" do system "cmake", "../cpp", *std_cmake_args, *args diff --git a/dev/tasks/tasks.yml b/dev/tasks/tasks.yml index 95d404932cc..8ae8ec0b6da 100644 --- a/dev/tasks/tasks.yml +++ b/dev/tasks/tasks.yml @@ -469,7 +469,7 @@ tasks: {############################## Wheel OSX ####################################} # enable S3 support from macOS 10.13 so we don't need to bundle curl, crypt and ssl -{% for macos_version, macos_codename, arrow_s3 in [("10.9", "mavericks", "OFF"), +{% for macos_version, macos_codename, 
arrow_s3_gcs in [("10.9", "mavericks", "OFF"), ("10.13", "high-sierra", "ON")] %} {% set platform_tag = "macosx_{}_x86_64".format(macos_version.replace('.', '_')) %} @@ -479,7 +479,8 @@ tasks: params: python_version: "{{ python_version }}" macos_deployment_target: "{{ macos_version }}" - arrow_s3: "{{ arrow_s3 }}" + arrow_s3: "{{ arrow_s3_gcs }}" + arrow_gcs: "{{ arrow_s3_gcs }}" artifacts: - pyarrow-{no_rc_version}-{{ python_tag }}-{{ abi_tag }}-{{ platform_tag }}.whl diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index 7386c256fa4..a657f56bb2d 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -412,6 +412,10 @@ set(CYTHON_EXTENSIONS set(LINK_LIBS arrow_shared arrow_python_shared) +if(PYARROW_BUILD_GCS) + set(CYTHON_EXTENSIONS ${CYTHON_EXTENSIONS} _gcsfs) +endif() + if(PYARROW_BUILD_S3) set(CYTHON_EXTENSIONS ${CYTHON_EXTENSIONS} _s3fs) endif() diff --git a/python/asv-build.sh b/python/asv-build.sh index 7de5ff4a2c2..188085f927a 100755 --- a/python/asv-build.sh +++ b/python/asv-build.sh @@ -49,6 +49,7 @@ cmake -GNinja \ -DARROW_CXXFLAGS=$CXXFLAGS \ -DARROW_USE_GLOG=off \ -DARROW_FLIGHT=on \ + -DARROW_GCS=on \ -DARROW_ORC=on \ -DARROW_PARQUET=on \ -DARROW_PYTHON=on \ @@ -66,6 +67,7 @@ export SETUPTOOLS_SCM_PRETEND_VERSION=0.0.1 export PYARROW_BUILD_TYPE=release export PYARROW_PARALLEL=8 export PYARROW_WITH_FLIGHT=1 +export PYARROW_WITH_GCS=1 export PYARROW_WITH_ORC=1 export PYARROW_WITH_PARQUET=1 export PYARROW_WITH_PLASMA=1 diff --git a/python/pyarrow/_fs.pyx b/python/pyarrow/_fs.pyx index 75ad0ccd9b3..af5bebe7d6a 100644 --- a/python/pyarrow/_fs.pyx +++ b/python/pyarrow/_fs.pyx @@ -367,6 +367,9 @@ cdef class FileSystem(_Weakrefable): elif typ == 's3': from pyarrow._s3fs import S3FileSystem self = S3FileSystem.__new__(S3FileSystem) + elif typ == 'gcs': + from pyarrow._gcsfs import GcsFileSystem + self = GcsFileSystem.__new__(GcsFileSystem) elif typ == 'hdfs': from pyarrow._hdfs import HadoopFileSystem self = 
HadoopFileSystem.__new__(HadoopFileSystem) diff --git a/python/pyarrow/_gcsfs.pyx b/python/pyarrow/_gcsfs.pyx new file mode 100644 index 00000000000..e267948b688 --- /dev/null +++ b/python/pyarrow/_gcsfs.pyx @@ -0,0 +1,173 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# cython: language_level = 3 + +from pyarrow.lib cimport (check_status, pyarrow_wrap_metadata, + pyarrow_unwrap_metadata) +from pyarrow.lib import frombytes, tobytes, KeyValueMetadata +from pyarrow.includes.common cimport * +from pyarrow.includes.libarrow cimport * +from pyarrow.includes.libarrow_fs cimport * +from pyarrow._fs cimport FileSystem +from cython.operator cimport dereference as deref + +from datetime import datetime + + +cdef class GcsFileSystem(FileSystem): + """ + GCS-backed FileSystem implementation + + By default uses the process described in https://google.aip.dev/auth/4110 + to resolve credentials. If not running on GCP this generally requires the + environment variable GOOGLE_APPLICATION_CREDENTIALS to point to a JSON + file containing credentials. + + Note: GCS buckets are special and the operations available on them may be + limited or more expensive than expected compared to local file systems. 
+ + Note: When pickling a GcsFileSystem that uses default credential resolution + credentials are not stored in the serialized data. Therefore, when unpickling + it is assumed that the necessary credentials are in place for the target + process. + + Parameters + ---------- + anonymous : boolean, default False + Whether to connect anonymously. + If true, will not attempt to look up credentials using standard GCP + configuration methods. + access_token : str, default None + GCP access token. If provided temporary credentials will be fetched by + assuming this role. If specified an credential_token_expiration must be + specified with the token. + target_service_account : str, default None + An optional service account to try to impersonate when accessing GCS. This + requires the specified credential user/service_account has the necessary + permissions. + credential_token_expiration : datetime, default None + Expiration for credential generated with an access token. Must be specified + if token is specified. + default_bucket_location : str, default 'US-CENTRAL1' + GCP region to create buckets in. + scheme : str, default 'https' + GCS connection transport scheme. + endpoint_override : str, default None + Override endpoint with a connect string such as "localhost:9000" + default_metadata : mapping or pyarrow.KeyValueMetadata, default None + Default metadata for open_output_stream. This will be ignored if + non-empty metadata is passed to open_output_stream. + """ + + cdef: + CGcsFileSystem* gcsfs + + def __init__(self, *, bint anonymous=False, access_token=None, + target_service_account=None, credential_token_expiration=None, + default_bucket_location='US-CENTRAL1', + scheme=None, + endpoint_override=None, + default_metadata=None): + cdef: + CGcsOptions options + shared_ptr[CGcsFileSystem] wrapped + + # Intentional use of truthiness because empty strings aren't valid and + # for reconstruction from pickling will give empty strings. 
+ if anonymous and (target_service_account or access_token): + raise ValueError( + 'anonymous option is not compatible with target_service_account and ' + 'access_token please only specify only one.' + ) + elif ((access_token and credential_token_expiration is None) or + (not access_token and + credential_token_expiration is not None)): + raise ValueError( + 'access_token and credential_token_expiration must be ' + 'specified together' + ) + + elif anonymous: + options = CGcsOptions.Anonymous() + elif access_token: + options = CGcsOptions.FromAccessToken( + tobytes(access_token), + PyDateTime_to_TimePoint(credential_token_expiration)) + else: + options = CGcsOptions.Defaults() + + if target_service_account: + options = CGcsOptions.FromImpersonatedServiceAccount( + options.credentials, tobytes(target_service_account)) + + options.default_bucket_location = tobytes(default_bucket_location) + + if scheme is not None: + options.scheme = tobytes(scheme) + if endpoint_override is not None: + options.endpoint_override = tobytes(endpoint_override) + if default_metadata is not None: + if not isinstance(default_metadata, KeyValueMetadata): + default_metadata = KeyValueMetadata(default_metadata) + options.default_metadata = pyarrow_unwrap_metadata( + default_metadata) + + with nogil: + wrapped = GetResultValue(CGcsFileSystem.Make(options)) + + self.init( wrapped) + + cdef init(self, const shared_ptr[CFileSystem]& wrapped): + FileSystem.init(self, wrapped) + self.gcsfs = wrapped.get() + + @classmethod + def _reconstruct(cls, kwargs): + return cls(**kwargs) + + def _expiration_datetime_from_options(self): + cdef CGcsOptions opts = self.gcsfs.options() + expiration_ns = TimePoint_to_ns(opts.credentials.expiration()) + if expiration_ns == 0: + return None + ns_per_sec = 1000000000.0 + return datetime.fromtimestamp(expiration_ns / ns_per_sec) + + def __reduce__(self): + cdef CGcsOptions opts = self.gcsfs.options() + service_account = 
frombytes(opts.credentials.target_service_account()) + expiration_dt = self._expiration_datetime_from_options() + return ( + GcsFileSystem._reconstruct, (dict( + access_token=frombytes(opts.credentials.access_token()), + anonymous=opts.credentials.anonymous(), + credential_token_expiration=expiration_dt, + target_service_account=service_account, + scheme=frombytes(opts.scheme), + endpoint_override=frombytes(opts.endpoint_override), + default_bucket_location=frombytes( + opts.default_bucket_location), + default_metadata=pyarrow_wrap_metadata(opts.default_metadata), + ),)) + + @property + def default_bucket_location(self): + """ + The GCP location this filesystem will write to. + """ + return frombytes(self.gcsfs.options().default_bucket_location) diff --git a/python/pyarrow/conftest.py b/python/pyarrow/conftest.py index b114f7d1c60..2fea2e9cb23 100644 --- a/python/pyarrow/conftest.py +++ b/python/pyarrow/conftest.py @@ -26,6 +26,7 @@ 'hypothesis', 'fastparquet', 'gandiva', + 'gcs', 'gdb', 'gzip', 'hdfs', @@ -56,6 +57,7 @@ 'fastparquet': False, 'flight': False, 'gandiva': False, + 'gcs': False, 'gdb': True, 'gzip': Codec.is_available('gzip'), 'hdfs': False, diff --git a/python/pyarrow/fs.py b/python/pyarrow/fs.py index f22eaf03041..932fc82789a 100644 --- a/python/pyarrow/fs.py +++ b/python/pyarrow/fs.py @@ -45,6 +45,11 @@ except ImportError: _not_imported.append("HadoopFileSystem") +try: + from pyarrow._gcsfs import GcsFileSystem # noqa +except ImportError: + _not_imported.append("GcsFileSystem") + try: from pyarrow._s3fs import ( # noqa S3FileSystem, S3LogLevel, initialize_s3, finalize_s3, diff --git a/python/pyarrow/includes/libarrow_fs.pxd b/python/pyarrow/includes/libarrow_fs.pxd index e491233e88f..8a73b2afa75 100644 --- a/python/pyarrow/includes/libarrow_fs.pxd +++ b/python/pyarrow/includes/libarrow_fs.pxd @@ -200,6 +200,39 @@ cdef extern from "arrow/filesystem/api.h" namespace "arrow::fs" nogil: cdef CResult[c_string] ResolveS3BucketRegion(const c_string& 
bucket) + cdef cppclass CGcsCredentials "arrow::fs::GcsCredentials": + c_bool anonymous() + CTimePoint expiration() + c_string access_token() + c_string target_service_account() + + cdef cppclass CGcsOptions "arrow::fs::GcsOptions": + CGcsCredentials credentials + c_string endpoint_override + c_string scheme + c_string default_bucket_location + shared_ptr[const CKeyValueMetadata] default_metadata + c_bool Equals(const CS3Options& other) + + @staticmethod + CGcsOptions Defaults() + + @staticmethod + CGcsOptions Anonymous() + + @staticmethod + CGcsOptions FromAccessToken(const c_string& access_token, + CTimePoint expiration) + + @staticmethod + CGcsOptions FromImpersonatedServiceAccount(const CGcsCredentials& base_credentials, + c_string& target_service_account) + + cdef cppclass CGcsFileSystem "arrow::fs::GcsFileSystem": + @staticmethod + CResult[shared_ptr[CGcsFileSystem]] Make(const CGcsOptions& options) + CGcsOptions options() + cdef cppclass CHdfsOptions "arrow::fs::HdfsOptions": HdfsConnectionConfig connection_config int32_t buffer_size diff --git a/python/pyarrow/tests/conftest.py b/python/pyarrow/tests/conftest.py index 0b7f1618b0e..7771f66aa3e 100644 --- a/python/pyarrow/tests/conftest.py +++ b/python/pyarrow/tests/conftest.py @@ -18,6 +18,7 @@ import os import pathlib import subprocess +import sys from tempfile import TemporaryDirectory import pytest @@ -173,3 +174,23 @@ def s3_server(s3_connection): finally: if proc is not None: proc.kill() + + +@pytest.fixture(scope='session') +def gcs_server(): + port = find_free_port() + env = os.environ.copy() + args = [sys.executable, '-m', 'testbench', '--port', str(port)] + proc = None + try: + proc = subprocess.Popen(args, env=env) + except OSError: + pytest.skip('`gcs test bench` command cannot be located') + else: + yield { + 'connection': ('localhost', port), + 'process': proc, + } + finally: + if proc is not None: + proc.kill() diff --git a/python/pyarrow/tests/test_fs.py b/python/pyarrow/tests/test_fs.py index 
4fd72704a71..ff06026a20a 100644 --- a/python/pyarrow/tests/test_fs.py +++ b/python/pyarrow/tests/test_fs.py @@ -200,6 +200,31 @@ def subtree_localfs(request, tempdir, localfs): ) +@pytest.fixture +def gcsfs(request, gcs_server): + request.config.pyarrow.requires('gcs') + from pyarrow.fs import GcsFileSystem + + host, port = gcs_server['connection'] + bucket = 'pyarrow-filesystem/' + + fs = GcsFileSystem( + endpoint_override=f'{host}:{port}', + scheme='http', + # Mock endpoint doesn't check credentials. + anonymous=True + ) + fs.create_dir(bucket) + + yield dict( + fs=fs, + pathfn=bucket.__add__, + allow_move_dir=False, + allow_append_to_file=False, + ) + fs.delete_dir(bucket) + + @pytest.fixture def s3fs(request, s3_server): request.config.pyarrow.requires('s3') @@ -345,6 +370,11 @@ def py_fsspec_s3fs(request, s3_server): id='S3FileSystem', marks=pytest.mark.s3 ), + pytest.param( + pytest.lazy_fixture('gcsfs'), + id='GcsFileSystem', + marks=pytest.mark.gcs + ), pytest.param( pytest.lazy_fixture('hdfs'), id='HadoopFileSystem', @@ -869,6 +899,10 @@ def test_open_input_file(fs, pathfn): s.write(data) read_from = len(b'some data') * 512 + with fs.open_input_file(p) as f: + result = f.read() + assert result == data + with fs.open_input_file(p) as f: f.seek(read_from) result = f.read() @@ -951,7 +985,7 @@ def test_open_output_stream_metadata(fs, pathfn): assert f.read() == data got_metadata = f.metadata() - if fs.type_name == 's3' or 'mock' in fs.type_name: + if fs.type_name in ['s3', 'gcs'] or 'mock' in fs.type_name: for k, v in metadata.items(): assert got_metadata[k] == v.encode() else: @@ -1010,6 +1044,42 @@ def test_mockfs_mtime_roundtrip(mockfs): assert info.mtime == dt +@pytest.mark.gcs +def test_gcs_options(): + from pyarrow.fs import GcsFileSystem + dt = datetime.now() + fs = GcsFileSystem(access_token='abc', + target_service_account='service_account@apache', + credential_token_expiration=dt, + default_bucket_location='us-west2', + scheme='https', 
endpoint_override='localhost:8999') + assert isinstance(fs, GcsFileSystem) + assert fs.default_bucket_location == 'us-west2' + assert pickle.loads(pickle.dumps(fs)) == fs + + fs = GcsFileSystem() + assert isinstance(fs, GcsFileSystem) + assert pickle.loads(pickle.dumps(fs)) == fs + + fs = GcsFileSystem(anonymous=True) + assert isinstance(fs, GcsFileSystem) + assert pickle.loads(pickle.dumps(fs)) == fs + + fs = GcsFileSystem(default_metadata={"ACL": "authenticated-read", + "Content-Type": "text/plain"}) + assert isinstance(fs, GcsFileSystem) + assert pickle.loads(pickle.dumps(fs)) == fs + + with pytest.raises(ValueError): + GcsFileSystem(access_token='access') + with pytest.raises(ValueError): + GcsFileSystem(anonymous=True, access_token='secret') + with pytest.raises(ValueError): + GcsFileSystem(anonymous=True, target_service_account='acct') + with pytest.raises(ValueError): + GcsFileSystem(credential_token_expiration=datetime.now()) + + @pytest.mark.s3 def test_s3_options(): from pyarrow.fs import S3FileSystem @@ -1321,6 +1391,25 @@ def test_filesystem_from_uri_s3(s3_server): assert info.type == FileType.Directory +@pytest.mark.gcs +def test_filesystem_from_uri_gcs(gcs_server): + from pyarrow.fs import GcsFileSystem + + host, port = gcs_server['connection'] + + uri = ("gs://anonymous@" + + f"mybucket/foo/bar?scheme=http&endpoint_override={host}:{port}") + + fs, path = FileSystem.from_uri(uri) + assert isinstance(fs, GcsFileSystem) + assert path == "mybucket/foo/bar" + + fs.create_dir(path) + [info] = fs.get_file_info([path]) + assert info.path == path + assert info.type == FileType.Directory + + def test_py_filesystem(): handler = DummyHandler() fs = PyFileSystem(handler) diff --git a/python/setup.py b/python/setup.py index 2486a8959e6..b572be1cee4 100755 --- a/python/setup.py +++ b/python/setup.py @@ -113,6 +113,8 @@ def run(self): ('with-parquet', None, 'build the Parquet extension'), ('with-parquet-encryption', None, 'build the Parquet encryption extension'), + 
('with-gcs', None, + 'build the Google Cloud Storage (GCS) extension'), ('with-s3', None, 'build the Amazon S3 extension'), ('with-static-parquet', None, 'link parquet statically'), ('with-static-boost', None, 'link boost statically'), @@ -155,6 +157,8 @@ def initialize_options(self): if not hasattr(sys, 'gettotalrefcount'): self.build_type = 'release' + self.with_gcs = strtobool( + os.environ.get('PYARROW_WITH_GCS', '0')) self.with_s3 = strtobool( os.environ.get('PYARROW_WITH_S3', '0')) self.with_hdfs = strtobool( @@ -216,6 +220,7 @@ def initialize_options(self): '_parquet_encryption', '_orc', '_plasma', + '_gcsfs', '_s3fs', '_substrait', '_hdfs', @@ -281,6 +286,7 @@ def append_cmake_bool(value, varname): append_cmake_bool(self.with_parquet_encryption, 'PYARROW_BUILD_PARQUET_ENCRYPTION') append_cmake_bool(self.with_plasma, 'PYARROW_BUILD_PLASMA') + append_cmake_bool(self.with_gcs, 'PYARROW_BUILD_GCS') append_cmake_bool(self.with_s3, 'PYARROW_BUILD_S3') append_cmake_bool(self.with_hdfs, 'PYARROW_BUILD_HDFS') append_cmake_bool(self.with_tensorflow, 'PYARROW_USE_TENSORFLOW') @@ -447,6 +453,8 @@ def _failure_permitted(self, name): return True if name == '_substrait' and not self.with_substrait: return True + if name == '_gcsfs' and not self.with_gcs: + return True if name == '_s3fs' and not self.with_s3: return True if name == '_hdfs' and not self.with_hdfs: From 2fac4b4645e96bf2c330ce1752f28a61713ea75c Mon Sep 17 00:00:00 2001 From: Micah Kornfield Date: Tue, 19 Apr 2022 03:04:17 +0000 Subject: [PATCH 02/85] address PR comment pt 1 --- cpp/src/arrow/filesystem/gcsfs.cc | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/cpp/src/arrow/filesystem/gcsfs.cc b/cpp/src/arrow/filesystem/gcsfs.cc index becfd8b8e74..a8f1028633d 100644 --- a/cpp/src/arrow/filesystem/gcsfs.cc +++ b/cpp/src/arrow/filesystem/gcsfs.cc @@ -34,9 +34,6 @@ namespace arrow { namespace fs { struct GcsCredentialsHolder { - explicit GcsCredentialsHolder(std::shared_ptr c) - : 
credentials(std::move(c)) {} - std::shared_ptr credentials; }; @@ -331,13 +328,10 @@ class GcsFileSystem::Impl { return info; } // Not found case. It could be this was written to GCS with a different - // "Directory" convention. So it if there are is at least one objec that + // "Directory" convention, so if there is at least one object that // matches the prefix we assume it is a directory. std::string canonical = internal::EnsureTrailingSlash(path.object); - std::string end = canonical; - end.back() += 1; - auto list_result = - client_.ListObjects(path.bucket, gcs::Prefix(canonical), gcs::EndOffset(end)); + auto list_result = client_.ListObjects(path.bucket, gcs::Prefix(canonical)); if (list_result.begin() != list_result.end()) { // If there is at least one result it indicates this is a directory (at // least one object exists that starts with "path/" From c4eec64e0c5b423368174d308efb7054391f6f8f Mon Sep 17 00:00:00 2001 From: emkornfield Date: Mon, 18 Apr 2022 20:04:24 -0700 Subject: [PATCH 03/85] Update cpp/src/arrow/filesystem/gcsfs.cc Co-authored-by: Antoine Pitrou --- cpp/src/arrow/filesystem/gcsfs.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/arrow/filesystem/gcsfs.cc b/cpp/src/arrow/filesystem/gcsfs.cc index a8f1028633d..b495a443203 100644 --- a/cpp/src/arrow/filesystem/gcsfs.cc +++ b/cpp/src/arrow/filesystem/gcsfs.cc @@ -334,7 +334,7 @@ class GcsFileSystem::Impl { auto list_result = client_.ListObjects(path.bucket, gcs::Prefix(canonical)); if (list_result.begin() != list_result.end()) { // If there is at least one result it indicates this is a directory (at - // least one object exists that starts with "path/" + // least one object exists that starts with "path/") return FileInfo(path.full_path, FileType::Directory); } // Return the original not-found info if there was no match. 
From 5f0d5e048f9604e75a8239c095a88dd4cdfb6722 Mon Sep 17 00:00:00 2001 From: emkornfield Date: Mon, 18 Apr 2022 20:05:59 -0700 Subject: [PATCH 04/85] Update cpp/src/arrow/filesystem/gcsfs.cc Co-authored-by: Antoine Pitrou --- cpp/src/arrow/filesystem/gcsfs.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/arrow/filesystem/gcsfs.cc b/cpp/src/arrow/filesystem/gcsfs.cc index b495a443203..861ef43fe66 100644 --- a/cpp/src/arrow/filesystem/gcsfs.cc +++ b/cpp/src/arrow/filesystem/gcsfs.cc @@ -301,7 +301,7 @@ google::cloud::Options AsGoogleCloudOptions(const GcsOptions& o) { if (!o.endpoint_override.empty()) { options.set(scheme + "://" + o.endpoint_override); } - if (o.credentials.holder() && (o.credentials.holder())->credentials) { + if (o.credentials.holder() && o.credentials.holder()->credentials) { options.set( o.credentials.holder()->credentials); } From 026ef2d702d9e77473bd62af74d0874b4eb84800 Mon Sep 17 00:00:00 2001 From: emkornfield Date: Mon, 18 Apr 2022 20:07:28 -0700 Subject: [PATCH 05/85] Update cpp/src/arrow/filesystem/gcsfs.cc Co-authored-by: Antoine Pitrou --- cpp/src/arrow/filesystem/gcsfs.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/arrow/filesystem/gcsfs.cc b/cpp/src/arrow/filesystem/gcsfs.cc index 861ef43fe66..5238f512660 100644 --- a/cpp/src/arrow/filesystem/gcsfs.cc +++ b/cpp/src/arrow/filesystem/gcsfs.cc @@ -646,7 +646,7 @@ class GcsFileSystem::Impl { if (IsDirectory(meta) || (!full_path.empty() && full_path.back() == '/')) { if (normalize_directories) { auto normalized = std::string(internal::RemoveTrailingSlash(full_path)); - return FileInfo(normalized, FileType::Directory); + return FileInfo(std::move(normalized), FileType::Directory); } else { return FileInfo(full_path, FileType::Directory); } From 7172b33d126591d5693ed95ec2dfaf650c939b87 Mon Sep 17 00:00:00 2001 From: Micah Kornfield Date: Tue, 19 Apr 2022 03:24:13 +0000 Subject: [PATCH 06/85] pr comments pt2. 
--- cpp/src/arrow/filesystem/gcsfs.cc | 2 +- cpp/src/arrow/filesystem/gcsfs.h | 6 +++--- cpp/src/arrow/filesystem/gcsfs_test.cc | 6 +++++- 3 files changed, 9 insertions(+), 5 deletions(-) diff --git a/cpp/src/arrow/filesystem/gcsfs.cc b/cpp/src/arrow/filesystem/gcsfs.cc index 5238f512660..88edec566e1 100644 --- a/cpp/src/arrow/filesystem/gcsfs.cc +++ b/cpp/src/arrow/filesystem/gcsfs.cc @@ -452,7 +452,7 @@ class GcsFileSystem::Impl { // Note that the list of parents are sorted from deepest to most shallow, this is // convenient because as soon as we find a directory we can stop the iteration. for (auto const& d : missing_parents) { - auto o = CreateDirMarker(bucket, internal::EnsureTrailingSlash(d)); + auto o = CreateDirMarker(bucket, d); if (o) { if (IsDirectory(*o)) continue; // This is probably a race condition, something created a file before we managed diff --git a/cpp/src/arrow/filesystem/gcsfs.h b/cpp/src/arrow/filesystem/gcsfs.h index 099cd40095c..961f3cc3b30 100644 --- a/cpp/src/arrow/filesystem/gcsfs.h +++ b/cpp/src/arrow/filesystem/gcsfs.h @@ -36,10 +36,10 @@ class GcsCredentials { public: bool Equals(const GcsCredentials& other) const; bool anonymous() const { return anonymous_; } - const std::string& access_token() { return access_token_; } + const std::string& access_token() const { return access_token_; } TimePoint expiration() const { return expiration_; } - const std::string& target_service_account() { return target_service_account_; } - const std::string& json_credentials() { return json_credentials_; } + const std::string& target_service_account() const { return target_service_account_; } + const std::string& json_credentials() const { return json_credentials_; } const std::shared_ptr& holder() const { return holder_; } private: diff --git a/cpp/src/arrow/filesystem/gcsfs_test.cc b/cpp/src/arrow/filesystem/gcsfs_test.cc index 9c4b4485cdc..3f1523bfbcf 100644 --- a/cpp/src/arrow/filesystem/gcsfs_test.cc +++ b/cpp/src/arrow/filesystem/gcsfs_test.cc 
@@ -293,7 +293,7 @@ TEST(GcsFileSystem, OptionsCompare) { TEST(GcsFileSystem, OptionsAnonymous) { GcsOptions a = GcsOptions::Anonymous(); EXPECT_THAT(a.credentials.holder(), NotNull()); - EXPECT_THAT(a.credentials.anonymous(), true); + EXPECT_TRUE(a.credentials.anonymous()); EXPECT_EQ(a.scheme, "http"); } @@ -575,6 +575,8 @@ TEST_F(GcsIntegrationTest, GetFileInfoBucket) { } TEST_F(GcsIntegrationTest, GetFileInfoObjectWithNestedStructure) { + // Adds detailed tests to handle cases of different edge cases + // with directory naming conventions (e.g. with and without slashes). auto fs = GcsFileSystem::Make(TestGcsOptions()); constexpr auto kObjectName = "test-object-dir/some_other_dir/another_dir/foo"; ASSERT_OK_AND_ASSIGN( @@ -584,6 +586,8 @@ TEST_F(GcsIntegrationTest, GetFileInfoObjectWithNestedStructure) { ASSERT_OK(output->Write(data.data(), data.size())); ASSERT_OK(output->Close()); + // 0 is immediately after "/" lexicographically, ensure that this doesn't + // cause unexpected issues. ASSERT_OK_AND_ASSIGN(output, fs->OpenOutputStream(PreexistingBucketPath() + "test-object-dir/some_other_dir0", /*metadata=*/{})); From bb383a50ccf5424c870618851896d4d14df96035 Mon Sep 17 00:00:00 2001 From: Micah Kornfield Date: Tue, 19 Apr 2022 03:24:58 +0000 Subject: [PATCH 07/85] pr comments pt3. 
--- dev/archery/archery/cli.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dev/archery/archery/cli.py b/dev/archery/archery/cli.py index 5f69d9ff91a..28f0494fd1f 100644 --- a/dev/archery/archery/cli.py +++ b/dev/archery/archery/cli.py @@ -162,6 +162,8 @@ def _apply_options(cmd, options): help="Build with Flight rpc support.") @click.option("--with-gandiva", default=None, type=BOOL, help="Build with Gandiva expression compiler support.") +@click.option("--with-gcs", default=None, type=BOOL, + help="Build Arrow with Google Cloud Storage (gcs) support.") @click.option("--with-hdfs", default=None, type=BOOL, help="Build the Arrow HDFS bridge.") @click.option("--with-hiveserver2", default=None, type=BOOL, @@ -183,8 +185,6 @@ def _apply_options(cmd, options): @click.option("--with-r", default=None, type=BOOL, help="Build the Arrow R extensions. This is not a CMake option, " "it will toggle required options") -@click.option("--with-gcs", default=None, type=BOOL, - help="Build Arrow with Google Cloud Storage (gcs) support.") @click.option("--with-s3", default=None, type=BOOL, help="Build Arrow with S3 support.") # Compressions From ce804bbe844f49099023342557ddca4c6f2967e7 Mon Sep 17 00:00:00 2001 From: emkornfield Date: Mon, 18 Apr 2022 20:15:30 -0700 Subject: [PATCH 08/85] Update cpp/src/arrow/filesystem/gcsfs.cc Co-authored-by: Antoine Pitrou --- cpp/src/arrow/filesystem/gcsfs.cc | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/filesystem/gcsfs.cc b/cpp/src/arrow/filesystem/gcsfs.cc index 88edec566e1..2664c3a16b7 100644 --- a/cpp/src/arrow/filesystem/gcsfs.cc +++ b/cpp/src/arrow/filesystem/gcsfs.cc @@ -749,9 +749,13 @@ Result GcsOptions::FromUri(const arrow::internal::Uri& uri, const std::string& username = uri.username(); bool anonymous = username == "anonymous"; - if (!uri.password().empty() || (!username.empty() && !anonymous)) { + if (!username.empty() && !anonymous) { return Status::Invalid( - "GCS 
does not accept username except \"anonymous\" or password."); + "GCS URIs do not accept username except \"anonymous\"."); + } + if (!uri.password().empty()) { + return Status::Invalid( + "GCS URIs do not accept password."); } auto options = GcsOptions::Defaults(); if (anonymous) { From 9c038a0a9fb2e2b4071e09e9c029b110d4cc367d Mon Sep 17 00:00:00 2001 From: emkornfield Date: Mon, 18 Apr 2022 20:25:08 -0700 Subject: [PATCH 09/85] Update dev/archery/archery/cli.py Co-authored-by: Antoine Pitrou --- dev/archery/archery/cli.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/dev/archery/archery/cli.py b/dev/archery/archery/cli.py index 28f0494fd1f..522b6cec73a 100644 --- a/dev/archery/archery/cli.py +++ b/dev/archery/archery/cli.py @@ -185,6 +185,8 @@ def _apply_options(cmd, options): @click.option("--with-r", default=None, type=BOOL, help="Build the Arrow R extensions. This is not a CMake option, " "it will toggle required options") +@click.option("--with-gcs", default=None, type=BOOL, + help="Build Arrow with Google Cloud Storage (GCS) support.") @click.option("--with-s3", default=None, type=BOOL, help="Build Arrow with S3 support.") # Compressions From 11556d8eaeaf4f44a2d130e8a2e05d255f4c8102 Mon Sep 17 00:00:00 2001 From: emkornfield Date: Mon, 18 Apr 2022 20:26:05 -0700 Subject: [PATCH 10/85] Update cpp/src/arrow/filesystem/gcsfs.cc Co-authored-by: Antoine Pitrou --- cpp/src/arrow/filesystem/gcsfs.cc | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/cpp/src/arrow/filesystem/gcsfs.cc b/cpp/src/arrow/filesystem/gcsfs.cc index 2664c3a16b7..9baab40333b 100644 --- a/cpp/src/arrow/filesystem/gcsfs.cc +++ b/cpp/src/arrow/filesystem/gcsfs.cc @@ -757,10 +757,7 @@ Result GcsOptions::FromUri(const arrow::internal::Uri& uri, return Status::Invalid( "GCS URIs do not accept password."); } - auto options = GcsOptions::Defaults(); - if (anonymous) { - options = GcsOptions::Anonymous(); - } + auto options = anonymous ? 
GcsOptions::Anonymous() : GcsOptions::Defaults(); for (const auto& kv : options_map) { if (kv.first == "location") { From 84d7ed2e282d8292a59db79c98f99d9843b75db6 Mon Sep 17 00:00:00 2001 From: emkornfield Date: Mon, 18 Apr 2022 20:29:34 -0700 Subject: [PATCH 11/85] Update python/pyarrow/_gcsfs.pyx Co-authored-by: Antoine Pitrou --- python/pyarrow/_gcsfs.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/pyarrow/_gcsfs.pyx b/python/pyarrow/_gcsfs.pyx index e267948b688..1307aba2132 100644 --- a/python/pyarrow/_gcsfs.pyx +++ b/python/pyarrow/_gcsfs.pyx @@ -41,7 +41,7 @@ cdef class GcsFileSystem(FileSystem): Note: GCS buckets are special and the operations available on them may be limited or more expensive than expected compared to local file systems. - Note: When pickling a GcsFileSystem that uses default credential resolution + Note: When pickling a GcsFileSystem that uses default credentials, resolution credentials are not stored in the serialized data. Therefore, when unpickling it is assumed that the necessary credentials are in place for the target process. From afa49ce2b55f8998ca3446d8d89f5d38147754ea Mon Sep 17 00:00:00 2001 From: emkornfield Date: Mon, 18 Apr 2022 20:36:24 -0700 Subject: [PATCH 12/85] Apply suggestions from code review Co-authored-by: Antoine Pitrou --- python/pyarrow/_gcsfs.pyx | 27 ++++++++++++--------------- 1 file changed, 12 insertions(+), 15 deletions(-) diff --git a/python/pyarrow/_gcsfs.pyx b/python/pyarrow/_gcsfs.pyx index 1307aba2132..d18e178b306 100644 --- a/python/pyarrow/_gcsfs.pyx +++ b/python/pyarrow/_gcsfs.pyx @@ -53,16 +53,16 @@ cdef class GcsFileSystem(FileSystem): If true, will not attempt to look up credentials using standard GCP configuration methods. access_token : str, default None - GCP access token. If provided temporary credentials will be fetched by - assuming this role. If specified an credential_token_expiration must be - specified with the token. + GCP access token. 
If provided, temporary credentials will be fetched by + assuming this role; also, a `credential_token_expiration` must be + specified as well. target_service_account : str, default None An optional service account to try to impersonate when accessing GCS. This - requires the specified credential user/service_account has the necessary + requires the specified credential user or service account to have the necessary permissions. credential_token_expiration : datetime, default None Expiration for credential generated with an access token. Must be specified - if token is specified. + if `access_token` is specified. default_bucket_location : str, default 'US-CENTRAL1' GCP region to create buckets in. scheme : str, default 'https' @@ -70,15 +70,15 @@ cdef class GcsFileSystem(FileSystem): endpoint_override : str, default None Override endpoint with a connect string such as "localhost:9000" default_metadata : mapping or pyarrow.KeyValueMetadata, default None - Default metadata for open_output_stream. This will be ignored if - non-empty metadata is passed to open_output_stream. + Default metadata for `open_output_stream`. This will be ignored if + non-empty metadata is passed to `open_output_stream`. """ cdef: CGcsFileSystem* gcsfs def __init__(self, *, bint anonymous=False, access_token=None, - target_service_account=None, credential_token_expiration=None, + target_service_account=None, datetime credential_token_expiration=None, default_bucket_location='US-CENTRAL1', scheme=None, endpoint_override=None, @@ -94,9 +94,7 @@ cdef class GcsFileSystem(FileSystem): 'anonymous option is not compatible with target_service_account and ' 'access_token please only specify only one.' 
) - elif ((access_token and credential_token_expiration is None) or - (not access_token and - credential_token_expiration is not None)): + elif bool(access_token) != bool(credential_token_expiration): raise ValueError( 'access_token and credential_token_expiration must be ' 'specified together' @@ -141,12 +139,11 @@ cdef class GcsFileSystem(FileSystem): return cls(**kwargs) def _expiration_datetime_from_options(self): - cdef CGcsOptions opts = self.gcsfs.options() - expiration_ns = TimePoint_to_ns(opts.credentials.expiration()) + expiration_ns = TimePoint_to_ns( + self.gcsfs.options().credentials.expiration()) if expiration_ns == 0: return None - ns_per_sec = 1000000000.0 - return datetime.fromtimestamp(expiration_ns / ns_per_sec) + return datetime.fromtimestamp(expiration_ns / 1e9) def __reduce__(self): cdef CGcsOptions opts = self.gcsfs.options() From 2b41c39444f3c3604a0c7e671993ff59d8a6a78b Mon Sep 17 00:00:00 2001 From: emkornfield Date: Mon, 18 Apr 2022 20:15:30 -0700 Subject: [PATCH 13/85] Update cpp/src/arrow/filesystem/gcsfs.cc Co-authored-by: Antoine Pitrou --- cpp/src/arrow/filesystem/gcsfs.cc | 3 +++ 1 file changed, 3 insertions(+) diff --git a/cpp/src/arrow/filesystem/gcsfs.cc b/cpp/src/arrow/filesystem/gcsfs.cc index 9baab40333b..3f8a0449f98 100644 --- a/cpp/src/arrow/filesystem/gcsfs.cc +++ b/cpp/src/arrow/filesystem/gcsfs.cc @@ -753,6 +753,9 @@ Result GcsOptions::FromUri(const arrow::internal::Uri& uri, return Status::Invalid( "GCS URIs do not accept username except \"anonymous\"."); } + if (!uri.password().empty()) { + return Status::Invalid("GCS URIs do not accept password."); + } if (!uri.password().empty()) { return Status::Invalid( "GCS URIs do not accept password."); From c679dae03e0f1fa041f72406d833da0d7e4aad7a Mon Sep 17 00:00:00 2001 From: Micah Kornfield Date: Tue, 19 Apr 2022 04:57:02 +0000 Subject: [PATCH 14/85] pr review comments #4 --- cpp/src/arrow/filesystem/gcsfs.cc | 25 ++++++++++++++++++++----- cpp/src/arrow/filesystem/gcsfs.h | 2 
++ dev/archery/archery/lang/cpp.py | 7 ++++--- python/pyarrow/_gcsfs.pyx | 7 +++++-- python/pyarrow/tests/conftest.py | 2 +- 5 files changed, 32 insertions(+), 11 deletions(-) diff --git a/cpp/src/arrow/filesystem/gcsfs.cc b/cpp/src/arrow/filesystem/gcsfs.cc index 3f8a0449f98..3651bce3b4f 100644 --- a/cpp/src/arrow/filesystem/gcsfs.cc +++ b/cpp/src/arrow/filesystem/gcsfs.cc @@ -21,6 +21,7 @@ #include #include "arrow/buffer.h" +#include "arrow/io/util_internal.h" #include "arrow/filesystem/gcsfs_internal.h" #include "arrow/filesystem/path_util.h" #include "arrow/filesystem/util_internal.h" @@ -34,6 +35,9 @@ namespace arrow { namespace fs { struct GcsCredentialsHolder { + // Constructor needed for make_shared + GcsCredentialsHolder(std::shared_ptr credentials) + : credentials(std::move(credentials)) {} std::shared_ptr credentials; }; @@ -168,9 +172,17 @@ class GcsInputStream : public arrow::io::InputStream { class GcsOutputStream : public arrow::io::OutputStream { public: explicit GcsOutputStream(gcs::ObjectWriteStream stream) : stream_(std::move(stream)) {} - ~GcsOutputStream() override = default; + ~GcsOutputStream() { + if (!closed_) { + // The common pattern is to close OutputStreams from destructor in arrow. + io::internal::CloseFromDestructor(this); + } + } Status Close() override { + if (closed_) { + return Status::Invalid("Already closed stream."); + } stream_.Close(); closed_ = true; return internal::ToArrowStatus(stream_.last_status()); @@ -352,6 +364,7 @@ class GcsFileSystem::Impl { internal::Depth(p.object) + select.max_recursion + !p.object.empty(); auto prefix = p.object.empty() ? gcs::Prefix() : gcs::Prefix(canonical); auto delimiter = select.recursive ? gcs::Delimiter() : gcs::Delimiter("/"); + // TODO(emkornfield): Add docs explaining or simplify. auto include_trailing = select.recursive ? 
gcs::IncludeTrailingDelimiter(false) : gcs::IncludeTrailingDelimiter(true); FileInfoVector result; @@ -367,6 +380,8 @@ class GcsFileSystem::Impl { // Skip the directory itself from the results, and any result that is "too deep" // into the recursion. bool has_trailing_slash = !o->name().empty() && o->name().back() == '/'; + // TODO(emkornfield): Add docs explaining or simplify o->name() comparison + // to both p.object and o.name() if (o->name() == canonical || o->name() == p.object || internal::Depth(o->name()) > (max_depth + has_trailing_slash)) { continue; @@ -640,6 +655,8 @@ class GcsFileSystem::Impl { return internal::ToArrowStatus(meta.status()); } + // TODO(emkornfield): Add more comments on normalize_directories + // or simplify logic. static FileInfo ToFileInfo(const std::string& full_path, const gcs::ObjectMetadata& meta, bool normalize_directories = false) { @@ -750,15 +767,13 @@ Result GcsOptions::FromUri(const arrow::internal::Uri& uri, const std::string& username = uri.username(); bool anonymous = username == "anonymous"; if (!username.empty() && !anonymous) { - return Status::Invalid( - "GCS URIs do not accept username except \"anonymous\"."); + return Status::Invalid("GCS URIs do not accept username except \"anonymous\"."); } if (!uri.password().empty()) { return Status::Invalid("GCS URIs do not accept password."); } if (!uri.password().empty()) { - return Status::Invalid( - "GCS URIs do not accept password."); + return Status::Invalid("GCS URIs do not accept password."); } auto options = anonymous ? GcsOptions::Anonymous() : GcsOptions::Defaults(); diff --git a/cpp/src/arrow/filesystem/gcsfs.h b/cpp/src/arrow/filesystem/gcsfs.h index 961f3cc3b30..b483d7651e2 100644 --- a/cpp/src/arrow/filesystem/gcsfs.h +++ b/cpp/src/arrow/filesystem/gcsfs.h @@ -56,6 +56,8 @@ class GcsCredentials { /// Options for the GcsFileSystem implementation. 
struct ARROW_EXPORT GcsOptions { + // TODO(emkornfield): make this a shared ptr or use + // another mechanism to make GcsOptions GcsConstructible. GcsCredentials credentials; std::string endpoint_override; diff --git a/dev/archery/archery/lang/cpp.py b/dev/archery/archery/lang/cpp.py index e188a1acf42..158cc48badd 100644 --- a/dev/archery/archery/lang/cpp.py +++ b/dev/archery/archery/lang/cpp.py @@ -52,11 +52,12 @@ def __init__(self, # Components with_compute=None, with_csv=None, with_cuda=None, with_dataset=None, with_filesystem=None, with_flight=None, - with_gandiva=None, with_hdfs=None, with_hiveserver2=None, + with_gandiva=None, with_gcs=None, with_hdfs=None, + with_hiveserver2=None, with_ipc=True, with_json=None, with_jni=None, with_mimalloc=None, with_jemalloc=None, with_parquet=None, with_plasma=None, with_python=True, - with_r=None, with_gcs=None, with_s3=None, + with_r=None, with_s3=None, # Compressions with_brotli=None, with_bz2=None, with_lz4=None, with_snappy=None, with_zlib=None, with_zstd=None, @@ -95,6 +96,7 @@ def __init__(self, self.with_filesystem = with_filesystem self.with_flight = with_flight self.with_gandiva = with_gandiva + self.with_gcs = with_gcs self.with_hdfs = with_hdfs self.with_hiveserver2 = with_hiveserver2 self.with_ipc = with_ipc @@ -106,7 +108,6 @@ def __init__(self, self.with_plasma = with_plasma self.with_python = with_python self.with_r = with_r - self.with_gcs = with_gcs self.with_s3 = with_s3 self.with_brotli = with_brotli diff --git a/python/pyarrow/_gcsfs.pyx b/python/pyarrow/_gcsfs.pyx index d18e178b306..20391cc6e8b 100644 --- a/python/pyarrow/_gcsfs.pyx +++ b/python/pyarrow/_gcsfs.pyx @@ -63,7 +63,7 @@ cdef class GcsFileSystem(FileSystem): credential_token_expiration : datetime, default None Expiration for credential generated with an access token. Must be specified if `access_token` is specified. 
- default_bucket_location : str, default 'US-CENTRAL1' + default_bucket_location : str, default 'US' GCP region to create buckets in. scheme : str, default 'https' GCS connection transport scheme. @@ -79,7 +79,7 @@ cdef class GcsFileSystem(FileSystem): def __init__(self, *, bint anonymous=False, access_token=None, target_service_account=None, datetime credential_token_expiration=None, - default_bucket_location='US-CENTRAL1', + default_bucket_location='US', scheme=None, endpoint_override=None, default_metadata=None): @@ -109,6 +109,9 @@ cdef class GcsFileSystem(FileSystem): else: options = CGcsOptions.Defaults() + # Target service account requires base credentials so + # it is not part of the if/else chain above which only + # handles base credentials. if target_service_account: options = CGcsOptions.FromImpersonatedServiceAccount( options.credentials, tobytes(target_service_account)) diff --git a/python/pyarrow/tests/conftest.py b/python/pyarrow/tests/conftest.py index 7771f66aa3e..4f7f0a7bc6e 100644 --- a/python/pyarrow/tests/conftest.py +++ b/python/pyarrow/tests/conftest.py @@ -185,7 +185,7 @@ def gcs_server(): try: proc = subprocess.Popen(args, env=env) except OSError: - pytest.skip('`gcs test bench` command cannot be located') + pytest.skip('`gcs testbench` raised an error when executing') else: yield { 'connection': ('localhost', port), From 47e8e52fb7229162bd345ebd74e27deb306ff0f3 Mon Sep 17 00:00:00 2001 From: emkornfield Date: Mon, 18 Apr 2022 21:26:33 -0700 Subject: [PATCH 15/85] Update python/pyarrow/_gcsfs.pyx Co-authored-by: Antoine Pitrou --- python/pyarrow/_gcsfs.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/pyarrow/_gcsfs.pyx b/python/pyarrow/_gcsfs.pyx index 20391cc6e8b..cddf31f1d17 100644 --- a/python/pyarrow/_gcsfs.pyx +++ b/python/pyarrow/_gcsfs.pyx @@ -92,7 +92,7 @@ cdef class GcsFileSystem(FileSystem): if anonymous and (target_service_account or access_token): raise ValueError( 'anonymous option is not compatible 
with target_service_account and ' - 'access_token please only specify only one.' + 'access_token' ) elif bool(access_token) != bool(credential_token_expiration): raise ValueError( From 272828ed89a7ca100434bb018b0bc22de357f506 Mon Sep 17 00:00:00 2001 From: emkornfield Date: Mon, 18 Apr 2022 21:43:25 -0700 Subject: [PATCH 16/85] Update python/pyarrow/_gcsfs.pyx Co-authored-by: Antoine Pitrou --- python/pyarrow/_gcsfs.pyx | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/python/pyarrow/_gcsfs.pyx b/python/pyarrow/_gcsfs.pyx index cddf31f1d17..37e44ac1e59 100644 --- a/python/pyarrow/_gcsfs.pyx +++ b/python/pyarrow/_gcsfs.pyx @@ -123,10 +123,8 @@ cdef class GcsFileSystem(FileSystem): if endpoint_override is not None: options.endpoint_override = tobytes(endpoint_override) if default_metadata is not None: - if not isinstance(default_metadata, KeyValueMetadata): - default_metadata = KeyValueMetadata(default_metadata) options.default_metadata = pyarrow_unwrap_metadata( - default_metadata) + ensure_metadata(default_metadata)) with nogil: wrapped = GetResultValue(CGcsFileSystem.Make(options)) From fb75754ce1225bdec2324fce0061f80b04ca30f6 Mon Sep 17 00:00:00 2001 From: emkornfield Date: Mon, 18 Apr 2022 21:43:25 -0700 Subject: [PATCH 17/85] ninja format --- cpp/src/arrow/filesystem/gcsfs.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/filesystem/gcsfs.cc b/cpp/src/arrow/filesystem/gcsfs.cc index 3651bce3b4f..28f5c301621 100644 --- a/cpp/src/arrow/filesystem/gcsfs.cc +++ b/cpp/src/arrow/filesystem/gcsfs.cc @@ -21,10 +21,10 @@ #include #include "arrow/buffer.h" -#include "arrow/io/util_internal.h" #include "arrow/filesystem/gcsfs_internal.h" #include "arrow/filesystem/path_util.h" #include "arrow/filesystem/util_internal.h" +#include "arrow/io/util_internal.h" #include "arrow/result.h" #include "arrow/util/checked_cast.h" #include "arrow/util/thread_pool.h" @@ -36,7 +36,7 @@ namespace arrow { namespace fs { struct 
GcsCredentialsHolder { // Constructor needed for make_shared - GcsCredentialsHolder(std::shared_ptr credentials) + GcsCredentialsHolder(std::shared_ptr credentials) : credentials(std::move(credentials)) {} std::shared_ptr credentials; }; From 4ce2fd1416af50e3eb772bcbae969e8a32c5f50f Mon Sep 17 00:00:00 2001 From: Micah Kornfield Date: Tue, 19 Apr 2022 05:43:43 +0000 Subject: [PATCH 18/85] more pr fixes --- python/pyarrow/_gcsfs.pyx | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/python/pyarrow/_gcsfs.pyx b/python/pyarrow/_gcsfs.pyx index 37e44ac1e59..21f8741af73 100644 --- a/python/pyarrow/_gcsfs.pyx +++ b/python/pyarrow/_gcsfs.pyx @@ -19,11 +19,11 @@ from pyarrow.lib cimport (check_status, pyarrow_wrap_metadata, pyarrow_unwrap_metadata) -from pyarrow.lib import frombytes, tobytes, KeyValueMetadata +from pyarrow.lib import frombytes, tobytes, KeyValueMetadata, ensure_metadata from pyarrow.includes.common cimport * from pyarrow.includes.libarrow cimport * from pyarrow.includes.libarrow_fs cimport * -from pyarrow._fs cimport FileSystem +from pyarrow._fs cimport FileSystem, TimePoint_to_ns, PyDateTime_to_TimePoint from cython.operator cimport dereference as deref from datetime import datetime @@ -78,7 +78,7 @@ cdef class GcsFileSystem(FileSystem): CGcsFileSystem* gcsfs def __init__(self, *, bint anonymous=False, access_token=None, - target_service_account=None, datetime credential_token_expiration=None, + target_service_account=None, credential_token_expiration=None, default_bucket_location='US', scheme=None, endpoint_override=None, @@ -103,6 +103,8 @@ cdef class GcsFileSystem(FileSystem): elif anonymous: options = CGcsOptions.Anonymous() elif access_token: + if not isinstance(credential_token_expiration, datetime): + raise ValueError("credential_token_expiration must be a datetime") options = CGcsOptions.FromAccessToken( tobytes(access_token), PyDateTime_to_TimePoint(credential_token_expiration)) From 7b69c418398d59284e65e2b1c50148835e0805bb 
Mon Sep 17 00:00:00 2001 From: Micah Kornfield Date: Tue, 19 Apr 2022 06:01:33 +0000 Subject: [PATCH 19/85] make ensure_metadata cpdef --- python/pyarrow/types.pxi | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/pyarrow/types.pxi b/python/pyarrow/types.pxi index 0a54b401b1f..de9677f3d40 100644 --- a/python/pyarrow/types.pxi +++ b/python/pyarrow/types.pxi @@ -1145,7 +1145,7 @@ cdef class KeyValueMetadata(_Metadata, Mapping): return result -cdef KeyValueMetadata ensure_metadata(object meta, c_bool allow_none=False): +cpdef KeyValueMetadata ensure_metadata(object meta, c_bool allow_none=False): if allow_none and meta is None: return None elif isinstance(meta, KeyValueMetadata): From 55e784e4537801325ea762abced181042d4d80c2 Mon Sep 17 00:00:00 2001 From: Micah Kornfield Date: Wed, 20 Apr 2022 01:56:48 +0000 Subject: [PATCH 20/85] remove build_absl_once --- cpp/cmake_modules/ThirdpartyToolchain.cmake | 1 - 1 file changed, 1 deletion(-) diff --git a/cpp/cmake_modules/ThirdpartyToolchain.cmake b/cpp/cmake_modules/ThirdpartyToolchain.cmake index 08c056532e2..4082b7d2ce6 100644 --- a/cpp/cmake_modules/ThirdpartyToolchain.cmake +++ b/cpp/cmake_modules/ThirdpartyToolchain.cmake @@ -3827,7 +3827,6 @@ if(ARROW_WITH_GRPC) message(STATUS "Forcing gRPC_SOURCE to Protobuf_SOURCE (${Protobuf_SOURCE})") set(gRPC_SOURCE "${Protobuf_SOURCE}") endif() - build_absl_once() resolve_dependency(gRPC HAVE_ALT TRUE From 89f061cac426077d37778e32297d7fa1a1124c77 Mon Sep 17 00:00:00 2001 From: Micah Kornfield Date: Wed, 20 Apr 2022 02:25:21 +0000 Subject: [PATCH 21/85] fix lint --- cpp/src/arrow/filesystem/gcsfs.cc | 2 +- python/pyarrow/_gcsfs.pyx | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/filesystem/gcsfs.cc b/cpp/src/arrow/filesystem/gcsfs.cc index 28f5c301621..b81597cec8d 100644 --- a/cpp/src/arrow/filesystem/gcsfs.cc +++ b/cpp/src/arrow/filesystem/gcsfs.cc @@ -36,7 +36,7 @@ namespace arrow { namespace fs { struct 
GcsCredentialsHolder { // Constructor needed for make_shared - GcsCredentialsHolder(std::shared_ptr credentials) + explicit GcsCredentialsHolder(std::shared_ptr credentials) : credentials(std::move(credentials)) {} std::shared_ptr credentials; }; diff --git a/python/pyarrow/_gcsfs.pyx b/python/pyarrow/_gcsfs.pyx index 21f8741af73..3093befe263 100644 --- a/python/pyarrow/_gcsfs.pyx +++ b/python/pyarrow/_gcsfs.pyx @@ -104,7 +104,8 @@ cdef class GcsFileSystem(FileSystem): options = CGcsOptions.Anonymous() elif access_token: if not isinstance(credential_token_expiration, datetime): - raise ValueError("credential_token_expiration must be a datetime") + raise ValueError( + "credential_token_expiration must be a datetime") options = CGcsOptions.FromAccessToken( tobytes(access_token), PyDateTime_to_TimePoint(credential_token_expiration)) From 359f8a85c8ca96dc97e022cd57fd28669a5247d2 Mon Sep 17 00:00:00 2001 From: Micah Kornfield Date: Wed, 20 Apr 2022 02:37:00 +0000 Subject: [PATCH 22/85] cast expiration to system clock (mac) --- cpp/src/arrow/filesystem/gcsfs.cc | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/filesystem/gcsfs.cc b/cpp/src/arrow/filesystem/gcsfs.cc index b81597cec8d..8a2b802aa44 100644 --- a/cpp/src/arrow/filesystem/gcsfs.cc +++ b/cpp/src/arrow/filesystem/gcsfs.cc @@ -19,6 +19,7 @@ #include #include +#include #include "arrow/buffer.h" #include "arrow/filesystem/gcsfs_internal.h" @@ -707,8 +708,11 @@ GcsOptions GcsOptions::Anonymous() { GcsOptions GcsOptions::FromAccessToken(const std::string& access_token, TimePoint expiration) { GcsOptions options{}; - options.credentials.holder_ = std::make_shared( - google::cloud::MakeAccessTokenCredentials(access_token, expiration)); + options.credentials.holder_ = + std::make_shared(google::cloud::MakeAccessTokenCredentials( + access_token, + std::chrono::time_point_cast( + expiration))); options.credentials.access_token_ = access_token; options.credentials.expiration_ = 
expiration; options.scheme = "https"; From 25d2305b187cb82ea4bf802ab05779e73e80fcd1 Mon Sep 17 00:00:00 2001 From: Micah Kornfield Date: Wed, 20 Apr 2022 02:38:52 +0000 Subject: [PATCH 23/85] try to remove separate testbench install --- ci/scripts/python_test.sh | 5 ----- 1 file changed, 5 deletions(-) diff --git a/ci/scripts/python_test.sh b/ci/scripts/python_test.sh index 4d8f1b73c3f..4e2990b84d6 100755 --- a/ci/scripts/python_test.sh +++ b/ci/scripts/python_test.sh @@ -54,9 +54,4 @@ export PYARROW_TEST_ORC export PYARROW_TEST_PARQUET export PYARROW_TEST_S3 -# DO NOT SUBMIT -# Without this, the previous install of testbench doesn't seem to be put in a -# place that the python env can find it. Need githash because of recent -# dependency changes. -python -m pip install "https://github.com/googleapis/storage-testbench/archive/c80ed7604824428a7b8269504ec948a62b445f5d.tar.gz" pytest -r s -v ${PYTEST_ARGS} --pyargs pyarrow From 8ba98e51953c42a7d7b1662c8e8b40560fffc8b9 Mon Sep 17 00:00:00 2001 From: Micah Kornfield Date: Wed, 20 Apr 2022 03:20:29 +0000 Subject: [PATCH 24/85] some simplication --- cpp/src/arrow/filesystem/gcsfs.cc | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/cpp/src/arrow/filesystem/gcsfs.cc b/cpp/src/arrow/filesystem/gcsfs.cc index 8a2b802aa44..761156975e7 100644 --- a/cpp/src/arrow/filesystem/gcsfs.cc +++ b/cpp/src/arrow/filesystem/gcsfs.cc @@ -381,9 +381,7 @@ class GcsFileSystem::Impl { // Skip the directory itself from the results, and any result that is "too deep" // into the recursion. 
bool has_trailing_slash = !o->name().empty() && o->name().back() == '/'; - // TODO(emkornfield): Add docs explaining or simplify o->name() comparison - // to both p.object and o.name() - if (o->name() == canonical || o->name() == p.object || + if (o->name() == canonical || internal::Depth(o->name()) > (max_depth + has_trailing_slash)) { continue; } @@ -656,8 +654,13 @@ class GcsFileSystem::Impl { return internal::ToArrowStatus(meta.status()); } - // TODO(emkornfield): Add more comments on normalize_directories - // or simplify logic. + // The normalize_directories parameter is needed because of + // how a directory is listed. If a specific path is asked + // for with a trailing slash it is expected to have a trailing + // slash [1] but for recursive listings it is expected that + // directories have their path normalized [2]. + // [1] https://github.com/apache/arrow/blob/3eaa7dd0e8b3dabc5438203331f05e3e6c011e37/python/pyarrow/tests/test_fs.py#L688 + // [2] https://github.com/apache/arrow/blob/3eaa7dd0e8b3dabc5438203331f05e3e6c011e37/cpp/src/arrow/filesystem/test_util.cc#L767 static FileInfo ToFileInfo(const std::string& full_path, const gcs::ObjectMetadata& meta, bool normalize_directories = false) { From 842a29a93a12016442f49ce61dd4d2cc72b8f0c7 Mon Sep 17 00:00:00 2001 From: Micah Kornfield Date: Wed, 20 Apr 2022 03:27:35 +0000 Subject: [PATCH 25/85] simplify further --- cpp/src/arrow/filesystem/gcsfs.cc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cpp/src/arrow/filesystem/gcsfs.cc b/cpp/src/arrow/filesystem/gcsfs.cc index 761156975e7..4be496c8c1c 100644 --- a/cpp/src/arrow/filesystem/gcsfs.cc +++ b/cpp/src/arrow/filesystem/gcsfs.cc @@ -365,9 +365,9 @@ class GcsFileSystem::Impl { internal::Depth(p.object) + select.max_recursion + !p.object.empty(); auto prefix = p.object.empty() ? gcs::Prefix() : gcs::Prefix(canonical); auto delimiter = select.recursive ?
gcs::Delimiter() : gcs::Delimiter("/"); - // TODO(emkornfield): Add docs explaining or simplify. - auto include_trailing = select.recursive ? gcs::IncludeTrailingDelimiter(false) - : gcs::IncludeTrailingDelimiter(true); + // Include trailing delimiters ensures that files matching "directory" + // conventions are also included in the listing. + auto include_trailing = gcs::IncludeTrailingDelimiter(true); FileInfoVector result; for (auto const& o : client_.ListObjects(p.bucket, prefix, delimiter, include_trailing)) { From 878d991832ddc7a3ab5ed80b26ee0584823ab8f2 Mon Sep 17 00:00:00 2001 From: Micah Kornfield Date: Wed, 20 Apr 2022 04:08:46 +0000 Subject: [PATCH 26/85] Avoid counting trailing slash in Depth --- cpp/src/arrow/filesystem/gcsfs.cc | 12 ++++++------ cpp/src/arrow/filesystem/gcsfs_internal.cc | 5 ++++- cpp/src/arrow/filesystem/gcsfs_test.cc | 2 +- 3 files changed, 11 insertions(+), 8 deletions(-) diff --git a/cpp/src/arrow/filesystem/gcsfs.cc b/cpp/src/arrow/filesystem/gcsfs.cc index 4be496c8c1c..51eb66e077a 100644 --- a/cpp/src/arrow/filesystem/gcsfs.cc +++ b/cpp/src/arrow/filesystem/gcsfs.cc @@ -362,7 +362,7 @@ class GcsFileSystem::Impl { // Need to add one level when the object is not empty because all // directories have an extra slash. const auto max_depth = - internal::Depth(p.object) + select.max_recursion + !p.object.empty(); + internal::Depth(canonical) + select.max_recursion + !p.object.empty(); auto prefix = p.object.empty() ? gcs::Prefix() : gcs::Prefix(canonical); auto delimiter = select.recursive ? gcs::Delimiter() : gcs::Delimiter("/"); // Include trailing delimiters ensures that files matching "directory" @@ -380,9 +380,7 @@ class GcsFileSystem::Impl { } // Skip the directory itself from the results, and any result that is "too deep" // into the recursion. 
- bool has_trailing_slash = !o->name().empty() && o->name().back() == '/'; - if (o->name() == canonical || - internal::Depth(o->name()) > (max_depth + has_trailing_slash)) { + if (o->name() == canonical || internal::Depth(o->name()) > max_depth) { continue; } auto path = internal::ConcatAbstractPath(o->bucket(), o->name()); @@ -659,8 +657,10 @@ class GcsFileSystem::Impl { // for with a trailing slash it is expected to have a trailing // slash [1] but for recursive listings it is expected that // directories have their path normalized [2]. - // [1] https://github.com/apache/arrow/blob/3eaa7dd0e8b3dabc5438203331f05e3e6c011e37/python/pyarrow/tests/test_fs.py#L688 - // [2] https://github.com/apache/arrow/blob/3eaa7dd0e8b3dabc5438203331f05e3e6c011e37/cpp/src/arrow/filesystem/test_util.cc#L767 + // [1] + // https://github.com/apache/arrow/blob/3eaa7dd0e8b3dabc5438203331f05e3e6c011e37/python/pyarrow/tests/test_fs.py#L688 + // [2] + // https://github.com/apache/arrow/blob/3eaa7dd0e8b3dabc5438203331f05e3e6c011e37/cpp/src/arrow/filesystem/test_util.cc#L767 static FileInfo ToFileInfo(const std::string& full_path, const gcs::ObjectMetadata& meta, bool normalize_directories = false) { diff --git a/cpp/src/arrow/filesystem/gcsfs_internal.cc b/cpp/src/arrow/filesystem/gcsfs_internal.cc index a75b51430f7..b8f0ab80b21 100644 --- a/cpp/src/arrow/filesystem/gcsfs_internal.cc +++ b/cpp/src/arrow/filesystem/gcsfs_internal.cc @@ -296,7 +296,10 @@ Result> FromObjectMetadata( } std::int64_t Depth(arrow::util::string_view path) { - return std::count(path.begin(), path.end(), fs::internal::kSep); + // The last slash is not counted towards depth because it represents a + // directory. 
+ bool has_trailing_slash = !path.empty() && path.back() == '/'; + return std::count(path.begin(), path.end(), fs::internal::kSep) - has_trailing_slash; } } // namespace internal diff --git a/cpp/src/arrow/filesystem/gcsfs_test.cc b/cpp/src/arrow/filesystem/gcsfs_test.cc index 3f1523bfbcf..4718a384e47 100644 --- a/cpp/src/arrow/filesystem/gcsfs_test.cc +++ b/cpp/src/arrow/filesystem/gcsfs_test.cc @@ -682,7 +682,7 @@ TEST_F(GcsIntegrationTest, GetFileInfoSelectorLimitedRecursion) { SCOPED_TRACE("Testing with max_recursion=" + std::to_string(max_recursion)); const auto max_depth = internal::Depth(internal::EnsureTrailingSlash(hierarchy.base_dir)) + - max_recursion; + max_recursion + 1; // Add 1 because files in a directory should be included std::vector expected; std::copy_if(hierarchy.contents.begin(), hierarchy.contents.end(), std::back_inserter(expected), [&](const arrow::fs::FileInfo& info) { From 18af5084fa0915d683b626014b6d093218a9c397 Mon Sep 17 00:00:00 2001 From: Micah Kornfield Date: Wed, 20 Apr 2022 04:09:02 +0000 Subject: [PATCH 27/85] Revert "try to remove separate testbench install" This reverts commit 977966033b75da4ca0a39a1628943508ff47f63b. --- ci/scripts/python_test.sh | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/ci/scripts/python_test.sh b/ci/scripts/python_test.sh index 4e2990b84d6..4d8f1b73c3f 100755 --- a/ci/scripts/python_test.sh +++ b/ci/scripts/python_test.sh @@ -54,4 +54,9 @@ export PYARROW_TEST_ORC export PYARROW_TEST_PARQUET export PYARROW_TEST_S3 +# DO NOT SUBMIT +# Without this, the previous install of testbench doesn't seem to be put in a +# place that the python env can find it. Need githash because of recent +# dependency changes. 
+python -m pip install "https://github.com/googleapis/storage-testbench/archive/c80ed7604824428a7b8269504ec948a62b445f5d.tar.gz" pytest -r s -v ${PYTEST_ARGS} --pyargs pyarrow From acc2a5745cd2a49037044a9ba6fdc8b9f0303bfe Mon Sep 17 00:00:00 2001 From: Micah Kornfield Date: Wed, 20 Apr 2022 04:11:17 +0000 Subject: [PATCH 28/85] use latest version and remove DO NOT SUBMIT --- ci/scripts/python_test.sh | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/ci/scripts/python_test.sh b/ci/scripts/python_test.sh index 4d8f1b73c3f..25c4ec3126c 100755 --- a/ci/scripts/python_test.sh +++ b/ci/scripts/python_test.sh @@ -54,9 +54,7 @@ export PYARROW_TEST_ORC export PYARROW_TEST_PARQUET export PYARROW_TEST_S3 -# DO NOT SUBMIT -# Without this, the previous install of testbench doesn't seem to be put in a -# place that the python env can find it. Need githash because of recent -# dependency changes. -python -m pip install "https://github.com/googleapis/storage-testbench/archive/c80ed7604824428a7b8269504ec948a62b445f5d.tar.gz" +# Without this, install_gcs_test_bench.sh doesn't seem to be put in a +# place that the python env can find it. 
+python -m pip install "https://github.com/googleapis/storage-testbench/archive/v0.16.0.tar.gz" pytest -r s -v ${PYTEST_ARGS} --pyargs pyarrow From 93be2be9cf019d3f3ed12ad3b75bbc3c57418f4f Mon Sep 17 00:00:00 2001 From: emkornfield Date: Wed, 20 Apr 2022 15:18:50 -0700 Subject: [PATCH 29/85] Update python/pyarrow/tests/conftest.py Co-authored-by: Antoine Pitrou --- python/pyarrow/tests/conftest.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/pyarrow/tests/conftest.py b/python/pyarrow/tests/conftest.py index 4f7f0a7bc6e..a06ac92095b 100644 --- a/python/pyarrow/tests/conftest.py +++ b/python/pyarrow/tests/conftest.py @@ -184,8 +184,8 @@ def gcs_server(): proc = None try: proc = subprocess.Popen(args, env=env) - except OSError: - pytest.skip('`gcs testbench` raised an error when executing') + except OSError as e: + pytest.skip(f"Command {args} failed to execute: {e}") else: yield { 'connection': ('localhost', port), From 3f6954a185e3359e490517fc0554c9a5879a9235 Mon Sep 17 00:00:00 2001 From: Micah Kornfield Date: Wed, 20 Apr 2022 22:04:05 +0000 Subject: [PATCH 30/85] comment out GCS ON instead of set it to off --- .github/workflows/cpp.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/cpp.yml b/.github/workflows/cpp.yml index 075b27e629e..82e99160c3d 100644 --- a/.github/workflows/cpp.yml +++ b/.github/workflows/cpp.yml @@ -277,7 +277,7 @@ jobs: ARROW_GANDIVA: ON # google-could-cpp uses _dupenv_s() but it can't be used with msvcrt. # We need to use ucrt to use _dupenv_s(). 
- ARROW_GCS: OFF + # ARROW_GCS: ON ARROW_HDFS: OFF ARROW_HOME: /mingw${{ matrix.mingw-n-bits }} ARROW_JEMALLOC: OFF From 92ccc6f130b81fe6960c079ed123b61b4ce32665 Mon Sep 17 00:00:00 2001 From: Micah Kornfield Date: Wed, 20 Apr 2022 22:05:57 +0000 Subject: [PATCH 31/85] add gs/gcs to filesystemfromuri docs --- cpp/src/arrow/filesystem/filesystem.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/filesystem/filesystem.h b/cpp/src/arrow/filesystem/filesystem.h index dfa2b740087..6dc18d7de84 100644 --- a/cpp/src/arrow/filesystem/filesystem.h +++ b/cpp/src/arrow/filesystem/filesystem.h @@ -452,7 +452,8 @@ class ARROW_EXPORT SlowFileSystem : public FileSystem { /// \brief Create a new FileSystem by URI /// -/// Recognized schemes are "file", "mock", "hdfs" and "s3fs". +/// Recognized schemes are "file", "mock", "hdfs", "viewfs", "s3", +/// "gs" and "gcs". /// /// \param[in] uri a URI-based path, ex: file:///some/local/path /// \param[out] out_path (optional) Path inside the filesystem. @@ -463,7 +464,8 @@ Result> FileSystemFromUri(const std::string& uri, /// \brief Create a new FileSystem by URI with a custom IO context /// -/// Recognized schemes are "file", "mock", "hdfs" and "s3fs". +/// Recognized schemes are "file", "mock", "hdfs", "viewfs", "s3", +/// "gs" and "gcs". 
/// /// \param[in] uri a URI-based path, ex: file:///some/local/path /// \param[in] io_context an IOContext which will be associated with the filesystem From 1b512f3092d2be1d50bf16512a4563a1ffb09b37 Mon Sep 17 00:00:00 2001 From: Micah Kornfield Date: Wed, 20 Apr 2022 22:07:10 +0000 Subject: [PATCH 32/85] remove duplicate code --- cpp/src/arrow/filesystem/gcsfs.cc | 3 --- 1 file changed, 3 deletions(-) diff --git a/cpp/src/arrow/filesystem/gcsfs.cc b/cpp/src/arrow/filesystem/gcsfs.cc index 51eb66e077a..c65815aebcf 100644 --- a/cpp/src/arrow/filesystem/gcsfs.cc +++ b/cpp/src/arrow/filesystem/gcsfs.cc @@ -779,9 +779,6 @@ Result GcsOptions::FromUri(const arrow::internal::Uri& uri, if (!uri.password().empty()) { return Status::Invalid("GCS URIs do not accept password."); } - if (!uri.password().empty()) { - return Status::Invalid("GCS URIs do not accept password."); - } auto options = anonymous ? GcsOptions::Anonymous() : GcsOptions::Defaults(); for (const auto& kv : options_map) { From e9f713078efb205821bf6fe076ef72804d24a233 Mon Sep 17 00:00:00 2001 From: Micah Kornfield Date: Wed, 20 Apr 2022 22:13:53 +0000 Subject: [PATCH 33/85] remove duplicate PYARROW_WITH_GCS --- ci/scripts/python_build.sh | 1 - 1 file changed, 1 deletion(-) diff --git a/ci/scripts/python_build.sh b/ci/scripts/python_build.sh index b715374b925..cfac68bd6ec 100755 --- a/ci/scripts/python_build.sh +++ b/ci/scripts/python_build.sh @@ -64,7 +64,6 @@ export PYARROW_WITH_ORC=${ARROW_ORC:-OFF} export PYARROW_WITH_PLASMA=${ARROW_PLASMA:-OFF} export PYARROW_WITH_PARQUET=${ARROW_PARQUET:-OFF} export PYARROW_WITH_PARQUET_ENCRYPTION=${PARQUET_REQUIRE_ENCRYPTION:-ON} -export PYARROW_WITH_GCS=${ARROW_GCS:-OFF} export PYARROW_WITH_S3=${ARROW_S3:-OFF} export PYARROW_WITH_SUBSTRAIT=${ARROW_SUBSTRAIT:-OFF} From 8ed3b299bf9d03793847f6d5a34b52302fec4144 Mon Sep 17 00:00:00 2001 From: Micah Kornfield Date: Wed, 20 Apr 2022 22:17:05 +0000 Subject: [PATCH 34/85] remove duplicate cli.py --- dev/archery/archery/cli.py | 
4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/dev/archery/archery/cli.py b/dev/archery/archery/cli.py index 522b6cec73a..8b4c38b42fa 100644 --- a/dev/archery/archery/cli.py +++ b/dev/archery/archery/cli.py @@ -163,7 +163,7 @@ def _apply_options(cmd, options): @click.option("--with-gandiva", default=None, type=BOOL, help="Build with Gandiva expression compiler support.") @click.option("--with-gcs", default=None, type=BOOL, - help="Build Arrow with Google Cloud Storage (gcs) support.") + help="Build Arrow with Google Cloud Storage (GCS) support.") @click.option("--with-hdfs", default=None, type=BOOL, help="Build the Arrow HDFS bridge.") @click.option("--with-hiveserver2", default=None, type=BOOL, @@ -185,8 +185,6 @@ def _apply_options(cmd, options): @click.option("--with-r", default=None, type=BOOL, help="Build the Arrow R extensions. This is not a CMake option, " "it will toggle required options") -@click.option("--with-gcs", default=None, type=BOOL, - help="Build Arrow with Google Cloud Storage (GCS) support.") @click.option("--with-s3", default=None, type=BOOL, help="Build Arrow with S3 support.") # Compressions From 25741f282c12d47ae9977997aa3ddf7c515a055a Mon Sep 17 00:00:00 2001 From: Micah Kornfield Date: Wed, 20 Apr 2022 22:21:16 +0000 Subject: [PATCH 35/85] add smoke test for FileSystemFromUri --- cpp/src/arrow/filesystem/gcsfs_test.cc | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/cpp/src/arrow/filesystem/gcsfs_test.cc b/cpp/src/arrow/filesystem/gcsfs_test.cc index 4718a384e47..05c3bbab978 100644 --- a/cpp/src/arrow/filesystem/gcsfs_test.cc +++ b/cpp/src/arrow/filesystem/gcsfs_test.cc @@ -1306,6 +1306,14 @@ TEST_F(GcsIntegrationTest, OpenInputFileClosed) { ASSERT_RAISES(Invalid, stream->Seek(2)); } +TEST_F(GcsIntegrationTest, TestFileSystemFromUri) { + // Smoke test for FileSystemFromUri + ASSERT_OK_AND_ASSIGN(auto fs, FileSystemFromUri(std::string("gs://anonymous@") + + PreexistingBucketPath())); + ASSERT_OK_AND_ASSIGN(auto 
fs2, FileSystemFromUri(std::string("gcs://anonymous@") + + PreexistingBucketPath())); +} + } // namespace } // namespace fs } // namespace arrow From aa4df5f7a43531e9d2bbc83ae7addb462c70fbff Mon Sep 17 00:00:00 2001 From: Micah Kornfield Date: Wed, 20 Apr 2022 22:38:45 +0000 Subject: [PATCH 36/85] separate gcs and s3 variables in tasks.yml --- dev/tasks/tasks.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/dev/tasks/tasks.yml b/dev/tasks/tasks.yml index 8ae8ec0b6da..f79b7cb7c6d 100644 --- a/dev/tasks/tasks.yml +++ b/dev/tasks/tasks.yml @@ -469,8 +469,8 @@ tasks: {############################## Wheel OSX ####################################} # enable S3 support from macOS 10.13 so we don't need to bundle curl, crypt and ssl -{% for macos_version, macos_codename, arrow_s3_gcs in [("10.9", "mavericks", "OFF"), - ("10.13", "high-sierra", "ON")] %} +{% for macos_version, macos_codename, arrow_s3, arrow_gcs in [("10.9", "mavericks", "OFF", "OFF"), + ("10.13", "high-sierra", "ON", "OFF")] %} {% set platform_tag = "macosx_{}_x86_64".format(macos_version.replace('.', '_')) %} wheel-macos-{{ macos_codename }}-{{ python_tag }}-amd64: @@ -479,8 +479,8 @@ tasks: params: python_version: "{{ python_version }}" macos_deployment_target: "{{ macos_version }}" - arrow_s3: "{{ arrow_s3_gcs }}" - arrow_gcs: "{{ arrow_s3_gcs }}" + arrow_s3: "{{ arrow_s3 }}" + arrow_gcs: "{{ arrow_gcs }}" artifacts: - pyarrow-{no_rc_version}-{{ python_tag }}-{{ abi_tag }}-{{ platform_tag }}.whl From ad86713ee0896a010532c8d31f82eb47ed4492cb Mon Sep 17 00:00:00 2001 From: Micah Kornfield Date: Wed, 20 Apr 2022 22:57:54 +0000 Subject: [PATCH 37/85] make GcsOptions default constructible --- cpp/src/arrow/filesystem/gcsfs.cc | 11 +++++++---- cpp/src/arrow/filesystem/gcsfs.h | 4 ++-- 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/cpp/src/arrow/filesystem/gcsfs.cc b/cpp/src/arrow/filesystem/gcsfs.cc index c65815aebcf..0717996101b 100644 --- 
a/cpp/src/arrow/filesystem/gcsfs.cc +++ b/cpp/src/arrow/filesystem/gcsfs.cc @@ -685,6 +685,12 @@ class GcsFileSystem::Impl { gcs::Client client_; }; +GcsOptions::GcsOptions() { + this->credentials.holder_ = std::make_shared( + google::cloud::MakeGoogleDefaultCredentials()); + this->scheme = "https"; +} + bool GcsOptions::Equals(const GcsOptions& other) const { return credentials.Equals(other.credentials) && endpoint_override == other.endpoint_override && scheme == other.scheme && @@ -692,10 +698,7 @@ bool GcsOptions::Equals(const GcsOptions& other) const { } GcsOptions GcsOptions::Defaults() { - GcsOptions options{}; - options.credentials.holder_ = std::make_shared( - google::cloud::MakeGoogleDefaultCredentials()); - options.scheme = "https"; + GcsOptions options; return options; } diff --git a/cpp/src/arrow/filesystem/gcsfs.h b/cpp/src/arrow/filesystem/gcsfs.h index b483d7651e2..3a667a1ae1b 100644 --- a/cpp/src/arrow/filesystem/gcsfs.h +++ b/cpp/src/arrow/filesystem/gcsfs.h @@ -56,8 +56,8 @@ class GcsCredentials { /// Options for the GcsFileSystem implementation. struct ARROW_EXPORT GcsOptions { - // TODO(emkornfield): make this a shared ptr or use - // another mechanism to make GcsOptions GcsConstructible. + /// \brief Equivalent to the options returned by GcsOptions::Defaults().
+ GcsOptions(); GcsCredentials credentials; std::string endpoint_override; From 54da88fba53a104f28d576eefbcaddf8a329c49a Mon Sep 17 00:00:00 2001 From: Micah Kornfield Date: Thu, 21 Apr 2022 02:28:59 +0000 Subject: [PATCH 38/85] run install_gcs_test_bench if it exists --- .github/workflows/python.yml | 3 +-- ci/scripts/python_test.sh | 6 +++--- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/.github/workflows/python.yml b/.github/workflows/python.yml index b14d0db3220..fbbff24deb1 100644 --- a/.github/workflows/python.yml +++ b/.github/workflows/python.yml @@ -122,8 +122,7 @@ jobs: ARROW_DATASET: ON ARROW_FLIGHT: ON ARROW_GANDIVA: ON - # TODO(ARROW-16102): Turn this on once we can build client successfully. - ARROW_GCS: OFF + ARROW_GCS: ON ARROW_HDFS: ON ARROW_JEMALLOC: ON ARROW_ORC: ON diff --git a/ci/scripts/python_test.sh b/ci/scripts/python_test.sh index 25c4ec3126c..cf8944f87f3 100755 --- a/ci/scripts/python_test.sh +++ b/ci/scripts/python_test.sh @@ -54,7 +54,7 @@ export PYARROW_TEST_ORC export PYARROW_TEST_PARQUET export PYARROW_TEST_S3 -# Without this, install_gcs_test_bench.sh doesn't seem to be put in a -# place that the python env can find it. 
-python -m pip install "https://github.com/googleapis/storage-testbench/archive/v0.16.0.tar.gz" +if [ -f "/arrow/ci/scripts/install_gcs_testbench.sh" ]; then + /arrow/ci/scripts/install_gcs_testbench.sh default +fi pytest -r s -v ${PYTEST_ARGS} --pyargs pyarrow From 05448aabc97576dc7ae7606caa91bd93b0d024fa Mon Sep 17 00:00:00 2001 From: Micah Kornfield Date: Thu, 21 Apr 2022 03:01:49 +0000 Subject: [PATCH 39/85] turn of gcs on macos github workflow --- .github/workflows/python.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/python.yml b/.github/workflows/python.yml index fbbff24deb1..c3c6b151860 100644 --- a/.github/workflows/python.yml +++ b/.github/workflows/python.yml @@ -122,7 +122,7 @@ jobs: ARROW_DATASET: ON ARROW_FLIGHT: ON ARROW_GANDIVA: ON - ARROW_GCS: ON + ARROW_GCS: OFF ARROW_HDFS: ON ARROW_JEMALLOC: ON ARROW_ORC: ON From c09bcb2544a5026058cc08f88a8f6cabae752ac3 Mon Sep 17 00:00:00 2001 From: Micah Kornfield Date: Thu, 21 Apr 2022 03:27:59 +0000 Subject: [PATCH 40/85] install testbench on wheel test --- ci/scripts/python_wheel_unix_test.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/ci/scripts/python_wheel_unix_test.sh b/ci/scripts/python_wheel_unix_test.sh index 889686ad201..32ee572093d 100755 --- a/ci/scripts/python_wheel_unix_test.sh +++ b/ci/scripts/python_wheel_unix_test.sh @@ -72,6 +72,7 @@ import pyarrow.parquet import pyarrow.plasma " if [ "${PYARROW_TEST_GCS}" == "ON" ]; then + pip install https://github.com/googleapis/storage-testbench/archive/v0.16.0.tar.gz python -c "import pyarrow._gcsfs" fi if [ "${PYARROW_TEST_S3}" == "ON" ]; then From d312f4677978cf55fe620d8a115d1277b1c5eb51 Mon Sep 17 00:00:00 2001 From: Micah Kornfield Date: Thu, 21 Apr 2022 04:28:30 +0000 Subject: [PATCH 41/85] fix error for calculating directory presence --- cpp/src/arrow/filesystem/gcsfs.cc | 2 +- cpp/src/arrow/filesystem/gcsfs_test.cc | 8 ++++++++ 2 files changed, 9 insertions(+), 1 deletion(-) diff --git 
a/cpp/src/arrow/filesystem/gcsfs.cc b/cpp/src/arrow/filesystem/gcsfs.cc index 0717996101b..dd9d154acab 100644 --- a/cpp/src/arrow/filesystem/gcsfs.cc +++ b/cpp/src/arrow/filesystem/gcsfs.cc @@ -534,7 +534,7 @@ class GcsFileSystem::Impl { submitted.push_back(DeferNotOk(io_context.executor()->Submit(async_delete, o))); } - if (!missing_dir_ok && !at_least_one_obj && !dir) { + if (!missing_dir_ok && !at_least_one_obj && !dir && !p.object.empty()) { // No files were found and no directory marker exists return Status::IOError("No such directory: ", p.full_path); } diff --git a/cpp/src/arrow/filesystem/gcsfs_test.cc b/cpp/src/arrow/filesystem/gcsfs_test.cc index 05c3bbab978..07cf24b6a40 100644 --- a/cpp/src/arrow/filesystem/gcsfs_test.cc +++ b/cpp/src/arrow/filesystem/gcsfs_test.cc @@ -776,6 +776,14 @@ TEST_F(GcsIntegrationTest, CreateDirUri) { ASSERT_RAISES(Invalid, fs->CreateDir("gs://" + RandomBucketName(), true)); } +TEST_F(GcsIntegrationTest, DeleteBucketDirSuccess) { + auto fs = GcsFileSystem::Make(TestGcsOptions()); + ASSERT_OK(fs->CreateDir("pyarrow-filesystem/", /*recursive=*/true)); + EXPECT_FALSE(fs->CreateDir("/", false).ok()); + ASSERT_OK(fs->DeleteDir("pyarrow-filesystem/")); +} + + TEST_F(GcsIntegrationTest, DeleteDirSuccess) { auto fs = GcsFileSystem::Make(TestGcsOptions()); ASSERT_OK_AND_ASSIGN(auto hierarchy, CreateHierarchy(fs)); From fb8d3598270038e6883559d21cf4cc44cba5a016 Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Thu, 21 Apr 2022 12:10:48 +0200 Subject: [PATCH 42/85] Install the testbench using the right Python --- ci/docker/conda-python.dockerfile | 5 +++++ ci/scripts/python_test.sh | 3 --- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/ci/docker/conda-python.dockerfile b/ci/docker/conda-python.dockerfile index 5ef69431b84..18106d8b18b 100644 --- a/ci/docker/conda-python.dockerfile +++ b/ci/docker/conda-python.dockerfile @@ -32,6 +32,11 @@ RUN mamba install -q \ nomkl && \ mamba clean --all +# XXX The GCS testbench was 
already installed in conda-cpp.dockerfile, +# but we changed the installed Python version above, so we need to reinstall it. +COPY ci/scripts/install_gcs_testbench.sh /arrow/ci/scripts +RUN /arrow/ci/scripts/install_gcs_testbench.sh default + ENV ARROW_PYTHON=ON \ ARROW_BUILD_STATIC=OFF \ ARROW_BUILD_TESTS=OFF \ diff --git a/ci/scripts/python_test.sh b/ci/scripts/python_test.sh index cf8944f87f3..4e2990b84d6 100755 --- a/ci/scripts/python_test.sh +++ b/ci/scripts/python_test.sh @@ -54,7 +54,4 @@ export PYARROW_TEST_ORC export PYARROW_TEST_PARQUET export PYARROW_TEST_S3 -if [ -f "/arrow/ci/scripts/install_gcs_testbench.sh" ]; then - /arrow/ci/scripts/install_gcs_testbench.sh default -fi pytest -r s -v ${PYTEST_ARGS} --pyargs pyarrow From fefada1e56eb4574ca84f5514b0edb37e20303cd Mon Sep 17 00:00:00 2001 From: Micah Kornfield Date: Thu, 21 Apr 2022 11:13:40 +0000 Subject: [PATCH 43/85] add ARROW_EXPORT --- cpp/src/arrow/filesystem/gcsfs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/arrow/filesystem/gcsfs.h b/cpp/src/arrow/filesystem/gcsfs.h index 3a667a1ae1b..35f18080fa6 100644 --- a/cpp/src/arrow/filesystem/gcsfs.h +++ b/cpp/src/arrow/filesystem/gcsfs.h @@ -32,7 +32,7 @@ struct GcsCredentialsHolder; class GcsFileSystem; /// \brief Container for GCS Credentials and information necessary to recreate them. 
-class GcsCredentials { +class ARROW_EXPORT GcsCredentials { public: bool Equals(const GcsCredentials& other) const; bool anonymous() const { return anonymous_; } From 28a22eb156919c3127450f785e40c7e1898220c8 Mon Sep 17 00:00:00 2001 From: Micah Kornfield Date: Thu, 21 Apr 2022 11:17:53 +0000 Subject: [PATCH 44/85] fix alphabeetic order in build-pyarrow.sh --- dev/tasks/conda-recipes/arrow-cpp/build-pyarrow.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dev/tasks/conda-recipes/arrow-cpp/build-pyarrow.sh b/dev/tasks/conda-recipes/arrow-cpp/build-pyarrow.sh index ff43af7a568..6e23c5eed90 100644 --- a/dev/tasks/conda-recipes/arrow-cpp/build-pyarrow.sh +++ b/dev/tasks/conda-recipes/arrow-cpp/build-pyarrow.sh @@ -17,8 +17,8 @@ if [[ "${target_platform}" == "osx-arm64" ]]; then else export PYARROW_WITH_GANDIVA=1 fi -export PYARROW_WITH_HDFS=1 export PYARROW_WITH_GCS=1 +export PYARROW_WITH_HDFS=1 export PYARROW_WITH_ORC=1 export PYARROW_WITH_PARQUET=1 export PYARROW_WITH_PARQUET_ENCRYPTION=1 From 013a7d2c1a306397c5b50720f78abdbb87be5908 Mon Sep 17 00:00:00 2001 From: Micah Kornfield Date: Thu, 21 Apr 2022 11:18:52 +0000 Subject: [PATCH 45/85] enable macOS 10.13 gcs build --- dev/tasks/tasks.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dev/tasks/tasks.yml b/dev/tasks/tasks.yml index f79b7cb7c6d..1bdb4d28213 100644 --- a/dev/tasks/tasks.yml +++ b/dev/tasks/tasks.yml @@ -468,9 +468,9 @@ tasks: {############################## Wheel OSX ####################################} -# enable S3 support from macOS 10.13 so we don't need to bundle curl, crypt and ssl +# enable S3 and GCS support from macOS 10.13 so we don't need to bundle curl, crypt and ssl {% for macos_version, macos_codename, arrow_s3, arrow_gcs in [("10.9", "mavericks", "OFF", "OFF"), - ("10.13", "high-sierra", "ON", "OFF")] %} + ("10.13", "high-sierra", "ON", "ON")] %} {% set platform_tag = "macosx_{}_x86_64".format(macos_version.replace('.', '_')) %} 
wheel-macos-{{ macos_codename }}-{{ python_tag }}-amd64: From feb83a518b37dc0c82a26fa6d09349d0186e77ef Mon Sep 17 00:00:00 2001 From: Micah Kornfield Date: Thu, 21 Apr 2022 11:21:29 +0000 Subject: [PATCH 46/85] use install_gcs_bench.sh in unix wheel test script --- ci/scripts/python_wheel_unix_test.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/scripts/python_wheel_unix_test.sh b/ci/scripts/python_wheel_unix_test.sh index 32ee572093d..26f76101e24 100755 --- a/ci/scripts/python_wheel_unix_test.sh +++ b/ci/scripts/python_wheel_unix_test.sh @@ -72,7 +72,6 @@ import pyarrow.parquet import pyarrow.plasma " if [ "${PYARROW_TEST_GCS}" == "ON" ]; then - pip install https://github.com/googleapis/storage-testbench/archive/v0.16.0.tar.gz python -c "import pyarrow._gcsfs" fi if [ "${PYARROW_TEST_S3}" == "ON" ]; then @@ -89,6 +88,7 @@ fi if [ "${CHECK_UNITTESTS}" == "ON" ]; then # Install testing dependencies pip install -U -r ${source_dir}/python/requirements-wheel-test.txt + PYTHON=python ${source_dir}/ci/scripts/install_gcs_testbench.sh default # Execute unittest, test dependencies must be installed python -c 'import pyarrow; pyarrow.create_library_symlinks()' python -m pytest -r s --pyargs pyarrow From 9e7ca4c9a5a9e18f0609e0ee807df5a984afebc2 Mon Sep 17 00:00:00 2001 From: emkornfield Date: Thu, 21 Apr 2022 04:27:01 -0700 Subject: [PATCH 47/85] Update python/pyarrow/_gcsfs.pyx Co-authored-by: Joris Van den Bossche --- python/pyarrow/_gcsfs.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/pyarrow/_gcsfs.pyx b/python/pyarrow/_gcsfs.pyx index 3093befe263..2da1473f14b 100644 --- a/python/pyarrow/_gcsfs.pyx +++ b/python/pyarrow/_gcsfs.pyx @@ -31,7 +31,7 @@ from datetime import datetime cdef class GcsFileSystem(FileSystem): """ - GCS-backed FileSystem implementation + Google Cloud Storage (GCS) backed FileSystem implementation By default uses the process described in https://google.aip.dev/auth/4110 to resolve credentials. 
If not running on GCP this generally requires the From 7048d54eec9ebb5c4205d76970d274d41d953295 Mon Sep 17 00:00:00 2001 From: emkornfield Date: Thu, 21 Apr 2022 04:27:46 -0700 Subject: [PATCH 48/85] Update python/pyarrow/_gcsfs.pyx Co-authored-by: Joris Van den Bossche --- python/pyarrow/_gcsfs.pyx | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/python/pyarrow/_gcsfs.pyx b/python/pyarrow/_gcsfs.pyx index 2da1473f14b..4c526cbae30 100644 --- a/python/pyarrow/_gcsfs.pyx +++ b/python/pyarrow/_gcsfs.pyx @@ -34,9 +34,10 @@ cdef class GcsFileSystem(FileSystem): Google Cloud Storage (GCS) backed FileSystem implementation By default uses the process described in https://google.aip.dev/auth/4110 - to resolve credentials. If not running on GCP this generally requires the - environment variable GOOGLE_APPLICATION_CREDENTIALS to point to a JSON - file containing credentials. + to resolve credentials. If not running on Google Cloud Platform (GCP), + this generally requires the environment variable + GOOGLE_APPLICATION_CREDENTIALS to point to a JSON file + containing credentials. Note: GCS buckets are special and the operations available on them may be limited or more expensive than expected compared to local file systems. 
From a984a48dbe0e71e4c50edba17c9676be66b96edf Mon Sep 17 00:00:00 2001 From: Micah Kornfield Date: Thu, 21 Apr 2022 15:41:18 +0000 Subject: [PATCH 49/85] fixup tests per feedback --- cpp/src/arrow/filesystem/gcsfs_test.cc | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/filesystem/gcsfs_test.cc b/cpp/src/arrow/filesystem/gcsfs_test.cc index 07cf24b6a40..e5b4b06c8dd 100644 --- a/cpp/src/arrow/filesystem/gcsfs_test.cc +++ b/cpp/src/arrow/filesystem/gcsfs_test.cc @@ -431,13 +431,13 @@ TEST(GcsFileSystem, ToArrowStatus) { } TEST(GcsFileSystem, FileSystemCompare) { - GcsOptions a_options = GcsOptions::Defaults(); + GcsOptions a_options; a_options.scheme = "http"; auto a = GcsFileSystem::Make(a_options); EXPECT_THAT(a, NotNull()); EXPECT_TRUE(a->Equals(*a)); - GcsOptions b_options = GcsOptions::Defaults(); + GcsOptions b_options; b_options.scheme = "http"; b_options.endpoint_override = "localhost:1234"; auto b = GcsFileSystem::Make(b_options); @@ -1318,8 +1318,10 @@ TEST_F(GcsIntegrationTest, TestFileSystemFromUri) { // Smoke test for FileSystemFromUri ASSERT_OK_AND_ASSIGN(auto fs, FileSystemFromUri(std::string("gs://anonymous@") + PreexistingBucketPath())); + EXPECT_EQ(fs->type_name(), "gcs"); ASSERT_OK_AND_ASSIGN(auto fs2, FileSystemFromUri(std::string("gcs://anonymous@") + PreexistingBucketPath())); + EXPECT_EQ(fs2->type_name(), "gcs"); } } // namespace From 57069f3999daa6b675987004f5519510918482f6 Mon Sep 17 00:00:00 2001 From: Micah Kornfield Date: Thu, 21 Apr 2022 15:49:10 +0000 Subject: [PATCH 50/85] make close idempotent --- cpp/src/arrow/filesystem/gcsfs.cc | 2 +- cpp/src/arrow/filesystem/gcsfs_test.cc | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/cpp/src/arrow/filesystem/gcsfs.cc b/cpp/src/arrow/filesystem/gcsfs.cc index dd9d154acab..dfc9ab2e814 100644 --- a/cpp/src/arrow/filesystem/gcsfs.cc +++ b/cpp/src/arrow/filesystem/gcsfs.cc @@ -182,7 +182,7 @@ class GcsOutputStream : public arrow::io::OutputStream 
{ Status Close() override { if (closed_) { - return Status::Invalid("Already closed stream."); + return Status::OK(); } stream_.Close(); closed_ = true; diff --git a/cpp/src/arrow/filesystem/gcsfs_test.cc b/cpp/src/arrow/filesystem/gcsfs_test.cc index e5b4b06c8dd..e9aebf9bc1f 100644 --- a/cpp/src/arrow/filesystem/gcsfs_test.cc +++ b/cpp/src/arrow/filesystem/gcsfs_test.cc @@ -783,7 +783,6 @@ TEST_F(GcsIntegrationTest, DeleteBucketDirSuccess) { ASSERT_OK(fs->DeleteDir("pyarrow-filesystem/")); } - TEST_F(GcsIntegrationTest, DeleteDirSuccess) { auto fs = GcsFileSystem::Make(TestGcsOptions()); ASSERT_OK_AND_ASSIGN(auto hierarchy, CreateHierarchy(fs)); From b0e1b8256b0135de5dbff851b3cd61a1f73cf7e7 Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Sun, 24 Apr 2022 08:14:29 +0900 Subject: [PATCH 51/85] Use bundled google-cloud-cpp-storage --- ci/docker/ubuntu-22.04-cpp.dockerfile | 1 + 1 file changed, 1 insertion(+) diff --git a/ci/docker/ubuntu-22.04-cpp.dockerfile b/ci/docker/ubuntu-22.04-cpp.dockerfile index 2e9c96e0be7..1398dcd636a 100644 --- a/ci/docker/ubuntu-22.04-cpp.dockerfile +++ b/ci/docker/ubuntu-22.04-cpp.dockerfile @@ -176,6 +176,7 @@ ENV ARROW_BUILD_TESTS=ON \ ARROW_WITH_ZLIB=ON \ ARROW_WITH_ZSTD=ON \ AWSSDK_SOURCE=BUNDLED \ + google_cloud_cpp_storage_SOURCE=BUNDLED \ GTest_SOURCE=BUNDLED \ ORC_SOURCE=BUNDLED \ PARQUET_BUILD_EXAMPLES=ON \ From 2a7904163361e3acf700252b625d012964a6043f Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Sun, 24 Apr 2022 08:22:35 +0900 Subject: [PATCH 52/85] Install test dependencies in Dockerfile --- ci/docker/python-wheel-manylinux-test.dockerfile | 3 +++ ci/scripts/python_wheel_unix_test.sh | 8 +++++--- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/ci/docker/python-wheel-manylinux-test.dockerfile b/ci/docker/python-wheel-manylinux-test.dockerfile index 55c27d1d7bb..cdd0ae3ced7 100644 --- a/ci/docker/python-wheel-manylinux-test.dockerfile +++ b/ci/docker/python-wheel-manylinux-test.dockerfile @@ -25,3 +25,6 @@ 
FROM ${arch}/python:${python} # test dependencies in a docker image COPY python/requirements-wheel-test.txt /arrow/python/ RUN pip install -r /arrow/python/requirements-wheel-test.txt + +COPY ci/scripts/install_gcs_testbench.sh /arrow/ci/scripts/ +RUN PYTHON=python /arrow/ci/scripts/install_gcs_testbench.sh default diff --git a/ci/scripts/python_wheel_unix_test.sh b/ci/scripts/python_wheel_unix_test.sh index 26f76101e24..fd13dca7601 100755 --- a/ci/scripts/python_wheel_unix_test.sh +++ b/ci/scripts/python_wheel_unix_test.sh @@ -86,9 +86,11 @@ import pyarrow.plasma fi if [ "${CHECK_UNITTESTS}" == "ON" ]; then - # Install testing dependencies - pip install -U -r ${source_dir}/python/requirements-wheel-test.txt - PYTHON=python ${source_dir}/ci/scripts/install_gcs_testbench.sh default + # Generally, we should install testing dependencies here to install + # built wheels without testing dependencies. Testing dependencies are + # installed in ci/docker/python-wheel-manylinux-test.dockerfile to + # reduce test time. 
+ # Execute unittest, test dependencies must be installed python -c 'import pyarrow; pyarrow.create_library_symlinks()' python -m pytest -r s --pyargs pyarrow From b746d0514f231c2cf23d78d3c7baafb94209710c Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Mon, 25 Apr 2022 06:13:56 +0900 Subject: [PATCH 53/85] Enable GCS on Debian and Fedora --- ci/docker/debian-10-cpp.dockerfile | 2 ++ ci/docker/debian-11-cpp.dockerfile | 2 ++ ci/docker/fedora-35-cpp.dockerfile | 2 ++ 3 files changed, 6 insertions(+) diff --git a/ci/docker/debian-10-cpp.dockerfile b/ci/docker/debian-10-cpp.dockerfile index 15c85b868ba..584ce869a91 100644 --- a/ci/docker/debian-10-cpp.dockerfile +++ b/ci/docker/debian-10-cpp.dockerfile @@ -81,6 +81,7 @@ ENV ARROW_BUILD_TESTS=ON \ ARROW_DEPENDENCY_SOURCE=SYSTEM \ ARROW_FLIGHT=ON \ ARROW_GANDIVA=ON \ + ARROW_GCS=ON \ ARROW_HOME=/usr/local \ ARROW_ORC=ON \ ARROW_PARQUET=ON \ @@ -98,6 +99,7 @@ ENV ARROW_BUILD_TESTS=ON \ cares_SOURCE=BUNDLED \ CC=gcc \ CXX=g++ \ + google_cloud_cpp_storage_SOURCE=BUNDLED \ gRPC_SOURCE=BUNDLED \ GTest_SOURCE=BUNDLED \ ORC_SOURCE=BUNDLED \ diff --git a/ci/docker/debian-11-cpp.dockerfile b/ci/docker/debian-11-cpp.dockerfile index 1bb67e334e8..dfccd85e559 100644 --- a/ci/docker/debian-11-cpp.dockerfile +++ b/ci/docker/debian-11-cpp.dockerfile @@ -83,6 +83,7 @@ ENV ARROW_BUILD_TESTS=ON \ ARROW_DEPENDENCY_SOURCE=SYSTEM \ ARROW_FLIGHT=ON \ ARROW_GANDIVA=ON \ + ARROW_GCS=ON \ ARROW_HOME=/usr/local \ ARROW_ORC=ON \ ARROW_PARQUET=ON \ @@ -99,6 +100,7 @@ ENV ARROW_BUILD_TESTS=ON \ AWSSDK_SOURCE=BUNDLED \ CC=gcc \ CXX=g++ \ + google_cloud_cpp_storage_SOURCE=BUNDLED \ ORC_SOURCE=BUNDLED \ PATH=/usr/lib/ccache/:$PATH \ Protobuf_SOURCE=BUNDLED diff --git a/ci/docker/fedora-35-cpp.dockerfile b/ci/docker/fedora-35-cpp.dockerfile index b79ceb894bf..947c9aba1b7 100644 --- a/ci/docker/fedora-35-cpp.dockerfile +++ b/ci/docker/fedora-35-cpp.dockerfile @@ -77,6 +77,7 @@ ENV ARROW_BUILD_TESTS=ON \ ARROW_FLIGHT=ON \ ARROW_GANDIVA_JAVA=ON \ 
ARROW_GANDIVA=ON \ + ARROW_GCS=ON \ ARROW_HOME=/usr/local \ ARROW_ORC=ON \ ARROW_PARQUET=ON \ @@ -92,6 +93,7 @@ ENV ARROW_BUILD_TESTS=ON \ AWSSDK_SOURCE=BUNDLED \ CC=gcc \ CXX=g++ \ + google_cloud_cpp_storage_SOURCE=BUNDLED \ ORC_SOURCE=BUNDLED \ PARQUET_BUILD_EXECUTABLES=ON \ PARQUET_BUILD_EXAMPLES=ON \ From 72d5fea6d08278efa19b8ec12b3d01c7142336d6 Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Mon, 25 Apr 2022 10:41:54 +0900 Subject: [PATCH 54/85] Don't enable GCS on Debian 10 --- ci/docker/debian-10-cpp.dockerfile | 2 -- 1 file changed, 2 deletions(-) diff --git a/ci/docker/debian-10-cpp.dockerfile b/ci/docker/debian-10-cpp.dockerfile index 584ce869a91..15c85b868ba 100644 --- a/ci/docker/debian-10-cpp.dockerfile +++ b/ci/docker/debian-10-cpp.dockerfile @@ -81,7 +81,6 @@ ENV ARROW_BUILD_TESTS=ON \ ARROW_DEPENDENCY_SOURCE=SYSTEM \ ARROW_FLIGHT=ON \ ARROW_GANDIVA=ON \ - ARROW_GCS=ON \ ARROW_HOME=/usr/local \ ARROW_ORC=ON \ ARROW_PARQUET=ON \ @@ -99,7 +98,6 @@ ENV ARROW_BUILD_TESTS=ON \ cares_SOURCE=BUNDLED \ CC=gcc \ CXX=g++ \ - google_cloud_cpp_storage_SOURCE=BUNDLED \ gRPC_SOURCE=BUNDLED \ GTest_SOURCE=BUNDLED \ ORC_SOURCE=BUNDLED \ From 7ea444337da06c30f946544789949c41a3b274b0 Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Tue, 26 Apr 2022 05:56:34 +0900 Subject: [PATCH 55/85] Update installing testing dependency in dev/tasks/python-wheels/ --- ci/scripts/python_wheel_unix_test.sh | 3 +++ dev/tasks/python-wheels/github.osx.amd64.yml | 6 ++++-- dev/tasks/python-wheels/github.osx.arm64.yml | 12 ++++++++---- 3 files changed, 15 insertions(+), 6 deletions(-) diff --git a/ci/scripts/python_wheel_unix_test.sh b/ci/scripts/python_wheel_unix_test.sh index fd13dca7601..2b2fe9cdf11 100755 --- a/ci/scripts/python_wheel_unix_test.sh +++ b/ci/scripts/python_wheel_unix_test.sh @@ -90,6 +90,9 @@ if [ "${CHECK_UNITTESTS}" == "ON" ]; then # built wheels without testing dependencies. 
Testing dependencies are # installed in ci/docker/python-wheel-manylinux-test.dockerfile to # reduce test time. + # + # We also need to update dev/tasks/python-wheels/*.yml when we need + # to add more steps to prepare testing dependencies. # Execute unittest, test dependencies must be installed python -c 'import pyarrow; pyarrow.create_library_symlinks()' diff --git a/dev/tasks/python-wheels/github.osx.amd64.yml b/dev/tasks/python-wheels/github.osx.amd64.yml index d0f834f40c1..01c6cfa74d5 100644 --- a/dev/tasks/python-wheels/github.osx.amd64.yml +++ b/dev/tasks/python-wheels/github.osx.amd64.yml @@ -92,7 +92,7 @@ jobs: run: | $PYTHON -m venv build-env source build-env/bin/activate - pip install --upgrade pip wheel + $PYTHON -m pip install --upgrade pip wheel arrow/ci/scripts/python_wheel_macos_build.sh x86_64 $(pwd)/arrow $(pwd)/build - name: Test Wheel @@ -100,7 +100,9 @@ jobs: run: | $PYTHON -m venv test-env source test-env/bin/activate - pip install --upgrade pip wheel + $PYTHON -m pip install --upgrade pip wheel + $PYTHON -m pip install -r arrow/python/requirements-wheel-test.txt + arrow/ci/scripts/install_gcs_testbench.sh default arrow/ci/scripts/python_wheel_unix_test.sh $(pwd)/arrow {{ macros.github_upload_releases("arrow/python/repaired_wheels/*.whl")|indent }} diff --git a/dev/tasks/python-wheels/github.osx.arm64.yml b/dev/tasks/python-wheels/github.osx.arm64.yml index 101d8c6ee8d..c6182e312cf 100644 --- a/dev/tasks/python-wheels/github.osx.arm64.yml +++ b/dev/tasks/python-wheels/github.osx.arm64.yml @@ -108,13 +108,13 @@ jobs: run: | $PYTHON -m venv build-amd64-env source build-amd64-env/bin/activate - pip install --upgrade pip wheel + $PYTHON -m pip install --upgrade pip wheel arch -x86_64 arrow/ci/scripts/python_wheel_macos_build.sh x86_64 $(pwd)/arrow $(pwd)/build - name: Fuse AMD64 and ARM64 wheels run: | source build-amd64-env/bin/activate - pip install delocate + $PYTHON -m pip install delocate amd64_wheel=$(ls 
arrow/python/repaired_wheels/pyarrow*x86_64.whl) arm64_wheel=$(ls arrow/python/repaired_wheels/pyarrow*arm64.whl) @@ -133,7 +133,9 @@ jobs: run: | $PYTHON -m venv test-arm64-env source test-arm64-env/bin/activate - pip install --upgrade pip wheel + $PYTHON -m pip install --upgrade pip wheel + $PYTHON -m pip install -r arrow/python/requirements-wheel-test.txt + arrow/ci/scripts/install_gcs_testbench.sh default arch -arm64 arrow/ci/scripts/python_wheel_unix_test.sh $(pwd)/arrow {% if arch == "universal2" %} @@ -144,7 +146,9 @@ jobs: run: | $PYTHON -m venv test-amd64-env source test-amd64-env/bin/activate - pip install --upgrade pip wheel + $PYTHON -m pip install --upgrade pip wheel + $PYTHON -m pip install -r arrow/python/requirements-wheel-test.txt + arrow/ci/scripts/install_gcs_testbench.sh default arch -x86_64 arrow/ci/scripts/python_wheel_unix_test.sh $(pwd)/arrow {% endif %} From 1d761f3a230be0ec7e1f14d77285194174afdc24 Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Tue, 26 Apr 2022 16:10:14 +0900 Subject: [PATCH 56/85] Fix wrong Python use --- dev/tasks/python-wheels/github.osx.amd64.yml | 10 +++++----- dev/tasks/python-wheels/github.osx.arm64.yml | 16 ++++++++-------- 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/dev/tasks/python-wheels/github.osx.amd64.yml b/dev/tasks/python-wheels/github.osx.amd64.yml index 01c6cfa74d5..c18e080ac21 100644 --- a/dev/tasks/python-wheels/github.osx.amd64.yml +++ b/dev/tasks/python-wheels/github.osx.amd64.yml @@ -92,17 +92,17 @@ jobs: run: | $PYTHON -m venv build-env source build-env/bin/activate - $PYTHON -m pip install --upgrade pip wheel - arrow/ci/scripts/python_wheel_macos_build.sh x86_64 $(pwd)/arrow $(pwd)/build + pip install --upgrade pip wheel + PYTHON=python arrow/ci/scripts/python_wheel_macos_build.sh x86_64 $(pwd)/arrow $(pwd)/build - name: Test Wheel shell: bash run: | $PYTHON -m venv test-env source test-env/bin/activate - $PYTHON -m pip install --upgrade pip wheel - $PYTHON -m pip install -r 
arrow/python/requirements-wheel-test.txt - arrow/ci/scripts/install_gcs_testbench.sh default + pip install --upgrade pip wheel + pip install -r arrow/python/requirements-wheel-test.txt + PYTHON=python arrow/ci/scripts/install_gcs_testbench.sh default arrow/ci/scripts/python_wheel_unix_test.sh $(pwd)/arrow {{ macros.github_upload_releases("arrow/python/repaired_wheels/*.whl")|indent }} diff --git a/dev/tasks/python-wheels/github.osx.arm64.yml b/dev/tasks/python-wheels/github.osx.arm64.yml index c6182e312cf..23cde9fdaf6 100644 --- a/dev/tasks/python-wheels/github.osx.arm64.yml +++ b/dev/tasks/python-wheels/github.osx.arm64.yml @@ -108,13 +108,13 @@ jobs: run: | $PYTHON -m venv build-amd64-env source build-amd64-env/bin/activate - $PYTHON -m pip install --upgrade pip wheel + pip install --upgrade pip wheel arch -x86_64 arrow/ci/scripts/python_wheel_macos_build.sh x86_64 $(pwd)/arrow $(pwd)/build - name: Fuse AMD64 and ARM64 wheels run: | source build-amd64-env/bin/activate - $PYTHON -m pip install delocate + pip install delocate amd64_wheel=$(ls arrow/python/repaired_wheels/pyarrow*x86_64.whl) arm64_wheel=$(ls arrow/python/repaired_wheels/pyarrow*arm64.whl) @@ -133,9 +133,9 @@ jobs: run: | $PYTHON -m venv test-arm64-env source test-arm64-env/bin/activate - $PYTHON -m pip install --upgrade pip wheel - $PYTHON -m pip install -r arrow/python/requirements-wheel-test.txt - arrow/ci/scripts/install_gcs_testbench.sh default + pip install --upgrade pip wheel + pip install -r arrow/python/requirements-wheel-test.txt + PYTHON=python arrow/ci/scripts/install_gcs_testbench.sh default arch -arm64 arrow/ci/scripts/python_wheel_unix_test.sh $(pwd)/arrow {% if arch == "universal2" %} @@ -146,9 +146,9 @@ jobs: run: | $PYTHON -m venv test-amd64-env source test-amd64-env/bin/activate - $PYTHON -m pip install --upgrade pip wheel - $PYTHON -m pip install -r arrow/python/requirements-wheel-test.txt - arrow/ci/scripts/install_gcs_testbench.sh default + pip install --upgrade pip wheel + pip 
install -r arrow/python/requirements-wheel-test.txt + PYTHON=python arrow/ci/scripts/install_gcs_testbench.sh default arch -x86_64 arrow/ci/scripts/python_wheel_unix_test.sh $(pwd)/arrow {% endif %} From 8874daf41e40f2bdfb7fb725e2cf1d0e1fd451da Mon Sep 17 00:00:00 2001 From: Micah Kornfield Date: Wed, 27 Apr 2022 18:26:12 +0000 Subject: [PATCH 57/85] Make failures of testbench short lived --- cpp/src/arrow/filesystem/gcsfs.cc | 20 ++++++++++++++++++-- cpp/src/arrow/filesystem/gcsfs.h | 7 +++++++ cpp/src/arrow/filesystem/gcsfs_test.cc | 21 +++++++++++++++++++-- python/pyarrow/_gcsfs.pyx | 17 +++++++++++++++-- python/pyarrow/includes/libarrow.pxd | 7 +++++++ python/pyarrow/includes/libarrow_fs.pxd | 1 + python/pyarrow/tests/test_fs.py | 8 ++++++-- 7 files changed, 73 insertions(+), 8 deletions(-) diff --git a/cpp/src/arrow/filesystem/gcsfs.cc b/cpp/src/arrow/filesystem/gcsfs.cc index dfc9ab2e814..562881d0253 100644 --- a/cpp/src/arrow/filesystem/gcsfs.cc +++ b/cpp/src/arrow/filesystem/gcsfs.cc @@ -318,6 +318,11 @@ google::cloud::Options AsGoogleCloudOptions(const GcsOptions& o) { options.set( o.credentials.holder()->credentials); } + if (o.retry_limit_seconds.has_value()) { + options.set( + gcs::LimitedTimeRetryPolicy(std::chrono::seconds(*o.retry_limit_seconds)) + .clone()); + } return options; } @@ -367,7 +372,10 @@ class GcsFileSystem::Impl { auto delimiter = select.recursive ? gcs::Delimiter() : gcs::Delimiter("/"); // Include trailing delimiters ensures that files matching "directory" // conventions are also included in the listing. - auto include_trailing = gcs::IncludeTrailingDelimiter(true); + // Only included for select.recursive false because a delimiter needs + // to be specified. + auto include_trailing = select.recursive ? 
gcs::IncludeTrailingDelimiter(false) + : gcs::IncludeTrailingDelimiter(true); FileInfoVector result; for (auto const& o : client_.ListObjects(p.bucket, prefix, delimiter, include_trailing)) { @@ -694,7 +702,8 @@ GcsOptions::GcsOptions() { bool GcsOptions::Equals(const GcsOptions& other) const { return credentials.Equals(other.credentials) && endpoint_override == other.endpoint_override && scheme == other.scheme && - default_bucket_location == other.default_bucket_location; + default_bucket_location == other.default_bucket_location && + retry_limit_seconds == other.retry_limit_seconds; } GcsOptions GcsOptions::Defaults() { @@ -791,6 +800,13 @@ Result GcsOptions::FromUri(const arrow::internal::Uri& uri, options.scheme = kv.second; } else if (kv.first == "endpoint_override") { options.endpoint_override = kv.second; + } else if (kv.first == "retry_limit_seconds") { + int parsed_seconds = atoi(kv.second.c_str()); + if (parsed_seconds <= 0) { + return Status::Invalid( + "retry_limit_seconds must be a positive integer. found '", kv.second, "'"); + } + options.retry_limit_seconds = parsed_seconds; } else { return Status::Invalid("Unexpected query parameter in GCS URI: '", kv.first, "'"); } diff --git a/cpp/src/arrow/filesystem/gcsfs.h b/cpp/src/arrow/filesystem/gcsfs.h index 35f18080fa6..d44ea6a40cb 100644 --- a/cpp/src/arrow/filesystem/gcsfs.h +++ b/cpp/src/arrow/filesystem/gcsfs.h @@ -22,6 +22,7 @@ #include #include "arrow/filesystem/filesystem.h" +#include "arrow/util/optional.h" #include "arrow/util/uri.h" namespace arrow { @@ -65,6 +66,12 @@ struct ARROW_EXPORT GcsOptions { /// \brief Location to use for creating buckets. std::string default_bucket_location; + /// \brief If set used to control total time allowed for retrying underlying + /// errors. + /// + /// The default policy is to retry for up to 15 minutes. + arrow::util::optional retry_limit_seconds; + /// \brief Default metadata for OpenOutputStream. 
/// /// This will be ignored if non-empty metadata is passed to OpenOutputStream. diff --git a/cpp/src/arrow/filesystem/gcsfs_test.cc b/cpp/src/arrow/filesystem/gcsfs_test.cc index e9aebf9bc1f..467a14cb075 100644 --- a/cpp/src/arrow/filesystem/gcsfs_test.cc +++ b/cpp/src/arrow/filesystem/gcsfs_test.cc @@ -100,6 +100,8 @@ class GcsTestbench : public ::testing::Environment { error_ = std::move(error); } + bool running() { return server_process_.running(); } + ~GcsTestbench() override { // Brutal shutdown, kill the full process group because the GCS testbench may launch // additional children. @@ -131,6 +133,7 @@ class GcsIntegrationTest : public ::testing::Test { void SetUp() override { ASSERT_THAT(Testbench(), NotNull()); ASSERT_THAT(Testbench()->error(), IsEmpty()); + ASSERT_TRUE(Testbench()->running()); // Initialize a PRNG with a small amount of entropy. generator_ = std::mt19937_64(std::random_device()()); @@ -141,7 +144,11 @@ class GcsIntegrationTest : public ::testing::Test { auto client = gcs::Client( google::cloud::Options{} .set("http://127.0.0.1:" + Testbench()->port()) - .set(gc::MakeInsecureCredentials())); + .set(gc::MakeInsecureCredentials()) + .set(std::chrono::seconds(3)) + .set( + gcs::LimitedTimeRetryPolicy(std::chrono::seconds(3)).clone())); + google::cloud::StatusOr bucket = client.CreateBucketForProject( PreexistingBucketName(), "ignored-by-testbench", gcs::BucketMetadata{}); ASSERT_TRUE(bucket.ok()) << "Failed to create bucket <" << PreexistingBucketName() @@ -168,6 +175,7 @@ class GcsIntegrationTest : public ::testing::Test { GcsOptions TestGcsOptions() { auto options = GcsOptions::Anonymous(); options.endpoint_override = "127.0.0.1:" + Testbench()->port(); + options.retry_limit_seconds = 2; return options; } @@ -328,18 +336,27 @@ TEST(GcsFileSystem, OptionsFromUri) { ASSERT_OK_AND_ASSIGN( options, GcsOptions::FromUri("gs://mybucket/foo/bar/" - "?endpoint_override=localhost&scheme=http&location=us-west2", + 
"?endpoint_override=localhost&scheme=http&location=us-west2" + "&retry_limit_seconds=5", &path)); EXPECT_EQ(options.default_bucket_location, "us-west2"); EXPECT_EQ(options.scheme, "http"); EXPECT_EQ(options.endpoint_override, "localhost"); EXPECT_EQ(path, "mybucket/foo/bar"); + ASSERT_TRUE(options.retry_limit_seconds.has_value()); + EXPECT_EQ(*options.retry_limit_seconds, 5); // Missing bucket name ASSERT_RAISES(Invalid, GcsOptions::FromUri("gs:///foo/bar/", &path)); // Invalid option ASSERT_RAISES(Invalid, GcsOptions::FromUri("gs://mybucket/?xxx=zzz", &path)); + + // Invalid retry limit + ASSERT_RAISES(Invalid, + GcsOptions::FromUri("gs://foo/bar/?retry_limit_seconds=0", &path)); + ASSERT_RAISES(Invalid, + GcsOptions::FromUri("gs://foo/bar/?retry_limit_seconds=-1", &path)); } TEST(GcsFileSystem, OptionsAccessToken) { diff --git a/python/pyarrow/_gcsfs.pyx b/python/pyarrow/_gcsfs.pyx index 4c526cbae30..8af5e4abd6e 100644 --- a/python/pyarrow/_gcsfs.pyx +++ b/python/pyarrow/_gcsfs.pyx @@ -26,7 +26,7 @@ from pyarrow.includes.libarrow_fs cimport * from pyarrow._fs cimport FileSystem, TimePoint_to_ns, PyDateTime_to_TimePoint from cython.operator cimport dereference as deref -from datetime import datetime +from datetime import datetime, timedelta cdef class GcsFileSystem(FileSystem): @@ -73,6 +73,9 @@ cdef class GcsFileSystem(FileSystem): default_metadata : mapping or pyarrow.KeyValueMetadata, default None Default metadata for `open_output_stream`. This will be ignored if non-empty metadata is passed to `open_output_stream`. + retry_time_limit : timedelta, default None + Set the maximum amount of time the GCS client will attempt to retry + transient errors. Subsecond granularity is ignored. 
""" cdef: @@ -83,10 +86,12 @@ cdef class GcsFileSystem(FileSystem): default_bucket_location='US', scheme=None, endpoint_override=None, - default_metadata=None): + default_metadata=None, + retry_time_limit=None): cdef: CGcsOptions options shared_ptr[CGcsFileSystem] wrapped + int64_t time_limit_seconds # Intentional use of truthiness because empty strings aren't valid and # for reconstruction from pickling will give empty strings. @@ -129,6 +134,9 @@ cdef class GcsFileSystem(FileSystem): if default_metadata is not None: options.default_metadata = pyarrow_unwrap_metadata( ensure_metadata(default_metadata)) + if retry_time_limit is not None: + time_limit_seconds = int(retry_time_limit.total_seconds()) + options.retry_limit_seconds = time_limit_seconds with nogil: wrapped = GetResultValue(CGcsFileSystem.Make(options)) @@ -154,6 +162,10 @@ cdef class GcsFileSystem(FileSystem): cdef CGcsOptions opts = self.gcsfs.options() service_account = frombytes(opts.credentials.target_service_account()) expiration_dt = self._expiration_datetime_from_options() + retry_time_limit = None + if opts.retry_limit_seconds.has_value(): + retry_time_limit = timedelta( + seconds=opts.retry_limit_seconds.value()) return ( GcsFileSystem._reconstruct, (dict( access_token=frombytes(opts.credentials.access_token()), @@ -165,6 +177,7 @@ cdef class GcsFileSystem(FileSystem): default_bucket_location=frombytes( opts.default_bucket_location), default_metadata=pyarrow_wrap_metadata(opts.default_metadata), + retry_time_limit=retry_time_limit ),)) @property diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index ba651af50b7..c55fd315b11 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -54,6 +54,13 @@ cdef extern from "arrow/util/decimal.h" namespace "arrow" nogil: cdef cppclass CDecimal256" arrow::Decimal256": c_string ToString(int32_t scale) const +cdef extern from "arrow/util/optional.h" namespace "arrow::util" nogil: + 
cdef cppclass c_optional"arrow::util::optional"[T]: + c_bool has_value() + T value() + c_optional(T&) + c_optional& operator=[U](U&) + cdef extern from "arrow/config.h" namespace "arrow" nogil: cdef cppclass CBuildInfo" arrow::BuildInfo": diff --git a/python/pyarrow/includes/libarrow_fs.pxd b/python/pyarrow/includes/libarrow_fs.pxd index 8a73b2afa75..c6da0653c91 100644 --- a/python/pyarrow/includes/libarrow_fs.pxd +++ b/python/pyarrow/includes/libarrow_fs.pxd @@ -211,6 +211,7 @@ cdef extern from "arrow/filesystem/api.h" namespace "arrow::fs" nogil: c_string endpoint_override c_string scheme c_string default_bucket_location + c_optional[int64_t] retry_limit_seconds shared_ptr[const CKeyValueMetadata] default_metadata c_bool Equals(const CS3Options& other) diff --git a/python/pyarrow/tests/test_fs.py b/python/pyarrow/tests/test_fs.py index ff06026a20a..565e52b5208 100644 --- a/python/pyarrow/tests/test_fs.py +++ b/python/pyarrow/tests/test_fs.py @@ -207,12 +207,15 @@ def gcsfs(request, gcs_server): host, port = gcs_server['connection'] bucket = 'pyarrow-filesystem/' + # Make sure the server is alive. + assert gcs_server['process'].poll() is None fs = GcsFileSystem( endpoint_override=f'{host}:{port}', scheme='http', # Mock endpoint doesn't check credentials. 
- anonymous=True + anonymous=True, + retry_time_limit=timedelta(seconds=3) ) fs.create_dir(bucket) @@ -1398,7 +1401,8 @@ def test_filesystem_from_uri_gcs(gcs_server): host, port = gcs_server['connection'] uri = ("gs://anonymous@" + - f"mybucket/foo/bar?scheme=http&endpoint_override={host}:{port}") + f"mybucket/foo/bar?scheme=http&endpoint_override={host}:{port}&" + + "retry_limit_seconds=3") fs, path = FileSystem.from_uri(uri) assert isinstance(fs, GcsFileSystem) From 7ee8c748fab700b27abd128290be2492a2150cbb Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Thu, 28 Apr 2022 13:36:22 +0900 Subject: [PATCH 58/85] Add missing arch --- dev/tasks/python-wheels/github.osx.arm64.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/dev/tasks/python-wheels/github.osx.arm64.yml b/dev/tasks/python-wheels/github.osx.arm64.yml index 23cde9fdaf6..7198da7de47 100644 --- a/dev/tasks/python-wheels/github.osx.arm64.yml +++ b/dev/tasks/python-wheels/github.osx.arm64.yml @@ -134,8 +134,8 @@ jobs: $PYTHON -m venv test-arm64-env source test-arm64-env/bin/activate pip install --upgrade pip wheel - pip install -r arrow/python/requirements-wheel-test.txt - PYTHON=python arrow/ci/scripts/install_gcs_testbench.sh default + arch -arm64 pip install -r arrow/python/requirements-wheel-test.txt + PYTHON=python arch -arm64 arrow/ci/scripts/install_gcs_testbench.sh default arch -arm64 arrow/ci/scripts/python_wheel_unix_test.sh $(pwd)/arrow {% if arch == "universal2" %} @@ -147,8 +147,8 @@ jobs: $PYTHON -m venv test-amd64-env source test-amd64-env/bin/activate pip install --upgrade pip wheel - pip install -r arrow/python/requirements-wheel-test.txt - PYTHON=python arrow/ci/scripts/install_gcs_testbench.sh default + arch -x86_64 pip install -r arrow/python/requirements-wheel-test.txt + PYTHON=python arch -x86_64 arrow/ci/scripts/install_gcs_testbench.sh default arch -x86_64 arrow/ci/scripts/python_wheel_unix_test.sh $(pwd)/arrow {% endif %} From 
b547ac1a78a546c5ce660aeeaf360e637b299371 Mon Sep 17 00:00:00 2001 From: Micah Kornfield Date: Thu, 28 Apr 2022 16:03:42 +0000 Subject: [PATCH 59/85] use 5 second timeout for tests --- cpp/src/arrow/filesystem/gcsfs_test.cc | 6 +++--- python/pyarrow/tests/test_fs.py | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/cpp/src/arrow/filesystem/gcsfs_test.cc b/cpp/src/arrow/filesystem/gcsfs_test.cc index 467a14cb075..5d6e4840d2d 100644 --- a/cpp/src/arrow/filesystem/gcsfs_test.cc +++ b/cpp/src/arrow/filesystem/gcsfs_test.cc @@ -145,9 +145,9 @@ class GcsIntegrationTest : public ::testing::Test { google::cloud::Options{} .set("http://127.0.0.1:" + Testbench()->port()) .set(gc::MakeInsecureCredentials()) - .set(std::chrono::seconds(3)) + .set(std::chrono::seconds(5)) .set( - gcs::LimitedTimeRetryPolicy(std::chrono::seconds(3)).clone())); + gcs::LimitedTimeRetryPolicy(std::chrono::seconds(5)).clone())); google::cloud::StatusOr bucket = client.CreateBucketForProject( PreexistingBucketName(), "ignored-by-testbench", gcs::BucketMetadata{}); @@ -175,7 +175,7 @@ class GcsIntegrationTest : public ::testing::Test { GcsOptions TestGcsOptions() { auto options = GcsOptions::Anonymous(); options.endpoint_override = "127.0.0.1:" + Testbench()->port(); - options.retry_limit_seconds = 2; + options.retry_limit_seconds = 5; return options; } diff --git a/python/pyarrow/tests/test_fs.py b/python/pyarrow/tests/test_fs.py index 565e52b5208..85a12c8ed56 100644 --- a/python/pyarrow/tests/test_fs.py +++ b/python/pyarrow/tests/test_fs.py @@ -215,7 +215,7 @@ def gcsfs(request, gcs_server): scheme='http', # Mock endpoint doesn't check credentials. 
anonymous=True, - retry_time_limit=timedelta(seconds=3) + retry_time_limit=timedelta(seconds=5) ) fs.create_dir(bucket) @@ -1402,7 +1402,7 @@ def test_filesystem_from_uri_gcs(gcs_server): uri = ("gs://anonymous@" + f"mybucket/foo/bar?scheme=http&endpoint_override={host}:{port}&" + - "retry_limit_seconds=3") + "retry_limit_seconds=5") fs, path = FileSystem.from_uri(uri) assert isinstance(fs, GcsFileSystem) From de6179c1080bb436981d6bacca6e86eb166fa566 Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Mon, 2 May 2022 14:14:49 +0900 Subject: [PATCH 60/85] Try installing testbench on arm64 macOS --- ci/scripts/install_gcs_testbench.sh | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/ci/scripts/install_gcs_testbench.sh b/ci/scripts/install_gcs_testbench.sh index 0282e0fda50..2c63a58e3e2 100755 --- a/ci/scripts/install_gcs_testbench.sh +++ b/ci/scripts/install_gcs_testbench.sh @@ -24,10 +24,15 @@ if [ "$#" -ne 1 ]; then exit 1 fi -if [ "$(uname -m)" != "x86_64" ]; then - echo "GCS testbench won't install on non-x86 architecture" - exit 0 -fi +case "$(uname -s)-$(uname -m)" in + Linux-x86_64|Darwin-*) + : # OK + ;; + *) + echo "GCS testbench won't install on non-x86 architecture Linux" + exit 0 + ;; +esac version=$1 if [[ "${version}" -eq "default" ]]; then From e4e895c69545e66185d208b5e3ac2fd6158674d4 Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Fri, 6 May 2022 15:58:04 +0900 Subject: [PATCH 61/85] Install python3-dev to build crc32c locally --- .travis.yml | 3 --- ci/docker/ubuntu-20.04-cpp.dockerfile | 1 + ci/scripts/install_gcs_testbench.sh | 10 ---------- 3 files changed, 1 insertion(+), 13 deletions(-) diff --git a/.travis.yml b/.travis.yml index f906ba8686d..c006b22bbf7 100644 --- a/.travis.yml +++ b/.travis.yml @@ -60,7 +60,6 @@ jobs: DOCKER_RUN_ARGS: >- " -e ARROW_BUILD_STATIC=OFF - -e ARROW_GCS=OFF -e ARROW_ORC=OFF -e ARROW_USE_GLOG=OFF -e CMAKE_UNITY_BUILD=ON @@ -96,7 +95,6 @@ jobs: " -e ARROW_BUILD_STATIC=OFF -e 
ARROW_FLIGHT=ON - -e ARROW_GCS=OFF -e ARROW_MIMALLOC=OFF -e ARROW_ORC=OFF -e ARROW_SUBSTRAIT=OFF @@ -148,7 +146,6 @@ jobs: " -e ARROW_BUILD_STATIC=OFF -e ARROW_FLIGHT=ON - -e ARROW_GCS=OFF -e ARROW_MIMALLOC=OFF -e ARROW_ORC=OFF -e ARROW_PARQUET=OFF diff --git a/ci/docker/ubuntu-20.04-cpp.dockerfile b/ci/docker/ubuntu-20.04-cpp.dockerfile index 6e811ea2f71..0eade393cd4 100644 --- a/ci/docker/ubuntu-20.04-cpp.dockerfile +++ b/ci/docker/ubuntu-20.04-cpp.dockerfile @@ -96,6 +96,7 @@ RUN apt-get update -y -q && \ nlohmann-json3-dev \ pkg-config \ protobuf-compiler \ + python3-dev \ python3-pip \ python3-rados \ rados-objclass-dev \ diff --git a/ci/scripts/install_gcs_testbench.sh b/ci/scripts/install_gcs_testbench.sh index 2c63a58e3e2..afd4751a53c 100755 --- a/ci/scripts/install_gcs_testbench.sh +++ b/ci/scripts/install_gcs_testbench.sh @@ -24,16 +24,6 @@ if [ "$#" -ne 1 ]; then exit 1 fi -case "$(uname -s)-$(uname -m)" in - Linux-x86_64|Darwin-*) - : # OK - ;; - *) - echo "GCS testbench won't install on non-x86 architecture Linux" - exit 0 - ;; -esac - version=$1 if [[ "${version}" -eq "default" ]]; then version="v0.16.0" From bfad720a6381649a0780b062e127644a639cf921 Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Mon, 9 May 2022 14:28:07 +0900 Subject: [PATCH 62/85] x86 and arm are only enabled --- ci/scripts/install_gcs_testbench.sh | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/ci/scripts/install_gcs_testbench.sh b/ci/scripts/install_gcs_testbench.sh index afd4751a53c..9e2f8212451 100755 --- a/ci/scripts/install_gcs_testbench.sh +++ b/ci/scripts/install_gcs_testbench.sh @@ -24,6 +24,16 @@ if [ "$#" -ne 1 ]; then exit 1 fi +case "$(uname -m)" in + aarch64|arm64|x86_64) + : # OK + ;; + *) + echo "GCS testbench is installed only on x86 or arm architectures: $(uname -m)" + exit 0 + ;; +esac + version=$1 if [[ "${version}" -eq "default" ]]; then version="v0.16.0" From 2a6bcd381c3b75ed8d60e2a4ebc77b412cab635a Mon Sep 17 00:00:00 2001 From: Sutou Kouhei 
Date: Tue, 17 May 2022 09:16:47 +0900 Subject: [PATCH 63/85] Don't use wheel for grpcio on arm64 macOS --- ci/scripts/install_gcs_testbench.sh | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/ci/scripts/install_gcs_testbench.sh b/ci/scripts/install_gcs_testbench.sh index 9e2f8212451..81beb90f3d6 100755 --- a/ci/scripts/install_gcs_testbench.sh +++ b/ci/scripts/install_gcs_testbench.sh @@ -34,6 +34,14 @@ case "$(uname -m)" in ;; esac +case "$(uname -s)-$(uname -m)" in + Darwin-arm64) + # Workaround for https://github.com/grpc/grpc/issues/28387 . + # Build grpcio instead of using wheel. + ${PYTHON:-python3} -m pip install --no-binary :all: grpcio + ;; +esac + version=$1 if [[ "${version}" -eq "default" ]]; then version="v0.16.0" From 1783e81c79c8066129ea7d5715dc562574ceb286 Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Tue, 17 May 2022 09:32:30 +0900 Subject: [PATCH 64/85] Use grpcio 1.44.0 --- ci/scripts/install_gcs_testbench.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ci/scripts/install_gcs_testbench.sh b/ci/scripts/install_gcs_testbench.sh index 81beb90f3d6..c7a6ee7a6dc 100755 --- a/ci/scripts/install_gcs_testbench.sh +++ b/ci/scripts/install_gcs_testbench.sh @@ -38,7 +38,8 @@ case "$(uname -s)-$(uname -m)" in Darwin-arm64) # Workaround for https://github.com/grpc/grpc/issues/28387 . # Build grpcio instead of using wheel. - ${PYTHON:-python3} -m pip install --no-binary :all: grpcio + # storage-testbench 0.16.0 pins grpcio to 1.44.0. 
+ ${PYTHON:-python3} -m pip install --no-binary :all: "grpcio==1.44.0" ;; esac From a337c67e0300fccad0e26bd9378644e5ef9c3ee4 Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Tue, 17 May 2022 16:11:27 +0900 Subject: [PATCH 65/85] Disable GCS on s390x --- .travis.yml | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/.travis.yml b/.travis.yml index c006b22bbf7..b3aa724107c 100644 --- a/.travis.yml +++ b/.travis.yml @@ -95,13 +95,14 @@ jobs: " -e ARROW_BUILD_STATIC=OFF -e ARROW_FLIGHT=ON + -e ARROW_GCS=OFF -e ARROW_MIMALLOC=OFF -e ARROW_ORC=OFF - -e ARROW_SUBSTRAIT=OFF -e ARROW_PARQUET=OFF -e ARROW_S3=OFF - -e CMAKE_UNITY_BUILD=ON + -e ARROW_SUBSTRAIT=OFF -e CMAKE_BUILD_PARALLEL_LEVEL=2 + -e CMAKE_UNITY_BUILD=ON -e PARQUET_BUILD_EXAMPLES=OFF -e PARQUET_BUILD_EXECUTABLES=OFF -e Protobuf_SOURCE=BUNDLED @@ -146,13 +147,14 @@ jobs: " -e ARROW_BUILD_STATIC=OFF -e ARROW_FLIGHT=ON + -e ARROW_GCS=OFF -e ARROW_MIMALLOC=OFF -e ARROW_ORC=OFF -e ARROW_PARQUET=OFF -e ARROW_PYTHON=ON -e ARROW_S3=OFF - -e CMAKE_UNITY_BUILD=ON -e CMAKE_BUILD_PARALLEL_LEVEL=2 + -e CMAKE_UNITY_BUILD=ON -e PARQUET_BUILD_EXAMPLES=OFF -e PARQUET_BUILD_EXECUTABLES=OFF -e Protobuf_SOURCE=BUNDLED From 62ad5c5d3e2c3afea1fd66ebded8c0d48639c7cd Mon Sep 17 00:00:00 2001 From: Micah Kornfield Date: Wed, 18 May 2022 05:37:34 +0000 Subject: [PATCH 66/85] up timeout to 20 seconds --- cpp/src/arrow/filesystem/gcsfs_test.cc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cpp/src/arrow/filesystem/gcsfs_test.cc b/cpp/src/arrow/filesystem/gcsfs_test.cc index 5d6e4840d2d..5f4ae542789 100644 --- a/cpp/src/arrow/filesystem/gcsfs_test.cc +++ b/cpp/src/arrow/filesystem/gcsfs_test.cc @@ -175,7 +175,7 @@ class GcsIntegrationTest : public ::testing::Test { GcsOptions TestGcsOptions() { auto options = GcsOptions::Anonymous(); options.endpoint_override = "127.0.0.1:" + Testbench()->port(); - options.retry_limit_seconds = 5; + options.retry_limit_seconds = 20; return options; } @@ 
-337,14 +337,14 @@ TEST(GcsFileSystem, OptionsFromUri) { options, GcsOptions::FromUri("gs://mybucket/foo/bar/" "?endpoint_override=localhost&scheme=http&location=us-west2" - "&retry_limit_seconds=5", + "&retry_limit_seconds=20", &path)); EXPECT_EQ(options.default_bucket_location, "us-west2"); EXPECT_EQ(options.scheme, "http"); EXPECT_EQ(options.endpoint_override, "localhost"); EXPECT_EQ(path, "mybucket/foo/bar"); ASSERT_TRUE(options.retry_limit_seconds.has_value()); - EXPECT_EQ(*options.retry_limit_seconds, 5); + EXPECT_EQ(*options.retry_limit_seconds, 20); // Missing bucket name ASSERT_RAISES(Invalid, GcsOptions::FromUri("gs:///foo/bar/", &path)); From 5c5b9e24e0f1e579294694cc7b06173b977483e8 Mon Sep 17 00:00:00 2001 From: Micah Kornfield Date: Tue, 7 Jun 2022 06:51:38 +0000 Subject: [PATCH 67/85] add back conftest --- python/pyarrow/conftest.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/python/pyarrow/conftest.py b/python/pyarrow/conftest.py index 2fea2e9cb23..638dad8568a 100644 --- a/python/pyarrow/conftest.py +++ b/python/pyarrow/conftest.py @@ -147,6 +147,13 @@ except ImportError: pass +try: + from pyarrow.fs import GcsFileSystem # noqa + defaults['gcs'] = True +except ImportError: + pass + + try: from pyarrow.fs import S3FileSystem # noqa defaults['s3'] = True From 870ed6e934342dab5614f45c3e8fb584a6a9d074 Mon Sep 17 00:00:00 2001 From: Micah Kornfield Date: Tue, 7 Jun 2022 06:52:37 +0000 Subject: [PATCH 68/85] add logging to equality for debugging --- cpp/src/arrow/filesystem/gcsfs.cc | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/cpp/src/arrow/filesystem/gcsfs.cc b/cpp/src/arrow/filesystem/gcsfs.cc index 562881d0253..b9a057220d1 100644 --- a/cpp/src/arrow/filesystem/gcsfs.cc +++ b/cpp/src/arrow/filesystem/gcsfs.cc @@ -28,6 +28,7 @@ #include "arrow/io/util_internal.h" #include "arrow/result.h" #include "arrow/util/checked_cast.h" +#include "arrow/util/logging.h" #include "arrow/util/thread_pool.h" #define 
ARROW_GCS_RETURN_NOT_OK(expr) \ @@ -46,6 +47,11 @@ bool GcsCredentials::Equals(const GcsCredentials& other) const { if (holder_->credentials == other.holder_->credentials) { return true; } + ARROW_LOG(INFO) << "anonymous: " << (anonymous_ == other.anonymous_) + << " access_token: " << (access_token_ == other.access_token_) + << " expiration: " << (expiration_ == other.expiration_) + << " json credentials: " << (json_credentials_ == other.json_credentials_) + << " target service account: " << (target_service_account_ == other.target_service_account_); return anonymous_ == other.anonymous_ && access_token_ == other.access_token_ && expiration_ == other.expiration_ && json_credentials_ == other.json_credentials_ && @@ -700,6 +706,12 @@ GcsOptions::GcsOptions() { } bool GcsOptions::Equals(const GcsOptions& other) const { + ARROW_LOG(INFO) << "credentials: " << (credentials.Equals(other.credentials)) + << " endpont_override: " << (endpoint_override == other.endpoint_override) + << " scheme: " << (scheme == other.scheme) + << " default bucket_location " << (default_bucket_location == other.default_bucket_location) + << " retry_limit_secodns: " << (retry_limit_seconds == other.retry_limit_seconds); + return credentials.Equals(other.credentials) && endpoint_override == other.endpoint_override && scheme == other.scheme && default_bucket_location == other.default_bucket_location && From c51415803deada6dfcee9bc834ad78ad2c92b83f Mon Sep 17 00:00:00 2001 From: Micah Kornfield Date: Tue, 7 Jun 2022 07:38:02 +0000 Subject: [PATCH 69/85] add logging for timepoint --- cpp/src/arrow/filesystem/gcsfs.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/arrow/filesystem/gcsfs.cc b/cpp/src/arrow/filesystem/gcsfs.cc index b9a057220d1..6039698dfb4 100644 --- a/cpp/src/arrow/filesystem/gcsfs.cc +++ b/cpp/src/arrow/filesystem/gcsfs.cc @@ -49,7 +49,7 @@ bool GcsCredentials::Equals(const GcsCredentials& other) const { } ARROW_LOG(INFO) << "anonymous: " << (anonymous_ 
== other.anonymous_) << " access_token: " << (access_token_ == other.access_token_) - << " expiration: " << (expiration_ == other.expiration_) + << " expiration: " << (expiration_ == other.expiration_) << " this:" << expiration_.time_since_epoch().count() << " " << " that" << other.expiration_.time_since_epoch().count() << " json credentials: " << (json_credentials_ == other.json_credentials_) << " target service account: " << (target_service_account_ == other.target_service_account_); return anonymous_ == other.anonymous_ && access_token_ == other.access_token_ && From 21be233c637a9bccbec83bfc71fbaa7538d94212 Mon Sep 17 00:00:00 2001 From: Micah Kornfield Date: Wed, 8 Jun 2022 04:52:57 +0000 Subject: [PATCH 70/85] accept nanos and use it for expiration --- python/pyarrow/_gcsfs.pyx | 30 +++++++++++++++++++----------- 1 file changed, 19 insertions(+), 11 deletions(-) diff --git a/python/pyarrow/_gcsfs.pyx b/python/pyarrow/_gcsfs.pyx index 8af5e4abd6e..a61c789a192 100644 --- a/python/pyarrow/_gcsfs.pyx +++ b/python/pyarrow/_gcsfs.pyx @@ -22,6 +22,7 @@ from pyarrow.lib cimport (check_status, pyarrow_wrap_metadata, from pyarrow.lib import frombytes, tobytes, KeyValueMetadata, ensure_metadata from pyarrow.includes.common cimport * from pyarrow.includes.libarrow cimport * +from pyarrow.includes.libarrow_python cimport TimePoint_from_ns from pyarrow.includes.libarrow_fs cimport * from pyarrow._fs cimport FileSystem, TimePoint_to_ns, PyDateTime_to_TimePoint from cython.operator cimport dereference as deref @@ -61,9 +62,10 @@ cdef class GcsFileSystem(FileSystem): An optional service account to try to impersonate when accessing GCS. This requires the specified credential user or service account to have the necessary permissions. - credential_token_expiration : datetime, default None + credential_token_expiration : datetime or int, default None Expiration for credential generated with an access token. Must be specified - if `access_token` is specified. 
+ if `access_token` is specified. If it is an int it represents + nanoseconds since the epoch. default_bucket_location : str, default 'US' GCP region to create buckets in. scheme : str, default 'https' @@ -109,12 +111,18 @@ cdef class GcsFileSystem(FileSystem): elif anonymous: options = CGcsOptions.Anonymous() elif access_token: - if not isinstance(credential_token_expiration, datetime): - raise ValueError( - "credential_token_expiration must be a datetime") - options = CGcsOptions.FromAccessToken( - tobytes(access_token), - PyDateTime_to_TimePoint(credential_token_expiration)) + if isinstance(credential_token_expiration, int): + options = CGcsOptions.FromAccessToken( + tobytes(access_token), + TimePoint_from_ns(credential_token_expiration)) + elif isinstance(credential_token_expiration, datetime): + options = CGcsOptions.FromAccessToken( + tobytes(access_token), + PyDateTime_to_TimePoint(credential_token_expiration)) + else: + raise ValueError( + "credential_token_expiration must be a datetime or int") + else: options = CGcsOptions.Defaults() @@ -151,17 +159,17 @@ cdef class GcsFileSystem(FileSystem): def _reconstruct(cls, kwargs): return cls(**kwargs) - def _expiration_datetime_from_options(self): + def _expiration_ns_from_options(self): expiration_ns = TimePoint_to_ns( self.gcsfs.options().credentials.expiration()) if expiration_ns == 0: return None - return datetime.fromtimestamp(expiration_ns / 1e9) + return expiration_ns def __reduce__(self): cdef CGcsOptions opts = self.gcsfs.options() service_account = frombytes(opts.credentials.target_service_account()) - expiration_dt = self._expiration_datetime_from_options() + expiration_dt = self._expiration_ns_from_options() retry_time_limit = None if opts.retry_limit_seconds.has_value(): retry_time_limit = timedelta( From 2af67fd34504dddff05449e7566829f543134aa9 Mon Sep 17 00:00:00 2001 From: Micah Kornfield Date: Wed, 8 Jun 2022 05:13:12 +0000 Subject: [PATCH 71/85] Revert "add logging for timepoint" This reverts 
commit f2614c7cd692565daac7bfcc3a47a5c11484febd. --- cpp/src/arrow/filesystem/gcsfs.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/arrow/filesystem/gcsfs.cc b/cpp/src/arrow/filesystem/gcsfs.cc index 6039698dfb4..b9a057220d1 100644 --- a/cpp/src/arrow/filesystem/gcsfs.cc +++ b/cpp/src/arrow/filesystem/gcsfs.cc @@ -49,7 +49,7 @@ bool GcsCredentials::Equals(const GcsCredentials& other) const { } ARROW_LOG(INFO) << "anonymous: " << (anonymous_ == other.anonymous_) << " access_token: " << (access_token_ == other.access_token_) - << " expiration: " << (expiration_ == other.expiration_) << " this:" << expiration_.time_since_epoch().count() << " " << " that" << other.expiration_.time_since_epoch().count() + << " expiration: " << (expiration_ == other.expiration_) << " json credentials: " << (json_credentials_ == other.json_credentials_) << " target service account: " << (target_service_account_ == other.target_service_account_); return anonymous_ == other.anonymous_ && access_token_ == other.access_token_ && From 314791399ebb65cfbaa69acd3611662d81054c44 Mon Sep 17 00:00:00 2001 From: Micah Kornfield Date: Wed, 8 Jun 2022 05:13:17 +0000 Subject: [PATCH 72/85] Revert "add logging to equality for debugging" This reverts commit a27f86fa20a12be34c18902c3c440ed9bea8a170. 
--- cpp/src/arrow/filesystem/gcsfs.cc | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/cpp/src/arrow/filesystem/gcsfs.cc b/cpp/src/arrow/filesystem/gcsfs.cc index b9a057220d1..562881d0253 100644 --- a/cpp/src/arrow/filesystem/gcsfs.cc +++ b/cpp/src/arrow/filesystem/gcsfs.cc @@ -28,7 +28,6 @@ #include "arrow/io/util_internal.h" #include "arrow/result.h" #include "arrow/util/checked_cast.h" -#include "arrow/util/logging.h" #include "arrow/util/thread_pool.h" #define ARROW_GCS_RETURN_NOT_OK(expr) \ @@ -47,11 +46,6 @@ bool GcsCredentials::Equals(const GcsCredentials& other) const { if (holder_->credentials == other.holder_->credentials) { return true; } - ARROW_LOG(INFO) << "anonymous: " << (anonymous_ == other.anonymous_) - << " access_token: " << (access_token_ == other.access_token_) - << " expiration: " << (expiration_ == other.expiration_) - << " json credentials: " << (json_credentials_ == other.json_credentials_) - << " target service account: " << (target_service_account_ == other.target_service_account_); return anonymous_ == other.anonymous_ && access_token_ == other.access_token_ && expiration_ == other.expiration_ && json_credentials_ == other.json_credentials_ && @@ -706,12 +700,6 @@ GcsOptions::GcsOptions() { } bool GcsOptions::Equals(const GcsOptions& other) const { - ARROW_LOG(INFO) << "credentials: " << (credentials.Equals(other.credentials)) - << " endpont_override: " << (endpoint_override == other.endpoint_override) - << " scheme: " << (scheme == other.scheme) - << " default bucket_location " << (default_bucket_location == other.default_bucket_location) - << " retry_limit_secodns: " << (retry_limit_seconds == other.retry_limit_seconds); - return credentials.Equals(other.credentials) && endpoint_override == other.endpoint_override && scheme == other.scheme && default_bucket_location == other.default_bucket_location && From 67324025649346cecc47f138ea15eaaf78f88d9a Mon Sep 17 00:00:00 2001 From: Micah Kornfield Date: Wed, 8 Jun 2022 
05:19:24 +0000 Subject: [PATCH 73/85] fix lint --- python/pyarrow/_gcsfs.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/pyarrow/_gcsfs.pyx b/python/pyarrow/_gcsfs.pyx index a61c789a192..97d0d629618 100644 --- a/python/pyarrow/_gcsfs.pyx +++ b/python/pyarrow/_gcsfs.pyx @@ -120,7 +120,7 @@ cdef class GcsFileSystem(FileSystem): tobytes(access_token), PyDateTime_to_TimePoint(credential_token_expiration)) else: - raise ValueError( + raise ValueError( "credential_token_expiration must be a datetime or int") else: From 544acfdca2a7029499b2c114957897062913bc73 Mon Sep 17 00:00:00 2001 From: Micah Kornfield Date: Wed, 8 Jun 2022 22:00:41 +0000 Subject: [PATCH 74/85] Revert "fix lint" This reverts commit 6d8e3462d58120ff2ff0e5998da8f72afd28947d. --- python/pyarrow/_gcsfs.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/pyarrow/_gcsfs.pyx b/python/pyarrow/_gcsfs.pyx index 97d0d629618..a61c789a192 100644 --- a/python/pyarrow/_gcsfs.pyx +++ b/python/pyarrow/_gcsfs.pyx @@ -120,7 +120,7 @@ cdef class GcsFileSystem(FileSystem): tobytes(access_token), PyDateTime_to_TimePoint(credential_token_expiration)) else: - raise ValueError( + raise ValueError( "credential_token_expiration must be a datetime or int") else: From 843b99dcd23a8d904922dd64e78550280820c1a7 Mon Sep 17 00:00:00 2001 From: Micah Kornfield Date: Wed, 8 Jun 2022 22:00:51 +0000 Subject: [PATCH 75/85] Revert "accept nanos and use it for expiration" This reverts commit a071da99a6a7c29ac01271f816ad53d157840416. 
--- python/pyarrow/_gcsfs.pyx | 30 +++++++++++------------------- 1 file changed, 11 insertions(+), 19 deletions(-) diff --git a/python/pyarrow/_gcsfs.pyx b/python/pyarrow/_gcsfs.pyx index a61c789a192..8af5e4abd6e 100644 --- a/python/pyarrow/_gcsfs.pyx +++ b/python/pyarrow/_gcsfs.pyx @@ -22,7 +22,6 @@ from pyarrow.lib cimport (check_status, pyarrow_wrap_metadata, from pyarrow.lib import frombytes, tobytes, KeyValueMetadata, ensure_metadata from pyarrow.includes.common cimport * from pyarrow.includes.libarrow cimport * -from pyarrow.includes.libarrow_python cimport TimePoint_from_ns from pyarrow.includes.libarrow_fs cimport * from pyarrow._fs cimport FileSystem, TimePoint_to_ns, PyDateTime_to_TimePoint from cython.operator cimport dereference as deref @@ -62,10 +61,9 @@ cdef class GcsFileSystem(FileSystem): An optional service account to try to impersonate when accessing GCS. This requires the specified credential user or service account to have the necessary permissions. - credential_token_expiration : datetime or int, default None + credential_token_expiration : datetime, default None Expiration for credential generated with an access token. Must be specified - if `access_token` is specified. If it is an int it represents - nanoseconds since the epoch. + if `access_token` is specified. default_bucket_location : str, default 'US' GCP region to create buckets in. 
scheme : str, default 'https' @@ -111,18 +109,12 @@ cdef class GcsFileSystem(FileSystem): elif anonymous: options = CGcsOptions.Anonymous() elif access_token: - if isinstance(credential_token_expiration, int): - options = CGcsOptions.FromAccessToken( - tobytes(access_token), - TimePoint_from_ns(credential_token_expiration)) - elif isinstance(credential_token_expiration, datetime): - options = CGcsOptions.FromAccessToken( - tobytes(access_token), - PyDateTime_to_TimePoint(credential_token_expiration)) - else: - raise ValueError( - "credential_token_expiration must be a datetime or int") - + if not isinstance(credential_token_expiration, datetime): + raise ValueError( + "credential_token_expiration must be a datetime") + options = CGcsOptions.FromAccessToken( + tobytes(access_token), + PyDateTime_to_TimePoint(credential_token_expiration)) else: options = CGcsOptions.Defaults() @@ -159,17 +151,17 @@ cdef class GcsFileSystem(FileSystem): def _reconstruct(cls, kwargs): return cls(**kwargs) - def _expiration_ns_from_options(self): + def _expiration_datetime_from_options(self): expiration_ns = TimePoint_to_ns( self.gcsfs.options().credentials.expiration()) if expiration_ns == 0: return None - return expiration_ns + return datetime.fromtimestamp(expiration_ns / 1e9) def __reduce__(self): cdef CGcsOptions opts = self.gcsfs.options() service_account = frombytes(opts.credentials.target_service_account()) - expiration_dt = self._expiration_ns_from_options() + expiration_dt = self._expiration_datetime_from_options() retry_time_limit = None if opts.retry_limit_seconds.has_value(): retry_time_limit = timedelta( From 75f8397f3c97848960bdc34ee882c7bfdc897c93 Mon Sep 17 00:00:00 2001 From: Micah Kornfield Date: Wed, 8 Jun 2022 22:18:52 +0000 Subject: [PATCH 76/85] pass timezone to datetime --- python/pyarrow/_gcsfs.pyx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/pyarrow/_gcsfs.pyx b/python/pyarrow/_gcsfs.pyx index 8af5e4abd6e..1c598c00445 100644 --- 
a/python/pyarrow/_gcsfs.pyx +++ b/python/pyarrow/_gcsfs.pyx @@ -26,7 +26,7 @@ from pyarrow.includes.libarrow_fs cimport * from pyarrow._fs cimport FileSystem, TimePoint_to_ns, PyDateTime_to_TimePoint from cython.operator cimport dereference as deref -from datetime import datetime, timedelta +from datetime import datetime, timedelta, timezone cdef class GcsFileSystem(FileSystem): @@ -156,7 +156,7 @@ cdef class GcsFileSystem(FileSystem): self.gcsfs.options().credentials.expiration()) if expiration_ns == 0: return None - return datetime.fromtimestamp(expiration_ns / 1e9) + return datetime.fromtimestamp(expiration_ns / 1.0e9, timezone.utc) def __reduce__(self): cdef CGcsOptions opts = self.gcsfs.options() From 20c35d5b927e90aa57b514e0cc41243400219dfc Mon Sep 17 00:00:00 2001 From: emkornfield Date: Wed, 8 Jun 2022 20:27:21 -0700 Subject: [PATCH 77/85] increase timeout --- cpp/src/arrow/filesystem/gcsfs_test.cc | 6 +++--- python/pyarrow/tests/test_fs.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/cpp/src/arrow/filesystem/gcsfs_test.cc b/cpp/src/arrow/filesystem/gcsfs_test.cc index 5f4ae542789..a1d9efb712c 100644 --- a/cpp/src/arrow/filesystem/gcsfs_test.cc +++ b/cpp/src/arrow/filesystem/gcsfs_test.cc @@ -175,7 +175,7 @@ class GcsIntegrationTest : public ::testing::Test { GcsOptions TestGcsOptions() { auto options = GcsOptions::Anonymous(); options.endpoint_override = "127.0.0.1:" + Testbench()->port(); - options.retry_limit_seconds = 20; + options.retry_limit_seconds = 40; return options; } @@ -337,14 +337,14 @@ TEST(GcsFileSystem, OptionsFromUri) { options, GcsOptions::FromUri("gs://mybucket/foo/bar/" "?endpoint_override=localhost&scheme=http&location=us-west2" - "&retry_limit_seconds=20", + "&retry_limit_seconds=40", &path)); EXPECT_EQ(options.default_bucket_location, "us-west2"); EXPECT_EQ(options.scheme, "http"); EXPECT_EQ(options.endpoint_override, "localhost"); EXPECT_EQ(path, "mybucket/foo/bar"); 
ASSERT_TRUE(options.retry_limit_seconds.has_value()); - EXPECT_EQ(*options.retry_limit_seconds, 20); + EXPECT_EQ(*options.retry_limit_seconds, 40); // Missing bucket name ASSERT_RAISES(Invalid, GcsOptions::FromUri("gs:///foo/bar/", &path)); diff --git a/python/pyarrow/tests/test_fs.py b/python/pyarrow/tests/test_fs.py index 85a12c8ed56..4bd532525ac 100644 --- a/python/pyarrow/tests/test_fs.py +++ b/python/pyarrow/tests/test_fs.py @@ -215,7 +215,7 @@ def gcsfs(request, gcs_server): scheme='http', # Mock endpoint doesn't check credentials. anonymous=True, - retry_time_limit=timedelta(seconds=5) + retry_time_limit=timedelta(seconds=45) ) fs.create_dir(bucket) From 724694053ecead06be730ea07243becd520fe25e Mon Sep 17 00:00:00 2001 From: emkornfield Date: Fri, 10 Jun 2022 09:53:56 -0700 Subject: [PATCH 78/85] Apply suggestions from code review Co-authored-by: Antoine Pitrou --- cpp/src/arrow/filesystem/gcsfs.cc | 2 +- cpp/src/arrow/filesystem/gcsfs.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/filesystem/gcsfs.cc b/cpp/src/arrow/filesystem/gcsfs.cc index 562881d0253..f41cd1c9e33 100644 --- a/cpp/src/arrow/filesystem/gcsfs.cc +++ b/cpp/src/arrow/filesystem/gcsfs.cc @@ -804,7 +804,7 @@ Result GcsOptions::FromUri(const arrow::internal::Uri& uri, int parsed_seconds = atoi(kv.second.c_str()); if (parsed_seconds <= 0) { return Status::Invalid( - "retry_limit_seconds must be a positive integer. found '", kv.second, "'"); + "retry_limit_seconds must be a positive integer, got '", kv.second, "'"); } options.retry_limit_seconds = parsed_seconds; } else { diff --git a/cpp/src/arrow/filesystem/gcsfs.h b/cpp/src/arrow/filesystem/gcsfs.h index d44ea6a40cb..7d0dbcebd45 100644 --- a/cpp/src/arrow/filesystem/gcsfs.h +++ b/cpp/src/arrow/filesystem/gcsfs.h @@ -57,7 +57,7 @@ class ARROW_EXPORT GcsCredentials { /// Options for the GcsFileSystem implementation. 
struct ARROW_EXPORT GcsOptions { - /// \brief Equivelant to the GcsOptions::Defaults() that returns. + /// \brief Equivalent to GcsOptions::Defaults(). GcsOptions(); GcsCredentials credentials; From 2d5c0f762d27e775d6f56713fb018ef64b25ddbd Mon Sep 17 00:00:00 2001 From: Micah Kornfield Date: Fri, 10 Jun 2022 17:25:28 +0000 Subject: [PATCH 79/85] address code review feedback --- cpp/src/arrow/filesystem/gcsfs.cc | 11 ++++++----- cpp/src/arrow/filesystem/gcsfs.h | 2 +- cpp/src/arrow/filesystem/gcsfs_test.cc | 8 ++++---- python/pyarrow/_gcsfs.pyx | 4 ++-- python/pyarrow/includes/libarrow_fs.pxd | 2 +- 5 files changed, 14 insertions(+), 13 deletions(-) diff --git a/cpp/src/arrow/filesystem/gcsfs.cc b/cpp/src/arrow/filesystem/gcsfs.cc index f41cd1c9e33..82d2439a607 100644 --- a/cpp/src/arrow/filesystem/gcsfs.cc +++ b/cpp/src/arrow/filesystem/gcsfs.cc @@ -320,7 +320,8 @@ google::cloud::Options AsGoogleCloudOptions(const GcsOptions& o) { } if (o.retry_limit_seconds.has_value()) { options.set( - gcs::LimitedTimeRetryPolicy(std::chrono::seconds(*o.retry_limit_seconds)) + gcs::LimitedTimeRetryPolicy( + std::chrono::milliseconds(static_cast(*o.retry_limit_seconds * 1000))) .clone()); } return options; @@ -801,10 +802,10 @@ Result GcsOptions::FromUri(const arrow::internal::Uri& uri, } else if (kv.first == "endpoint_override") { options.endpoint_override = kv.second; } else if (kv.first == "retry_limit_seconds") { - int parsed_seconds = atoi(kv.second.c_str()); - if (parsed_seconds <= 0) { - return Status::Invalid( - "retry_limit_seconds must be a positive integer, got '", kv.second, "'"); + double parsed_seconds = atof(kv.second.c_str()); + if (parsed_seconds <= 0.0) { + return Status::Invalid("retry_limit_seconds must be a positive integer, got '", + kv.second, "'"); } options.retry_limit_seconds = parsed_seconds; } else { diff --git a/cpp/src/arrow/filesystem/gcsfs.h b/cpp/src/arrow/filesystem/gcsfs.h index 7d0dbcebd45..8458c7f2108 100644 --- 
a/cpp/src/arrow/filesystem/gcsfs.h +++ b/cpp/src/arrow/filesystem/gcsfs.h @@ -70,7 +70,7 @@ struct ARROW_EXPORT GcsOptions { /// errors. /// /// The default policy is to retry for up to 15 minutes. - arrow::util::optional retry_limit_seconds; + arrow::util::optional retry_limit_seconds; /// \brief Default metadata for OpenOutputStream. /// diff --git a/cpp/src/arrow/filesystem/gcsfs_test.cc b/cpp/src/arrow/filesystem/gcsfs_test.cc index a1d9efb712c..851c4ccc2a1 100644 --- a/cpp/src/arrow/filesystem/gcsfs_test.cc +++ b/cpp/src/arrow/filesystem/gcsfs_test.cc @@ -175,7 +175,7 @@ class GcsIntegrationTest : public ::testing::Test { GcsOptions TestGcsOptions() { auto options = GcsOptions::Anonymous(); options.endpoint_override = "127.0.0.1:" + Testbench()->port(); - options.retry_limit_seconds = 40; + options.retry_limit_seconds = 60; return options; } @@ -337,14 +337,14 @@ TEST(GcsFileSystem, OptionsFromUri) { options, GcsOptions::FromUri("gs://mybucket/foo/bar/" "?endpoint_override=localhost&scheme=http&location=us-west2" - "&retry_limit_seconds=40", + "&retry_limit_seconds=40.5", &path)); EXPECT_EQ(options.default_bucket_location, "us-west2"); EXPECT_EQ(options.scheme, "http"); EXPECT_EQ(options.endpoint_override, "localhost"); EXPECT_EQ(path, "mybucket/foo/bar"); ASSERT_TRUE(options.retry_limit_seconds.has_value()); - EXPECT_EQ(*options.retry_limit_seconds, 40); + EXPECT_EQ(*options.retry_limit_seconds, 40.5); // Missing bucket name ASSERT_RAISES(Invalid, GcsOptions::FromUri("gs:///foo/bar/", &path)); @@ -796,7 +796,7 @@ TEST_F(GcsIntegrationTest, CreateDirUri) { TEST_F(GcsIntegrationTest, DeleteBucketDirSuccess) { auto fs = GcsFileSystem::Make(TestGcsOptions()); ASSERT_OK(fs->CreateDir("pyarrow-filesystem/", /*recursive=*/true)); - EXPECT_FALSE(fs->CreateDir("/", false).ok()); + ASSERT_RAISES(Invalid, fs->CreateDir("/", false)); ASSERT_OK(fs->DeleteDir("pyarrow-filesystem/")); } diff --git a/python/pyarrow/_gcsfs.pyx b/python/pyarrow/_gcsfs.pyx index 
1c598c00445..9cff12fb2ea 100644 --- a/python/pyarrow/_gcsfs.pyx +++ b/python/pyarrow/_gcsfs.pyx @@ -91,7 +91,7 @@ cdef class GcsFileSystem(FileSystem): cdef: CGcsOptions options shared_ptr[CGcsFileSystem] wrapped - int64_t time_limit_seconds + double time_limit_seconds # Intentional use of truthiness because empty strings aren't valid and # for reconstruction from pickling will give empty strings. @@ -135,7 +135,7 @@ cdef class GcsFileSystem(FileSystem): options.default_metadata = pyarrow_unwrap_metadata( ensure_metadata(default_metadata)) if retry_time_limit is not None: - time_limit_seconds = int(retry_time_limit.total_seconds()) + time_limit_seconds = retry_time_limit.total_seconds() options.retry_limit_seconds = time_limit_seconds with nogil: diff --git a/python/pyarrow/includes/libarrow_fs.pxd b/python/pyarrow/includes/libarrow_fs.pxd index c6da0653c91..d0e15010031 100644 --- a/python/pyarrow/includes/libarrow_fs.pxd +++ b/python/pyarrow/includes/libarrow_fs.pxd @@ -211,7 +211,7 @@ cdef extern from "arrow/filesystem/api.h" namespace "arrow::fs" nogil: c_string endpoint_override c_string scheme c_string default_bucket_location - c_optional[int64_t] retry_limit_seconds + c_optional[double] retry_limit_seconds shared_ptr[const CKeyValueMetadata] default_metadata c_bool Equals(const CS3Options& other) From 2ff901d0f0199db31a914bcfb0ac313f2503a9aa Mon Sep 17 00:00:00 2001 From: Micah Kornfield Date: Fri, 10 Jun 2022 23:40:49 +0000 Subject: [PATCH 80/85] bump client policy --- cpp/src/arrow/filesystem/gcsfs_test.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/arrow/filesystem/gcsfs_test.cc b/cpp/src/arrow/filesystem/gcsfs_test.cc index 851c4ccc2a1..4d8f52ef48e 100644 --- a/cpp/src/arrow/filesystem/gcsfs_test.cc +++ b/cpp/src/arrow/filesystem/gcsfs_test.cc @@ -147,7 +147,7 @@ class GcsIntegrationTest : public ::testing::Test { .set(gc::MakeInsecureCredentials()) .set(std::chrono::seconds(5)) .set( - 
gcs::LimitedTimeRetryPolicy(std::chrono::seconds(5)).clone())); + gcs::LimitedTimeRetryPolicy(std::chrono::seconds(45)).clone())); google::cloud::StatusOr bucket = client.CreateBucketForProject( PreexistingBucketName(), "ignored-by-testbench", gcs::BucketMetadata{}); From 28158ee4b364ad00aca94f89147410cb9bb4628e Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Sat, 11 Jun 2022 10:22:12 +0200 Subject: [PATCH 81/85] Try to understand arch issues --- dev/tasks/python-wheels/github.osx.arm64.yml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/dev/tasks/python-wheels/github.osx.arm64.yml b/dev/tasks/python-wheels/github.osx.arm64.yml index 7198da7de47..cbdc07c15a4 100644 --- a/dev/tasks/python-wheels/github.osx.arm64.yml +++ b/dev/tasks/python-wheels/github.osx.arm64.yml @@ -48,6 +48,12 @@ jobs: {{ macros.github_checkout_arrow()|indent }} + - name: Debug runner environment + run: | + arch + $PYTHON -c "import platform; print(platform.machine())" + arch -arm64 $PYTHON -c "import platform; print(platform.machine())" + - name: Add Brew's Bison to PATH run: echo "/opt/homebrew/opt/bison/bin" >> $GITHUB_PATH From 922e4ef23400b7c7f1940ff1d2b06146e4ac4909 Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Sat, 11 Jun 2022 10:31:00 +0200 Subject: [PATCH 82/85] Try to rationalize uses of arch --- dev/tasks/python-wheels/github.osx.arm64.yml | 51 +++++++++----------- 1 file changed, 24 insertions(+), 27 deletions(-) diff --git a/dev/tasks/python-wheels/github.osx.arm64.yml b/dev/tasks/python-wheels/github.osx.arm64.yml index cbdc07c15a4..af60eb5bbac 100644 --- a/dev/tasks/python-wheels/github.osx.arm64.yml +++ b/dev/tasks/python-wheels/github.osx.arm64.yml @@ -42,18 +42,15 @@ jobs: build: name: Build wheel for OS X runs-on: ["self-hosted", "macOS", "arm64"] + # Note the GHA runner on our self-hosted ARM64 machines is currently + # a x86 binary, so need to prefix with "arch -arm64" when commands + # need to run in ARM64 mode. 
steps: - name: Cleanup run: rm -rf arrow vcpkg build crossbow-env build-*-env test-*-env {{ macros.github_checkout_arrow()|indent }} - - name: Debug runner environment - run: | - arch - $PYTHON -c "import platform; print(platform.machine())" - arch -arm64 $PYTHON -c "import platform; print(platform.machine())" - - name: Add Brew's Bison to PATH run: echo "/opt/homebrew/opt/bison/bin" >> $GITHUB_PATH @@ -88,10 +85,10 @@ jobs: ARROW_SIMD_LEVEL: "DEFAULT" VCPKG_DEFAULT_TRIPLET: arm64-osx-static-release run: | - $PYTHON -m venv build-arm64-env + arch -arm64 $PYTHON -m venv build-arm64-env source build-arm64-env/bin/activate pip install --upgrade pip wheel - arch -arm64 arrow/ci/scripts/python_wheel_macos_build.sh arm64 $(pwd)/arrow $(pwd)/build + arrow/ci/scripts/python_wheel_macos_build.sh arm64 $(pwd)/arrow $(pwd)/build {% if arch == "universal2" %} - name: Install AMD64 Packages @@ -112,10 +109,10 @@ jobs: ARROW_SIMD_LEVEL: "NONE" VCPKG_DEFAULT_TRIPLET: amd64-osx-static-release run: | - $PYTHON -m venv build-amd64-env + arch -x86_64 $PYTHON -m venv build-amd64-env source build-amd64-env/bin/activate pip install --upgrade pip wheel - arch -x86_64 arrow/ci/scripts/python_wheel_macos_build.sh x86_64 $(pwd)/arrow $(pwd)/build + arrow/ci/scripts/python_wheel_macos_build.sh x86_64 $(pwd)/arrow $(pwd)/build - name: Fuse AMD64 and ARM64 wheels run: | @@ -137,12 +134,12 @@ jobs: env: PYTEST_ADDOPTS: "-k 'not test_cancellation'" run: | - $PYTHON -m venv test-arm64-env + arch -arm64 $PYTHON -m venv test-arm64-env source test-arm64-env/bin/activate pip install --upgrade pip wheel - arch -arm64 pip install -r arrow/python/requirements-wheel-test.txt - PYTHON=python arch -arm64 arrow/ci/scripts/install_gcs_testbench.sh default - arch -arm64 arrow/ci/scripts/python_wheel_unix_test.sh $(pwd)/arrow + pip install -r arrow/python/requirements-wheel-test.txt + PYTHON=python arrow/ci/scripts/install_gcs_testbench.sh default + arrow/ci/scripts/python_wheel_unix_test.sh $(pwd)/arrow {% if 
arch == "universal2" %} - name: Test Wheel on AMD64 @@ -150,27 +147,27 @@ jobs: env: PYTEST_ADDOPTS: "-k 'not test_cancellation'" run: | - $PYTHON -m venv test-amd64-env + arch -x86_64 $PYTHON -m venv test-amd64-env source test-amd64-env/bin/activate pip install --upgrade pip wheel - arch -x86_64 pip install -r arrow/python/requirements-wheel-test.txt - PYTHON=python arch -x86_64 arrow/ci/scripts/install_gcs_testbench.sh default - arch -x86_64 arrow/ci/scripts/python_wheel_unix_test.sh $(pwd)/arrow + pip install -r arrow/python/requirements-wheel-test.txt + PYTHON=python arrow/ci/scripts/install_gcs_testbench.sh default + arrow/ci/scripts/python_wheel_unix_test.sh $(pwd)/arrow {% endif %} - name: Upload artifacts shell: bash run: | - $PYTHON -m venv crossbow-env + arch -x86_64 $PYTHON -m venv crossbow-env source crossbow-env/bin/activate - arch -x86_64 pip install -e arrow/dev/archery[crossbow-upload] - arch -x86_64 archery crossbow \ - --queue-path $(pwd) \ - --queue-remote {{ queue_remote_url }} \ - upload-artifacts \ - --sha {{ task.branch }} \ - --tag {{ task.tag }} \ - "arrow/python/repaired_wheels/*.whl" + pip install -e arrow/dev/archery[crossbow-upload] + archery crossbow \ + --queue-path $(pwd) \ + --queue-remote {{ queue_remote_url }} \ + upload-artifacts \ + --sha {{ task.branch }} \ + --tag {{ task.tag }} \ + "arrow/python/repaired_wheels/*.whl" env: CROSSBOW_GITHUB_TOKEN: {{ "${{ secrets.CROSSBOW_GITHUB_TOKEN || secrets.GITHUB_TOKEN }}" }} From 3a7c769e0cf115fe0522d03dc52b5f7854f4e3a5 Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Sat, 11 Jun 2022 10:44:49 +0200 Subject: [PATCH 83/85] Revert "Try to rationalize uses of arch" This reverts commit 922e4ef23400b7c7f1940ff1d2b06146e4ac4909. 
--- dev/tasks/python-wheels/github.osx.arm64.yml | 51 +++++++++++--------- 1 file changed, 27 insertions(+), 24 deletions(-) diff --git a/dev/tasks/python-wheels/github.osx.arm64.yml b/dev/tasks/python-wheels/github.osx.arm64.yml index af60eb5bbac..cbdc07c15a4 100644 --- a/dev/tasks/python-wheels/github.osx.arm64.yml +++ b/dev/tasks/python-wheels/github.osx.arm64.yml @@ -42,15 +42,18 @@ jobs: build: name: Build wheel for OS X runs-on: ["self-hosted", "macOS", "arm64"] - # Note the GHA runner on our self-hosted ARM64 machines is currently - # a x86 binary, so need to prefix with "arch -arm64" when commands - # need to run in ARM64 mode. steps: - name: Cleanup run: rm -rf arrow vcpkg build crossbow-env build-*-env test-*-env {{ macros.github_checkout_arrow()|indent }} + - name: Debug runner environment + run: | + arch + $PYTHON -c "import platform; print(platform.machine())" + arch -arm64 $PYTHON -c "import platform; print(platform.machine())" + - name: Add Brew's Bison to PATH run: echo "/opt/homebrew/opt/bison/bin" >> $GITHUB_PATH @@ -85,10 +88,10 @@ jobs: ARROW_SIMD_LEVEL: "DEFAULT" VCPKG_DEFAULT_TRIPLET: arm64-osx-static-release run: | - arch -arm64 $PYTHON -m venv build-arm64-env + $PYTHON -m venv build-arm64-env source build-arm64-env/bin/activate pip install --upgrade pip wheel - arrow/ci/scripts/python_wheel_macos_build.sh arm64 $(pwd)/arrow $(pwd)/build + arch -arm64 arrow/ci/scripts/python_wheel_macos_build.sh arm64 $(pwd)/arrow $(pwd)/build {% if arch == "universal2" %} - name: Install AMD64 Packages @@ -109,10 +112,10 @@ jobs: ARROW_SIMD_LEVEL: "NONE" VCPKG_DEFAULT_TRIPLET: amd64-osx-static-release run: | - arch -x86_64 $PYTHON -m venv build-amd64-env + $PYTHON -m venv build-amd64-env source build-amd64-env/bin/activate pip install --upgrade pip wheel - arrow/ci/scripts/python_wheel_macos_build.sh x86_64 $(pwd)/arrow $(pwd)/build + arch -x86_64 arrow/ci/scripts/python_wheel_macos_build.sh x86_64 $(pwd)/arrow $(pwd)/build - name: Fuse AMD64 and ARM64 
wheels run: | @@ -134,12 +137,12 @@ jobs: env: PYTEST_ADDOPTS: "-k 'not test_cancellation'" run: | - arch -arm64 $PYTHON -m venv test-arm64-env + $PYTHON -m venv test-arm64-env source test-arm64-env/bin/activate pip install --upgrade pip wheel - pip install -r arrow/python/requirements-wheel-test.txt - PYTHON=python arrow/ci/scripts/install_gcs_testbench.sh default - arrow/ci/scripts/python_wheel_unix_test.sh $(pwd)/arrow + arch -arm64 pip install -r arrow/python/requirements-wheel-test.txt + PYTHON=python arch -arm64 arrow/ci/scripts/install_gcs_testbench.sh default + arch -arm64 arrow/ci/scripts/python_wheel_unix_test.sh $(pwd)/arrow {% if arch == "universal2" %} - name: Test Wheel on AMD64 @@ -147,27 +150,27 @@ jobs: env: PYTEST_ADDOPTS: "-k 'not test_cancellation'" run: | - arch -x86_64 $PYTHON -m venv test-amd64-env + $PYTHON -m venv test-amd64-env source test-amd64-env/bin/activate pip install --upgrade pip wheel - pip install -r arrow/python/requirements-wheel-test.txt - PYTHON=python arrow/ci/scripts/install_gcs_testbench.sh default - arrow/ci/scripts/python_wheel_unix_test.sh $(pwd)/arrow + arch -x86_64 pip install -r arrow/python/requirements-wheel-test.txt + PYTHON=python arch -x86_64 arrow/ci/scripts/install_gcs_testbench.sh default + arch -x86_64 arrow/ci/scripts/python_wheel_unix_test.sh $(pwd)/arrow {% endif %} - name: Upload artifacts shell: bash run: | - arch -x86_64 $PYTHON -m venv crossbow-env + $PYTHON -m venv crossbow-env source crossbow-env/bin/activate - pip install -e arrow/dev/archery[crossbow-upload] - archery crossbow \ - --queue-path $(pwd) \ - --queue-remote {{ queue_remote_url }} \ - upload-artifacts \ - --sha {{ task.branch }} \ - --tag {{ task.tag }} \ - "arrow/python/repaired_wheels/*.whl" + arch -x86_64 pip install -e arrow/dev/archery[crossbow-upload] + arch -x86_64 archery crossbow \ + --queue-path $(pwd) \ + --queue-remote {{ queue_remote_url }} \ + upload-artifacts \ + --sha {{ task.branch }} \ + --tag {{ task.tag }} \ + 
"arrow/python/repaired_wheels/*.whl" env: CROSSBOW_GITHUB_TOKEN: {{ "${{ secrets.CROSSBOW_GITHUB_TOKEN || secrets.GITHUB_TOKEN }}" }} From 0f1f1dad068e77279bfa0da99ff264541626aa5b Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Sat, 11 Jun 2022 10:44:52 +0200 Subject: [PATCH 84/85] Revert "Try to understand arch issues" This reverts commit 28158ee4b364ad00aca94f89147410cb9bb4628e. --- dev/tasks/python-wheels/github.osx.arm64.yml | 6 ------ 1 file changed, 6 deletions(-) diff --git a/dev/tasks/python-wheels/github.osx.arm64.yml b/dev/tasks/python-wheels/github.osx.arm64.yml index cbdc07c15a4..7198da7de47 100644 --- a/dev/tasks/python-wheels/github.osx.arm64.yml +++ b/dev/tasks/python-wheels/github.osx.arm64.yml @@ -48,12 +48,6 @@ jobs: {{ macros.github_checkout_arrow()|indent }} - - name: Debug runner environment - run: | - arch - $PYTHON -c "import platform; print(platform.machine())" - arch -arm64 $PYTHON -c "import platform; print(platform.machine())" - - name: Add Brew's Bison to PATH run: echo "/opt/homebrew/opt/bison/bin" >> $GITHUB_PATH From b8336ef41518331bac2eec814481cdeaaed120df Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Sun, 12 Jun 2022 10:35:32 +0200 Subject: [PATCH 85/85] Make error more informative in warnings check --- python/pyarrow/tests/test_pandas.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/pyarrow/tests/test_pandas.py b/python/pyarrow/tests/test_pandas.py index 078714df041..143bb0e33e0 100644 --- a/python/pyarrow/tests/test_pandas.py +++ b/python/pyarrow/tests/test_pandas.py @@ -226,7 +226,7 @@ def test_rangeindex_doesnt_warn(self): with pytest.warns(None) as record: _check_pandas_roundtrip(df, preserve_index=True) - assert len(record) == 0 + assert len(record) == 0, [r.message for r in record] def test_multiindex_columns(self): columns = pd.MultiIndex.from_arrays([ @@ -277,7 +277,7 @@ def test_multiindex_doesnt_warn(self): with pytest.warns(None) as record: _check_pandas_roundtrip(df, 
preserve_index=True) - assert len(record) == 0 + assert len(record) == 0, [r.message for r in record] def test_integer_index_column(self): df = pd.DataFrame([(1, 'a'), (2, 'b'), (3, 'c')])