From 927f6c6ebe2919e4eee2a30a2883a72a833bff9e Mon Sep 17 00:00:00 2001 From: Carlos O'Ryan Date: Tue, 5 Oct 2021 19:19:29 +0000 Subject: [PATCH 1/6] ARROW-14222: [C++] implement GCSFileSystem skeleton --- cpp/CMakeLists.txt | 6 +- cpp/cmake_modules/ThirdpartyToolchain.cmake | 13 +- cpp/src/arrow/CMakeLists.txt | 6 + cpp/src/arrow/filesystem/CMakeLists.txt | 4 + cpp/src/arrow/filesystem/gcsfs.cc | 143 ++++++++++++++++++++ cpp/src/arrow/filesystem/gcsfs.h | 111 +++++++++++++++ cpp/src/arrow/filesystem/gcsfs_test.cc | 51 +++++++ 7 files changed, 331 insertions(+), 3 deletions(-) create mode 100644 cpp/src/arrow/filesystem/gcsfs.cc create mode 100644 cpp/src/arrow/filesystem/gcsfs.h create mode 100644 cpp/src/arrow/filesystem/gcsfs_test.cc diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index e98a5d54948..45b27174052 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -801,7 +801,11 @@ endif() set(ARROW_SHARED_PRIVATE_LINK_LIBS ${ARROW_STATIC_LINK_LIBS}) # boost::filesystem is needed for S3 and Flight tests as a boost::process dependency. -if(((ARROW_FLIGHT OR ARROW_S3) AND (ARROW_BUILD_TESTS OR ARROW_BUILD_INTEGRATION))) +if(((ARROW_FLIGHT + OR ARROW_S3 + OR ARROW_GCS) + AND (ARROW_BUILD_TESTS OR ARROW_BUILD_INTEGRATION) + )) list(APPEND ARROW_TEST_LINK_LIBS ${BOOST_FILESYSTEM_LIBRARY} ${BOOST_SYSTEM_LIBRARY}) endif() diff --git a/cpp/cmake_modules/ThirdpartyToolchain.cmake b/cpp/cmake_modules/ThirdpartyToolchain.cmake index e1201886ccf..e5ea5f39e7d 100644 --- a/cpp/cmake_modules/ThirdpartyToolchain.cmake +++ b/cpp/cmake_modules/ThirdpartyToolchain.cmake @@ -2572,8 +2572,10 @@ macro(build_absl_once) "${ABSL_PREFIX}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}absl_${_ABSL_LIB}${CMAKE_STATIC_LIBRARY_SUFFIX}" ) add_library(absl::${_ABSL_LIB} STATIC IMPORTED) - set_target_properties(absl::${_ABSL_LIB} PROPERTIES IMPORTED_LOCATION - ${_ABSL_STATIC_LIBRARY}) + set_target_properties(absl::${_ABSL_LIB} + PROPERTIES IMPORTED_LOCATION ${_ABSL_STATIC_LIBRARY} + INTERFACE_INCLUDE_DIRECTORIES + "${ABSL_PREFIX}/include") list(APPEND ABSL_BUILD_BYPRODUCTS ${_ABSL_STATIC_LIBRARY}) endforeach() foreach(_ABSL_LIB ${_ABSL_INTERFACE_LIBS}) @@ -3704,6 +3706,13 @@ endmacro() if(ARROW_WITH_GOOGLE_CLOUD_CPP) resolve_dependency(google_cloud_cpp_storage) + include_directories(SYSTEM ${GOOGLE_CLOUD_CPP_INCLUDE_DIR}) + get_target_property(absl_base_INCLUDE_DIR absl::base INTERFACE_INCLUDE_DIRECTORIES) + include_directories(SYSTEM ${absl_base_INCLUDE_DIR}) + message(STATUS "Found google-cloud-cpp::storage static library: ${GOOGLE_CLOUD_CPP_STATIC_LIBRARY_STORAGE}" + ) + message(STATUS "Found google-cloud-cpp::storage headers: ${GOOGLE_CLOUD_CPP_INCLUDE_DIR}" + ) endif() # diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt index d97828fe918..b82cccf24df 100644 --- a/cpp/src/arrow/CMakeLists.txt +++ b/cpp/src/arrow/CMakeLists.txt @@ -451,6 +451,12 @@ if(ARROW_FILESYSTEM) filesystem/path_util.cc filesystem/util_internal.cc) + if(ARROW_GCS) + list(APPEND ARROW_SRCS filesystem/gcsfs.cc) + set_source_files_properties(filesystem/gcsfs.cc + PROPERTIES SKIP_PRECOMPILE_HEADERS ON + SKIP_UNITY_BUILD_INCLUSION ON) + endif() if(ARROW_HDFS) list(APPEND ARROW_SRCS filesystem/hdfs.cc) endif() diff --git a/cpp/src/arrow/filesystem/CMakeLists.txt b/cpp/src/arrow/filesystem/CMakeLists.txt index c917db3b99c..67ebe54895c 100644 --- a/cpp/src/arrow/filesystem/CMakeLists.txt +++ b/cpp/src/arrow/filesystem/CMakeLists.txt @@ -28,6 +28,10 @@ add_arrow_test(filesystem-test EXTRA_LABELS filesystem) +if(ARROW_GCS) + add_arrow_test(gcsfs_test EXTRA_LABELS filesystem) +endif() + if(ARROW_S3) add_arrow_test(s3fs_test EXTRA_LABELS filesystem) if(TARGET arrow-s3fs-test) diff --git a/cpp/src/arrow/filesystem/gcsfs.cc b/cpp/src/arrow/filesystem/gcsfs.cc new file mode 100644 index 00000000000..52211952309 --- /dev/null +++ b/cpp/src/arrow/filesystem/gcsfs.cc @@ -0,0 +1,143 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "arrow/filesystem/gcsfs.h" + +#include + +#include + +#include "arrow/filesystem/path_util.h" +#include "arrow/result.h" +#include "arrow/util/checked_cast.h" + +namespace arrow { +namespace fs { + +namespace gcs = google::cloud::storage; + +google::cloud::Options AsGoogleCloudOptions(GCSOptions const& o) { + auto options = google::cloud::Options{}; + if (!o.endpoint_override.empty()) { + auto scheme = o.scheme; + if (scheme.empty()) scheme = "https"; + options.set(scheme + "://" + o.endpoint_override); + } + return options; +} + +class GCSFileSystem::Impl { + public: + Impl(GCSOptions const& o) : client_(AsGoogleCloudOptions(o)) {} + + private: + gcs::Client client_; +}; + +std::string GCSFileSystem::type_name() const { return "gcs"; } + +bool GCSFileSystem::Equals(const FileSystem& other) const { + if (this == &other) { + return true; + } + if (other.type_name() != type_name()) { + return false; + } + const auto& fs = ::arrow::internal::checked_cast(other); + return impl_ == fs.impl_; +} + +Result GCSFileSystem::GetFileInfo(const std::string& path) { + return Status::NotImplemented("The GCS FileSystem is not fully implemented"); +} + +Result GCSFileSystem::GetFileInfo(const FileSelector& select) { + return Status::NotImplemented("The GCS FileSystem is not fully implemented"); +} + +Status GCSFileSystem::CreateDir(const std::string& path, bool recursive) { + return Status::NotImplemented("The GCS FileSystem is not fully implemented"); +} + +Status GCSFileSystem::DeleteDir(const std::string& path) { + return Status::NotImplemented("The GCS FileSystem is not fully implemented"); +} + +Status GCSFileSystem::DeleteDirContents(const std::string& path) { + return Status::NotImplemented("The GCS FileSystem is not fully implemented"); +} + +Status GCSFileSystem::DeleteRootDirContents() { + return Status::NotImplemented("The GCS FileSystem is not fully implemented"); +} + +Status GCSFileSystem::DeleteFile(const std::string& path) { + return Status::NotImplemented("The GCS FileSystem is not fully implemented"); +} + +Status GCSFileSystem::Move(const std::string& src, const std::string& dest) { + return Status::NotImplemented("The GCS FileSystem is not fully implemented"); +} + +Status GCSFileSystem::CopyFile(const std::string& src, const std::string& dest) { + return Status::NotImplemented("The GCS FileSystem is not fully implemented"); +} + +Result> GCSFileSystem::OpenInputStream( + const std::string& path) { + return Status::NotImplemented("The GCS FileSystem is not fully implemented"); +} + +Result> GCSFileSystem::OpenInputStream( + const FileInfo& info) { + return Status::NotImplemented("The GCS FileSystem is not fully implemented"); +} + +Result> GCSFileSystem::OpenInputFile( + const std::string& path) { + return Status::NotImplemented("The GCS FileSystem is not fully implemented"); +} + +Result> GCSFileSystem::OpenInputFile( + const FileInfo& info) { + return Status::NotImplemented("The GCS FileSystem is not fully implemented"); +} + +Result> GCSFileSystem::OpenOutputStream( + const std::string& path, const std::shared_ptr& metadata) { + return Status::NotImplemented("The GCS FileSystem is not fully implemented"); +} + +Result> GCSFileSystem::OpenAppendStream( + const std::string&, const std::shared_ptr&) { + return Status::NotImplemented("Append is not supported in GCS"); +} + +GCSFileSystem::GCSFileSystem(const GCSOptions& options, const io::IOContext& context) + : FileSystem(context), impl_(std::make_shared(options)) {} + +namespace internal { + +std::shared_ptr MakeGCSFileSystemForTest(const GCSOptions& options) { + return std::shared_ptr( + new GCSFileSystem(options, io::default_io_context())); +} + +} // namespace internal + +} // namespace fs +} // namespace arrow diff --git a/cpp/src/arrow/filesystem/gcsfs.h b/cpp/src/arrow/filesystem/gcsfs.h new file mode 100644 index 00000000000..c0e982364e7 --- /dev/null +++ b/cpp/src/arrow/filesystem/gcsfs.h @@ -0,0 +1,111 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include +#include + +#include "arrow/filesystem/filesystem.h" + +namespace arrow { +namespace fs { +class GCSFileSystem; +struct GCSOptions; +namespace internal { +// TODO(ARROW-1231) - during development only tests should create a GCSFileSystem. +// Remove before declaring the feature complete. +std::shared_ptr MakeGCSFileSystemForTest(const GCSOptions& options); +} // namespace internal + +struct ARROW_EXPORT GCSOptions { + std::string endpoint_override; + std::string scheme; + + bool Equals(const GCSOptions& other) const; + + /// \brief Initialize with default credentials provider chain + /// + /// This is recommended if you use the standard AWS environment variables + /// and/or configuration file. + static GCSOptions Defaults(); + + /// \brief Initialize with anonymous credentials. + /// + /// This will only let you access public buckets. + static GCSOptions Anonymous(); +}; + +class ARROW_EXPORT GCSFileSystem : public FileSystem { + public: + ~GCSFileSystem() override = default; + + std::string type_name() const override; + + bool Equals(const FileSystem& other) const override; + + Result GetFileInfo(const std::string& path) override; + Result GetFileInfo(const FileSelector& select) override; + + Status CreateDir(const std::string& path, bool recursive = true) override; + + Status DeleteDir(const std::string& path) override; + + Status DeleteDirContents(const std::string& path) override; + + Status DeleteRootDirContents() override; + + Status DeleteFile(const std::string& path) override; + + Status Move(const std::string& src, const std::string& dest) override; + + Status CopyFile(const std::string& src, const std::string& dest) override; + + Result> OpenInputStream( + const std::string& path) override; + Result> OpenInputStream(const FileInfo& info) override; + + Result> OpenInputFile( + const std::string& path) override; + Result> OpenInputFile( + const FileInfo& info) override; + + Result> OpenOutputStream( + const std::string& path, + const std::shared_ptr& metadata) override; + + ARROW_DEPRECATED( + "Deprecated. " + "OpenAppendStream is unsupported on the GCS FileSystem.") + Result> OpenAppendStream( + const std::string& path, + const std::shared_ptr& metadata) override; + + private: + /// Create a GCSFileSystem instance from the given options. + friend std::shared_ptr internal::MakeGCSFileSystemForTest( + const GCSOptions& options); + + explicit GCSFileSystem(const GCSOptions& options, const io::IOContext& io_context); + + class Impl; + std::shared_ptr impl_; +}; + +} // namespace fs +} // namespace arrow diff --git a/cpp/src/arrow/filesystem/gcsfs_test.cc b/cpp/src/arrow/filesystem/gcsfs_test.cc new file mode 100644 index 00000000000..b3dad13aa98 --- /dev/null +++ b/cpp/src/arrow/filesystem/gcsfs_test.cc @@ -0,0 +1,51 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "arrow/filesystem/gcsfs.h" + +#include +#include +#include + +#include + +#include "arrow/testing/util.h" +#include "arrow/testing/gtest_util.h" + +namespace arrow { +namespace fs { +namespace { + +using ::testing::IsEmpty; +using ::testing::Not; +using ::testing::NotNull; + +TEST(GCSFileSystem, Compare) { + auto a = internal::MakeGCSFileSystemForTest(GCSOptions{}); + EXPECT_THAT(a.get(), NotNull()); + EXPECT_EQ(a, a); + + auto b = internal::MakeGCSFileSystemForTest(GCSOptions{}); + EXPECT_THAT(b.get(), NotNull()); + EXPECT_EQ(b, b); + + EXPECT_NE(a, b); +} + +} // namespace +} // namespace fs +} // namespace arrow From ffc316cf0c9a64f9fe9b3620e22f0adf7edd19b7 Mon Sep 17 00:00:00 2001 From: Carlos O'Ryan Date: Tue, 5 Oct 2021 22:40:37 +0000 Subject: [PATCH 2/6] Fix lints --- cpp/src/arrow/filesystem/gcsfs.cc | 2 +- cpp/src/arrow/filesystem/gcsfs_test.cc | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/filesystem/gcsfs.cc b/cpp/src/arrow/filesystem/gcsfs.cc index 52211952309..d03eaf1e880 100644 --- a/cpp/src/arrow/filesystem/gcsfs.cc +++ b/cpp/src/arrow/filesystem/gcsfs.cc @@ -42,7 +42,7 @@ google::cloud::Options AsGoogleCloudOptions(GCSOptions const& o) { class GCSFileSystem::Impl { public: - Impl(GCSOptions const& o) : client_(AsGoogleCloudOptions(o)) {} + explicit Impl(GCSOptions const& o) : client_(AsGoogleCloudOptions(o)) {} private: gcs::Client client_; diff --git a/cpp/src/arrow/filesystem/gcsfs_test.cc b/cpp/src/arrow/filesystem/gcsfs_test.cc index b3dad13aa98..dffc4b7bb8a 100644 --- a/cpp/src/arrow/filesystem/gcsfs_test.cc +++ b/cpp/src/arrow/filesystem/gcsfs_test.cc @@ -23,8 +23,8 @@ #include -#include "arrow/testing/util.h" #include "arrow/testing/gtest_util.h" +#include "arrow/testing/util.h" namespace arrow { namespace fs { From a49580b8e9cb80dc7ccaa41d0b4614449eac59ad Mon Sep 17 00:00:00 2001 From: Carlos O'Ryan Date: Wed, 6 Oct 2021 00:41:32 +0000 Subject: [PATCH 3/6] Address review comments --- cpp/src/arrow/filesystem/gcsfs.cc | 50 +++++++++++++------------- cpp/src/arrow/filesystem/gcsfs.h | 49 +++++++++++++++++-------- cpp/src/arrow/filesystem/gcsfs_test.cc | 4 +-- 3 files changed, 61 insertions(+), 42 deletions(-) diff --git a/cpp/src/arrow/filesystem/gcsfs.cc b/cpp/src/arrow/filesystem/gcsfs.cc index d03eaf1e880..0287b4d5841 100644 --- a/cpp/src/arrow/filesystem/gcsfs.cc +++ b/cpp/src/arrow/filesystem/gcsfs.cc @@ -30,7 +30,7 @@ namespace fs { namespace gcs = google::cloud::storage; -google::cloud::Options AsGoogleCloudOptions(GCSOptions const& o) { +google::cloud::Options AsGoogleCloudOptions(GcsOptions const& o) { auto options = google::cloud::Options{}; if (!o.endpoint_override.empty()) { auto scheme = o.scheme; @@ -40,101 +40,101 @@ google::cloud::Options AsGoogleCloudOptions(GCSOptions const& o) { return options; } -class GCSFileSystem::Impl { +class GcsFileSystem::Impl { public: - explicit Impl(GCSOptions const& o) : client_(AsGoogleCloudOptions(o)) {} + explicit Impl(GcsOptions const& o) : client_(AsGoogleCloudOptions(o)) {} private: gcs::Client client_; }; -std::string GCSFileSystem::type_name() const { return "gcs"; } +std::string GcsFileSystem::type_name() const { return "gcs"; } -bool GCSFileSystem::Equals(const FileSystem& other) const { +bool GcsFileSystem::Equals(const FileSystem& other) const { if (this == &other) { return true; } if (other.type_name() != type_name()) { return false; } - const auto& fs = ::arrow::internal::checked_cast(other); + const auto& fs = ::arrow::internal::checked_cast(other); return impl_ == fs.impl_; } -Result GCSFileSystem::GetFileInfo(const std::string& path) { +Result GcsFileSystem::GetFileInfo(const std::string& path) { return Status::NotImplemented("The GCS FileSystem is not fully implemented"); } -Result GCSFileSystem::GetFileInfo(const FileSelector& select) { +Result GcsFileSystem::GetFileInfo(const FileSelector& select) { return Status::NotImplemented("The GCS FileSystem is not fully implemented"); } -Status GCSFileSystem::CreateDir(const std::string& path, bool recursive) { +Status GcsFileSystem::CreateDir(const std::string& path, bool recursive) { return Status::NotImplemented("The GCS FileSystem is not fully implemented"); } -Status GCSFileSystem::DeleteDir(const std::string& path) { +Status GcsFileSystem::DeleteDir(const std::string& path) { return Status::NotImplemented("The GCS FileSystem is not fully implemented"); } -Status GCSFileSystem::DeleteDirContents(const std::string& path) { +Status GcsFileSystem::DeleteDirContents(const std::string& path) { return Status::NotImplemented("The GCS FileSystem is not fully implemented"); } -Status GCSFileSystem::DeleteRootDirContents() { +Status GcsFileSystem::DeleteRootDirContents() { return Status::NotImplemented("The GCS FileSystem is not fully implemented"); } -Status GCSFileSystem::DeleteFile(const std::string& path) { +Status GcsFileSystem::DeleteFile(const std::string& path) { return Status::NotImplemented("The GCS FileSystem is not fully implemented"); } -Status GCSFileSystem::Move(const std::string& src, const std::string& dest) { +Status GcsFileSystem::Move(const std::string& src, const std::string& dest) { return Status::NotImplemented("The GCS FileSystem is not fully implemented"); } -Status GCSFileSystem::CopyFile(const std::string& src, const std::string& dest) { +Status GcsFileSystem::CopyFile(const std::string& src, const std::string& dest) { return Status::NotImplemented("The GCS FileSystem is not fully implemented"); } -Result> GCSFileSystem::OpenInputStream( +Result> GcsFileSystem::OpenInputStream( const std::string& path) { return Status::NotImplemented("The GCS FileSystem is not fully implemented"); } -Result> GCSFileSystem::OpenInputStream( +Result> GcsFileSystem::OpenInputStream( const FileInfo& info) { return Status::NotImplemented("The GCS FileSystem is not fully implemented"); } -Result> GCSFileSystem::OpenInputFile( +Result> GcsFileSystem::OpenInputFile( const std::string& path) { return Status::NotImplemented("The GCS FileSystem is not fully implemented"); } -Result> GCSFileSystem::OpenInputFile( +Result> GcsFileSystem::OpenInputFile( const FileInfo& info) { return Status::NotImplemented("The GCS FileSystem is not fully implemented"); } -Result> GCSFileSystem::OpenOutputStream( +Result> GcsFileSystem::OpenOutputStream( const std::string& path, const std::shared_ptr& metadata) { return Status::NotImplemented("The GCS FileSystem is not fully implemented"); } -Result> GCSFileSystem::OpenAppendStream( +Result> GcsFileSystem::OpenAppendStream( const std::string&, const std::shared_ptr&) { return Status::NotImplemented("Append is not supported in GCS"); } -GCSFileSystem::GCSFileSystem(const GCSOptions& options, const io::IOContext& context) +GcsFileSystem::GcsFileSystem(const GcsOptions& options, const io::IOContext& context) : FileSystem(context), impl_(std::make_shared(options)) {} namespace internal { -std::shared_ptr MakeGCSFileSystemForTest(const GCSOptions& options) { - return std::shared_ptr( - new GCSFileSystem(options, io::default_io_context())); +std::shared_ptr MakeGcsFileSystemForTest(const GcsOptions& options) { + return std::shared_ptr( + new GcsFileSystem(options, io::default_io_context())); } } // namespace internal diff --git a/cpp/src/arrow/filesystem/gcsfs.h b/cpp/src/arrow/filesystem/gcsfs.h index c0e982364e7..6324199fc66 100644 --- a/cpp/src/arrow/filesystem/gcsfs.h +++ b/cpp/src/arrow/filesystem/gcsfs.h @@ -25,35 +25,54 @@ namespace arrow { namespace fs { -class GCSFileSystem; -struct GCSOptions; +class GcsFileSystem; +struct GcsOptions; namespace internal { -// TODO(ARROW-1231) - during development only tests should create a GCSFileSystem. -// Remove before declaring the feature complete. -std::shared_ptr MakeGCSFileSystemForTest(const GCSOptions& options); +// TODO(ARROW-1231) - during development only tests should create a GcsFileSystem. +// Remove, and provide a public API, before declaring the feature complete. +std::shared_ptr MakeGcsFileSystemForTest(const GcsOptions& options); } // namespace internal -struct ARROW_EXPORT GCSOptions { +/// Options for the GcsFileSystem implementation. +struct ARROW_EXPORT GcsOptions { std::string endpoint_override; std::string scheme; - bool Equals(const GCSOptions& other) const; + bool Equals(const GcsOptions& other) const; /// \brief Initialize with default credentials provider chain /// /// This is recommended if you use the standard AWS environment variables /// and/or configuration file. - static GCSOptions Defaults(); + static GcsOptions Defaults(); /// \brief Initialize with anonymous credentials. /// /// This will only let you access public buckets. - static GCSOptions Anonymous(); + static GcsOptions Anonymous(); }; -class ARROW_EXPORT GCSFileSystem : public FileSystem { +/// \brief GCS-backed FileSystem implementation. +/// +/// Some implementation notes: +/// - TODO(ARROW-1231) - review all the notes once completed. +/// - buckets are treated as top-level directories on a "root". +/// - GCS buckets are in a global namespace, only one bucket +/// named `foo` exists in Google Cloud. +/// - Creating new top-level directories is implemented by creating +/// a bucket, this may be a slower operation than usual. +/// - A principal (service account, user, etc) can only list the +/// buckets for a single project, but can access the buckets +/// for many projects. It is possible that listing "all" +/// the buckets returns fewer buckets than you have access to. +/// - GCS does not have directories, they are emulated in this +/// library by listing objects with a common prefix. +/// - In general, GCS has much higher latency than local filesystems. +/// The throughput of GCS is comparable to the throughput of +/// a local file system. +class ARROW_EXPORT GcsFileSystem : public FileSystem { public: - ~GCSFileSystem() override = default; + ~GcsFileSystem() override = default; std::string type_name() const override; @@ -97,11 +116,11 @@ class ARROW_EXPORT GCSFileSystem : public FileSystem { const std::shared_ptr& metadata) override; private: - /// Create a GCSFileSystem instance from the given options. - friend std::shared_ptr internal::MakeGCSFileSystemForTest( - const GCSOptions& options); + /// Create a GcsFileSystem instance from the given options. + friend std::shared_ptr internal::MakeGcsFileSystemForTest( + const GcsOptions& options); - explicit GCSFileSystem(const GCSOptions& options, const io::IOContext& io_context); + explicit GcsFileSystem(const GcsOptions& options, const io::IOContext& io_context); class Impl; std::shared_ptr impl_; diff --git a/cpp/src/arrow/filesystem/gcsfs_test.cc b/cpp/src/arrow/filesystem/gcsfs_test.cc index dffc4b7bb8a..e1522c1b7da 100644 --- a/cpp/src/arrow/filesystem/gcsfs_test.cc +++ b/cpp/src/arrow/filesystem/gcsfs_test.cc @@ -35,11 +35,11 @@ using ::testing::Not; using ::testing::NotNull; TEST(GCSFileSystem, Compare) { - auto a = internal::MakeGCSFileSystemForTest(GCSOptions{}); + auto a = internal::MakeGcsFileSystemForTest(GcsOptions{}); EXPECT_THAT(a.get(), NotNull()); EXPECT_EQ(a, a); - auto b = internal::MakeGCSFileSystemForTest(GCSOptions{}); + auto b = internal::MakeGcsFileSystemForTest(GcsOptions{}); EXPECT_THAT(b.get(), NotNull()); EXPECT_EQ(b, b); From 2a1023831c940d4ac1d6803a4a9ca3f033698562 Mon Sep 17 00:00:00 2001 From: Carlos O'Ryan Date: Wed, 6 Oct 2021 00:49:15 +0000 Subject: [PATCH 4/6] Fix cut&paste error --- cpp/src/arrow/filesystem/gcsfs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/arrow/filesystem/gcsfs.h b/cpp/src/arrow/filesystem/gcsfs.h index 6324199fc66..6dd4fe517b1 100644 --- a/cpp/src/arrow/filesystem/gcsfs.h +++ b/cpp/src/arrow/filesystem/gcsfs.h @@ -42,7 +42,7 @@ struct ARROW_EXPORT GcsOptions { /// \brief Initialize with default credentials provider chain /// - /// This is recommended if you use the standard AWS environment variables + /// This is recommended if you use the standard GCP environment variables /// and/or configuration file. static GcsOptions Defaults(); From 81ad9392f779b0671ea88c4629772598883cff30 Mon Sep 17 00:00:00 2001 From: Carlos O'Ryan Date: Thu, 7 Oct 2021 17:57:37 +0000 Subject: [PATCH 5/6] Address review comments --- cpp/src/arrow/filesystem/gcsfs.cc | 17 ++++++++++++---- cpp/src/arrow/filesystem/gcsfs.h | 16 ++------------- cpp/src/arrow/filesystem/gcsfs_test.cc | 27 +++++++++++++++++++------- 3 files changed, 35 insertions(+), 25 deletions(-) diff --git a/cpp/src/arrow/filesystem/gcsfs.cc b/cpp/src/arrow/filesystem/gcsfs.cc index 0287b4d5841..58bbbbfd06c 100644 --- a/cpp/src/arrow/filesystem/gcsfs.cc +++ b/cpp/src/arrow/filesystem/gcsfs.cc @@ -30,10 +30,10 @@ namespace fs { namespace gcs = google::cloud::storage; -google::cloud::Options AsGoogleCloudOptions(GcsOptions const& o) { +google::cloud::Options AsGoogleCloudOptions(const GcsOptions& o) { auto options = google::cloud::Options{}; if (!o.endpoint_override.empty()) { - auto scheme = o.scheme; + std::string scheme = o.scheme; if (scheme.empty()) scheme = "https"; options.set(scheme + "://" + o.endpoint_override); } @@ -42,12 +42,20 @@ google::cloud::Options AsGoogleCloudOptions(GcsOptions const& o) { class GcsFileSystem::Impl { public: - explicit Impl(GcsOptions const& o) : client_(AsGoogleCloudOptions(o)) {} + explicit Impl(GcsOptions o) + : options_(std::move(o)), client_(AsGoogleCloudOptions(options_)) {} + + GcsOptions const& options() const { return options_; } private: + GcsOptions options_; gcs::Client client_; }; +bool GcsOptions::Equals(const GcsOptions& other) const { + return endpoint_override == other.endpoint_override && scheme == other.scheme; +} + std::string GcsFileSystem::type_name() const { return "gcs"; } bool GcsFileSystem::Equals(const FileSystem& other) const { @@ -58,7 +66,7 @@ bool GcsFileSystem::Equals(const FileSystem& other) const { return false; } const auto& fs = ::arrow::internal::checked_cast(other); - return impl_ == fs.impl_; + return impl_->options().Equals(fs.impl_->options()); } Result GcsFileSystem::GetFileInfo(const std::string& path) { @@ -133,6 +141,7 @@ GcsFileSystem::GcsFileSystem(const GcsOptions& options, const io::IOContext& con namespace internal { std::shared_ptr MakeGcsFileSystemForTest(const GcsOptions& options) { + // Cannot use `std::make_shared<>` as the constructor is private. return std::shared_ptr( new GcsFileSystem(options, io::default_io_context())); } diff --git a/cpp/src/arrow/filesystem/gcsfs.h b/cpp/src/arrow/filesystem/gcsfs.h index 6dd4fe517b1..2583bdee820 100644 --- a/cpp/src/arrow/filesystem/gcsfs.h +++ b/cpp/src/arrow/filesystem/gcsfs.h @@ -28,8 +28,7 @@ namespace fs { class GcsFileSystem; struct GcsOptions; namespace internal { -// TODO(ARROW-1231) - during development only tests should create a GcsFileSystem. -// Remove, and provide a public API, before declaring the feature complete. +// TODO(ARROW-1231) - remove, and provide a public API (static GcsFileSystem::Make()). std::shared_ptr MakeGcsFileSystemForTest(const GcsOptions& options); } // namespace internal @@ -39,17 +38,6 @@ struct ARROW_EXPORT GcsOptions { std::string scheme; bool Equals(const GcsOptions& other) const; - - /// \brief Initialize with default credentials provider chain - /// - /// This is recommended if you use the standard GCP environment variables - /// and/or configuration file. - static GcsOptions Defaults(); - - /// \brief Initialize with anonymous credentials. - /// - /// This will only let you access public buckets. - static GcsOptions Anonymous(); }; /// \brief GCS-backed FileSystem implementation. @@ -81,7 +69,7 @@ class ARROW_EXPORT GcsFileSystem : public FileSystem { Result GetFileInfo(const std::string& path) override; Result GetFileInfo(const FileSelector& select) override; - Status CreateDir(const std::string& path, bool recursive = true) override; + Status CreateDir(const std::string& path, bool recursive) override; Status DeleteDir(const std::string& path) override; diff --git a/cpp/src/arrow/filesystem/gcsfs_test.cc b/cpp/src/arrow/filesystem/gcsfs_test.cc index e1522c1b7da..5d8c7b5a40a 100644 --- a/cpp/src/arrow/filesystem/gcsfs_test.cc +++ b/cpp/src/arrow/filesystem/gcsfs_test.cc @@ -34,16 +34,29 @@ using ::testing::IsEmpty; using ::testing::Not; using ::testing::NotNull; -TEST(GCSFileSystem, Compare) { +TEST(GcsFileSystem, OptionsCompare) { + GcsOptions a; + GcsOptions b; + b.endpoint_override = "localhost:1234"; + EXPECT_TRUE(a.Equals(a)); + EXPECT_TRUE(b.Equals(b)); + auto c = b; + c.scheme = "http"; + EXPECT_FALSE(b.Equals(c)); +} + +TEST(GcsFileSystem, FileSystemCompare) { auto a = internal::MakeGcsFileSystemForTest(GcsOptions{}); - EXPECT_THAT(a.get(), NotNull()); - EXPECT_EQ(a, a); + EXPECT_THAT(a, NotNull()); + EXPECT_TRUE(a->Equals(*a)); - auto b = internal::MakeGcsFileSystemForTest(GcsOptions{}); - EXPECT_THAT(b.get(), NotNull()); - EXPECT_EQ(b, b); + GcsOptions options; + options.endpoint_override = "localhost:1234"; + auto b = internal::MakeGcsFileSystemForTest(options); + EXPECT_THAT(b, NotNull()); + EXPECT_TRUE(b->Equals(*b)); - EXPECT_NE(a, b); + EXPECT_FALSE(a->Equals(*b)); } } // namespace From 44e77cbc39234dbb376e8ad769d92a395a18dc14 Mon Sep 17 00:00:00 2001 From: Carlos O'Ryan Date: Fri, 8 Oct 2021 14:51:10 +0000 Subject: [PATCH 6/6] Address review comments --- cpp/cmake_modules/ThirdpartyToolchain.cmake | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/cpp/cmake_modules/ThirdpartyToolchain.cmake b/cpp/cmake_modules/ThirdpartyToolchain.cmake index e5ea5f39e7d..0fcbcc7d705 100644 --- a/cpp/cmake_modules/ThirdpartyToolchain.cmake +++ b/cpp/cmake_modules/ThirdpartyToolchain.cmake @@ -3706,12 +3706,12 @@ endmacro() if(ARROW_WITH_GOOGLE_CLOUD_CPP) resolve_dependency(google_cloud_cpp_storage) - include_directories(SYSTEM ${GOOGLE_CLOUD_CPP_INCLUDE_DIR}) + get_target_property(google_cloud_cpp_storage_INCLUDE_DIR google-cloud-cpp::storage + INTERFACE_INCLUDE_DIRECTORIES) + include_directories(SYSTEM ${google_cloud_cpp_storage_INCLUDE_DIR}) get_target_property(absl_base_INCLUDE_DIR absl::base INTERFACE_INCLUDE_DIRECTORIES) include_directories(SYSTEM ${absl_base_INCLUDE_DIR}) - message(STATUS "Found google-cloud-cpp::storage static library: ${GOOGLE_CLOUD_CPP_STATIC_LIBRARY_STORAGE}" - ) - message(STATUS "Found google-cloud-cpp::storage headers: ${GOOGLE_CLOUD_CPP_INCLUDE_DIR}" + message(STATUS "Found google-cloud-cpp::storage headers: ${google_cloud_cpp_storage_INCLUDE_DIR}" ) endif()