diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index e98a5d54948..45b27174052 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -801,7 +801,11 @@ endif()
 set(ARROW_SHARED_PRIVATE_LINK_LIBS ${ARROW_STATIC_LINK_LIBS})
 
-# boost::filesystem is needed for S3 and Flight tests as a boost::process dependency.
-if(((ARROW_FLIGHT OR ARROW_S3) AND (ARROW_BUILD_TESTS OR ARROW_BUILD_INTEGRATION)))
+# boost::filesystem is needed for S3, GCS and Flight tests as a boost::process dependency.
+if(((ARROW_FLIGHT
+     OR ARROW_S3
+     OR ARROW_GCS)
+    AND (ARROW_BUILD_TESTS OR ARROW_BUILD_INTEGRATION)
+   ))
   list(APPEND ARROW_TEST_LINK_LIBS ${BOOST_FILESYSTEM_LIBRARY} ${BOOST_SYSTEM_LIBRARY})
 endif()
 
diff --git a/cpp/cmake_modules/ThirdpartyToolchain.cmake b/cpp/cmake_modules/ThirdpartyToolchain.cmake
index e1201886ccf..0fcbcc7d705 100644
--- a/cpp/cmake_modules/ThirdpartyToolchain.cmake
+++ b/cpp/cmake_modules/ThirdpartyToolchain.cmake
@@ -2572,8 +2572,10 @@ macro(build_absl_once)
         "${ABSL_PREFIX}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}absl_${_ABSL_LIB}${CMAKE_STATIC_LIBRARY_SUFFIX}"
     )
     add_library(absl::${_ABSL_LIB} STATIC IMPORTED)
-    set_target_properties(absl::${_ABSL_LIB} PROPERTIES IMPORTED_LOCATION
-                                                        ${_ABSL_STATIC_LIBRARY})
+    set_target_properties(absl::${_ABSL_LIB}
+                          PROPERTIES IMPORTED_LOCATION ${_ABSL_STATIC_LIBRARY}
+                                     INTERFACE_INCLUDE_DIRECTORIES
+                                     "${ABSL_PREFIX}/include")
     list(APPEND ABSL_BUILD_BYPRODUCTS ${_ABSL_STATIC_LIBRARY})
   endforeach()
   foreach(_ABSL_LIB ${_ABSL_INTERFACE_LIBS})
@@ -3704,6 +3706,15 @@ endmacro()
 
 if(ARROW_WITH_GOOGLE_CLOUD_CPP)
   resolve_dependency(google_cloud_cpp_storage)
+  # Add the google-cloud-cpp and Abseil include directories as SYSTEM paths, which
+  # lets compilers treat them as system headers (typically skipping their warnings).
+  get_target_property(google_cloud_cpp_storage_INCLUDE_DIR google-cloud-cpp::storage
+                      INTERFACE_INCLUDE_DIRECTORIES)
+  include_directories(SYSTEM ${google_cloud_cpp_storage_INCLUDE_DIR})
+  get_target_property(absl_base_INCLUDE_DIR absl::base INTERFACE_INCLUDE_DIRECTORIES)
+  include_directories(SYSTEM ${absl_base_INCLUDE_DIR})
+  message(STATUS "Found google-cloud-cpp::storage headers: ${google_cloud_cpp_storage_INCLUDE_DIR}"
+  )
 endif()
 
 #
diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt
index d97828fe918..b82cccf24df 100644
--- a/cpp/src/arrow/CMakeLists.txt
+++ b/cpp/src/arrow/CMakeLists.txt
@@ -451,6 +451,12 @@ if(ARROW_FILESYSTEM)
        filesystem/path_util.cc
        filesystem/util_internal.cc)
 
+  if(ARROW_GCS)
+    list(APPEND ARROW_SRCS filesystem/gcsfs.cc)
+    set_source_files_properties(filesystem/gcsfs.cc
+                                PROPERTIES SKIP_PRECOMPILE_HEADERS ON
+                                           SKIP_UNITY_BUILD_INCLUSION ON)
+  endif()
   if(ARROW_HDFS)
     list(APPEND ARROW_SRCS filesystem/hdfs.cc)
   endif()
diff --git a/cpp/src/arrow/filesystem/CMakeLists.txt b/cpp/src/arrow/filesystem/CMakeLists.txt
index c917db3b99c..67ebe54895c 100644
--- a/cpp/src/arrow/filesystem/CMakeLists.txt
+++ b/cpp/src/arrow/filesystem/CMakeLists.txt
@@ -28,6 +28,10 @@ add_arrow_test(filesystem-test
                EXTRA_LABELS
                filesystem)
 
+if(ARROW_GCS)
+  add_arrow_test(gcsfs_test EXTRA_LABELS filesystem)
+endif()
+
 if(ARROW_S3)
   add_arrow_test(s3fs_test EXTRA_LABELS filesystem)
   if(TARGET arrow-s3fs-test)
diff --git a/cpp/src/arrow/filesystem/gcsfs.cc b/cpp/src/arrow/filesystem/gcsfs.cc
new file mode 100644
index 00000000000..58bbbbfd06c
--- /dev/null
+++ b/cpp/src/arrow/filesystem/gcsfs.cc
@@ -0,0 +1,163 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/filesystem/gcsfs.h"
+
+#include <google/cloud/storage/client.h>
+
+#include <utility>
+
+#include "arrow/filesystem/path_util.h"
+#include "arrow/result.h"
+#include "arrow/util/checked_cast.h"
+
+namespace arrow {
+namespace fs {
+
+namespace gcs = google::cloud::storage;
+
+namespace {
+
+// Translate Arrow's GcsOptions into google-cloud-cpp client options.
+//
+// The REST endpoint is only overridden when `endpoint_override` is non-empty;
+// an empty `scheme` falls back to "https".
+google::cloud::Options AsGoogleCloudOptions(const GcsOptions& o) {
+  auto options = google::cloud::Options{};
+  if (!o.endpoint_override.empty()) {
+    std::string scheme = o.scheme;
+    if (scheme.empty()) scheme = "https";
+    options.set<gcs::RestEndpointOption>(scheme + "://" + o.endpoint_override);
+  }
+  return options;
+}
+
+}  // namespace
+
+// Private implementation: holds the options and the google-cloud-cpp client.
+class GcsFileSystem::Impl {
+ public:
+  explicit Impl(GcsOptions o)
+      : options_(std::move(o)), client_(AsGoogleCloudOptions(options_)) {}
+
+  const GcsOptions& options() const { return options_; }
+
+ private:
+  // `options_` must stay declared before `client_`: the client is built from it.
+  GcsOptions options_;
+  gcs::Client client_;
+};
+
+bool GcsOptions::Equals(const GcsOptions& other) const {
+  return endpoint_override == other.endpoint_override && scheme == other.scheme;
+}
+
+std::string GcsFileSystem::type_name() const { return "gcs"; }
+
+// Equal when `other` is this object, or a GcsFileSystem created with equal options.
+bool GcsFileSystem::Equals(const FileSystem& other) const {
+  if (this == &other) {
+    return true;
+  }
+  if (other.type_name() != type_name()) {
+    return false;
+  }
+  const auto& fs = ::arrow::internal::checked_cast<const GcsFileSystem&>(other);
+  return impl_->options().Equals(fs.impl_->options());
+}
+
+Result<FileInfo> GcsFileSystem::GetFileInfo(const std::string& path) {
+  return Status::NotImplemented("The GCS FileSystem is not fully implemented");
+}
+
+Result<FileInfoVector> GcsFileSystem::GetFileInfo(const FileSelector& select) {
+  return Status::NotImplemented("The GCS FileSystem is not fully implemented");
+}
+
+Status GcsFileSystem::CreateDir(const std::string& path, bool recursive) {
+  return Status::NotImplemented("The GCS FileSystem is not fully implemented");
+}
+
+Status GcsFileSystem::DeleteDir(const std::string& path) {
+  return Status::NotImplemented("The GCS FileSystem is not fully implemented");
+}
+
+Status GcsFileSystem::DeleteDirContents(const std::string& path) {
+  return Status::NotImplemented("The GCS FileSystem is not fully implemented");
+}
+
+Status GcsFileSystem::DeleteRootDirContents() {
+  return Status::NotImplemented("The GCS FileSystem is not fully implemented");
+}
+
+Status GcsFileSystem::DeleteFile(const std::string& path) {
+  return Status::NotImplemented("The GCS FileSystem is not fully implemented");
+}
+
+Status GcsFileSystem::Move(const std::string& src, const std::string& dest) {
+  return Status::NotImplemented("The GCS FileSystem is not fully implemented");
+}
+
+Status GcsFileSystem::CopyFile(const std::string& src, const std::string& dest) {
+  return Status::NotImplemented("The GCS FileSystem is not fully implemented");
+}
+
+Result<std::shared_ptr<io::InputStream>> GcsFileSystem::OpenInputStream(
+    const std::string& path) {
+  return Status::NotImplemented("The GCS FileSystem is not fully implemented");
+}
+
+Result<std::shared_ptr<io::InputStream>> GcsFileSystem::OpenInputStream(
+    const FileInfo& info) {
+  return Status::NotImplemented("The GCS FileSystem is not fully implemented");
+}
+
+Result<std::shared_ptr<io::RandomAccessFile>> GcsFileSystem::OpenInputFile(
+    const std::string& path) {
+  return Status::NotImplemented("The GCS FileSystem is not fully implemented");
+}
+
+Result<std::shared_ptr<io::RandomAccessFile>> GcsFileSystem::OpenInputFile(
+    const FileInfo& info) {
+  return Status::NotImplemented("The GCS FileSystem is not fully implemented");
+}
+
+Result<std::shared_ptr<io::OutputStream>> GcsFileSystem::OpenOutputStream(
+    const std::string& path, const std::shared_ptr<const KeyValueMetadata>& metadata) {
+  return Status::NotImplemented("The GCS FileSystem is not fully implemented");
+}
+
+Result<std::shared_ptr<io::OutputStream>> GcsFileSystem::OpenAppendStream(
+    const std::string&, const std::shared_ptr<const KeyValueMetadata>&) {
+  return Status::NotImplemented("Append is not supported in GCS");
+}
+
+GcsFileSystem::GcsFileSystem(const GcsOptions& options, const io::IOContext& context)
+    : FileSystem(context), impl_(std::make_shared<Impl>(options)) {}
+
+namespace internal {
+
+std::shared_ptr<GcsFileSystem> MakeGcsFileSystemForTest(const GcsOptions& options) {
+  // Cannot use `std::make_shared<>` as the constructor is private.
+  return std::shared_ptr<GcsFileSystem>(
+      new GcsFileSystem(options, io::default_io_context()));
+}
+
+}  // namespace internal
+
+}  // namespace fs
+}  // namespace arrow
diff --git a/cpp/src/arrow/filesystem/gcsfs.h b/cpp/src/arrow/filesystem/gcsfs.h
new file mode 100644
index 00000000000..2583bdee820
--- /dev/null
+++ b/cpp/src/arrow/filesystem/gcsfs.h
@@ -0,0 +1,121 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "arrow/filesystem/filesystem.h"
+
+namespace arrow {
+namespace fs {
+class GcsFileSystem;
+struct GcsOptions;
+namespace internal {
+// TODO(ARROW-1231) - remove, and provide a public API (static GcsFileSystem::Make()).
+std::shared_ptr<GcsFileSystem> MakeGcsFileSystemForTest(const GcsOptions& options);
+}  // namespace internal
+
+/// Options for the GcsFileSystem implementation.
+struct ARROW_EXPORT GcsOptions {
+  /// If non-empty, use this endpoint (e.g. "localhost:1234") instead of the default.
+  std::string endpoint_override;
+  /// URL scheme for `endpoint_override`; "https" is used when empty.
+  std::string scheme;
+
+  /// Compare both fields for equality.
+  bool Equals(const GcsOptions& other) const;
+};
+
+/// \brief GCS-backed FileSystem implementation.
+///
+/// Some implementation notes:
+/// - TODO(ARROW-1231) - review all the notes once completed.
+/// - buckets are treated as top-level directories on a "root".
+/// - GCS buckets are in a global namespace, only one bucket
+///   named `foo` exists in Google Cloud.
+/// - Creating new top-level directories is implemented by creating
+///   a bucket, this may be a slower operation than usual.
+/// - A principal (service account, user, etc) can only list the
+///   buckets for a single project, but can access the buckets
+///   for many projects. It is possible that listing "all"
+///   the buckets returns fewer buckets than you have access to.
+/// - GCS does not have directories, they are emulated in this
+///   library by listing objects with a common prefix.
+/// - In general, GCS has much higher latency than local filesystems.
+///   The throughput of GCS is comparable to the throughput of
+///   a local file system.
+class ARROW_EXPORT GcsFileSystem : public FileSystem {
+ public:
+  ~GcsFileSystem() override = default;
+
+  std::string type_name() const override;
+
+  bool Equals(const FileSystem& other) const override;
+
+  Result<FileInfo> GetFileInfo(const std::string& path) override;
+  Result<FileInfoVector> GetFileInfo(const FileSelector& select) override;
+
+  Status CreateDir(const std::string& path, bool recursive) override;
+
+  Status DeleteDir(const std::string& path) override;
+
+  Status DeleteDirContents(const std::string& path) override;
+
+  Status DeleteRootDirContents() override;
+
+  Status DeleteFile(const std::string& path) override;
+
+  Status Move(const std::string& src, const std::string& dest) override;
+
+  Status CopyFile(const std::string& src, const std::string& dest) override;
+
+  Result<std::shared_ptr<io::InputStream>> OpenInputStream(
+      const std::string& path) override;
+  Result<std::shared_ptr<io::InputStream>> OpenInputStream(const FileInfo& info) override;
+
+  Result<std::shared_ptr<io::RandomAccessFile>> OpenInputFile(
+      const std::string& path) override;
+  Result<std::shared_ptr<io::RandomAccessFile>> OpenInputFile(
+      const FileInfo& info) override;
+
+  Result<std::shared_ptr<io::OutputStream>> OpenOutputStream(
+      const std::string& path,
+      const std::shared_ptr<const KeyValueMetadata>& metadata) override;
+
+  ARROW_DEPRECATED(
+      "Deprecated. "
+      "OpenAppendStream is unsupported on the GCS FileSystem.")
+  Result<std::shared_ptr<io::OutputStream>> OpenAppendStream(
+      const std::string& path,
+      const std::shared_ptr<const KeyValueMetadata>& metadata) override;
+
+ private:
+  /// Create a GcsFileSystem instance from the given options.
+  friend std::shared_ptr<GcsFileSystem> internal::MakeGcsFileSystemForTest(
+      const GcsOptions& options);
+
+  explicit GcsFileSystem(const GcsOptions& options, const io::IOContext& io_context);
+
+  class Impl;
+  std::shared_ptr<Impl> impl_;
+};
+
+}  // namespace fs
+}  // namespace arrow
diff --git a/cpp/src/arrow/filesystem/gcsfs_test.cc b/cpp/src/arrow/filesystem/gcsfs_test.cc
new file mode 100644
index 00000000000..5d8c7b5a40a
--- /dev/null
+++ b/cpp/src/arrow/filesystem/gcsfs_test.cc
@@ -0,0 +1,64 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/filesystem/gcsfs.h"
+
+#include <gmock/gmock-matchers.h>
+#include <gtest/gtest.h>
+
+#include <string>
+
+#include "arrow/testing/gtest_util.h"
+#include "arrow/testing/util.h"
+
+namespace arrow {
+namespace fs {
+namespace {
+
+using ::testing::NotNull;
+
+TEST(GcsFileSystem, OptionsCompare) {
+  GcsOptions a;
+  GcsOptions b;
+  b.endpoint_override = "localhost:1234";
+  EXPECT_TRUE(a.Equals(a));
+  EXPECT_TRUE(b.Equals(b));
+  EXPECT_FALSE(a.Equals(b));
+  // A copy compares equal until one of its fields is changed.
+  auto c = b;
+  EXPECT_TRUE(b.Equals(c));
+  c.scheme = "http";
+  EXPECT_FALSE(b.Equals(c));
+}
+
+TEST(GcsFileSystem, FileSystemCompare) {
+  auto a = internal::MakeGcsFileSystemForTest(GcsOptions{});
+  EXPECT_THAT(a, NotNull());
+  EXPECT_TRUE(a->Equals(*a));
+
+  GcsOptions options;
+  options.endpoint_override = "localhost:1234";
+  auto b = internal::MakeGcsFileSystemForTest(options);
+  EXPECT_THAT(b, NotNull());
+  EXPECT_TRUE(b->Equals(*b));
+
+  EXPECT_FALSE(a->Equals(*b));
+}
+
+}  // namespace
+}  // namespace fs
+}  // namespace arrow