Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -801,7 +801,11 @@ endif()
set(ARROW_SHARED_PRIVATE_LINK_LIBS ${ARROW_STATIC_LINK_LIBS})

# boost::filesystem is needed for S3 and Flight tests as a boost::process dependency.
if(((ARROW_FLIGHT OR ARROW_S3) AND (ARROW_BUILD_TESTS OR ARROW_BUILD_INTEGRATION)))
if(((ARROW_FLIGHT
OR ARROW_S3
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not sure if the mac build is a transient error or somehow due to this?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I do not think is this change, as the build has -DARROW_GCS=OFF:

https://github.com/apache/arrow/pull/11331/checks?check_run_id=3830637295#step:9:62

and the change is (modulo reformatting) changing (ARROW_FLIGHT OR ARROW_S3) to (ARROW_FLIGHT OR ARROW_S3 OR ARROW_GCS). In addition, this change has been there since the first commit in the branch.

At a guess this is either a transient or the point in master where I based the branch has a problem, I can rebase if you think that would help.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yeah, lets try a rebase.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Looking at the builds at master, I do not think a rebase would help, and it seems even less likely that these changes caused the build breaks:

https://github.com/apache/arrow/actions/runs/1317260919

https://github.com/apache/arrow/actions?query=event%3Apush+branch%3Amaster

OR ARROW_GCS)
AND (ARROW_BUILD_TESTS OR ARROW_BUILD_INTEGRATION)
))
list(APPEND ARROW_TEST_LINK_LIBS ${BOOST_FILESYSTEM_LIBRARY} ${BOOST_SYSTEM_LIBRARY})
endif()

Expand Down
13 changes: 11 additions & 2 deletions cpp/cmake_modules/ThirdpartyToolchain.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -2572,8 +2572,10 @@ macro(build_absl_once)
"${ABSL_PREFIX}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}absl_${_ABSL_LIB}${CMAKE_STATIC_LIBRARY_SUFFIX}"
)
add_library(absl::${_ABSL_LIB} STATIC IMPORTED)
set_target_properties(absl::${_ABSL_LIB} PROPERTIES IMPORTED_LOCATION
${_ABSL_STATIC_LIBRARY})
set_target_properties(absl::${_ABSL_LIB}
PROPERTIES IMPORTED_LOCATION ${_ABSL_STATIC_LIBRARY}
INTERFACE_INCLUDE_DIRECTORIES
"${ABSL_PREFIX}/include")
list(APPEND ABSL_BUILD_BYPRODUCTS ${_ABSL_STATIC_LIBRARY})
endforeach()
foreach(_ABSL_LIB ${_ABSL_INTERFACE_LIBS})
Expand Down Expand Up @@ -3704,6 +3706,13 @@ endmacro()

if(ARROW_WITH_GOOGLE_CLOUD_CPP)
resolve_dependency(google_cloud_cpp_storage)
get_target_property(google_cloud_cpp_storage_INCLUDE_DIR google-cloud-cpp::storage
INTERFACE_INCLUDE_DIRECTORIES)
include_directories(SYSTEM ${google_cloud_cpp_storage_INCLUDE_DIR})
get_target_property(absl_base_INCLUDE_DIR absl::base INTERFACE_INCLUDE_DIRECTORIES)
include_directories(SYSTEM ${absl_base_INCLUDE_DIR})
message(STATUS "Found google-cloud-cpp::storage headers: ${google_cloud_cpp_storage_INCLUDE_DIR}"
)
endif()

#
Expand Down
6 changes: 6 additions & 0 deletions cpp/src/arrow/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -451,6 +451,12 @@ if(ARROW_FILESYSTEM)
filesystem/path_util.cc
filesystem/util_internal.cc)

if(ARROW_GCS)
list(APPEND ARROW_SRCS filesystem/gcsfs.cc)
set_source_files_properties(filesystem/gcsfs.cc
PROPERTIES SKIP_PRECOMPILE_HEADERS ON
SKIP_UNITY_BUILD_INCLUSION ON)
endif()
if(ARROW_HDFS)
list(APPEND ARROW_SRCS filesystem/hdfs.cc)
endif()
Expand Down
4 changes: 4 additions & 0 deletions cpp/src/arrow/filesystem/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,10 @@ add_arrow_test(filesystem-test
EXTRA_LABELS
filesystem)

if(ARROW_GCS)
add_arrow_test(gcsfs_test EXTRA_LABELS filesystem)
endif()

if(ARROW_S3)
add_arrow_test(s3fs_test EXTRA_LABELS filesystem)
if(TARGET arrow-s3fs-test)
Expand Down
152 changes: 152 additions & 0 deletions cpp/src/arrow/filesystem/gcsfs.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,152 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#include "arrow/filesystem/gcsfs.h"

#include <google/cloud/storage/client.h>

#include <sstream>

#include "arrow/filesystem/path_util.h"
#include "arrow/result.h"
#include "arrow/util/checked_cast.h"

namespace arrow {
namespace fs {

namespace gcs = google::cloud::storage;

google::cloud::Options AsGoogleCloudOptions(const GcsOptions& o) {
auto options = google::cloud::Options{};
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is there a reason to prefer this over google::cloud::Options options; ?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

"Almost Always Auto", another holy war. I can change it if you think it is confusing.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, the almost always use auto (my undertanding this is from effective C++) directly contradicts the style guide in use (which is only use auto if type is obvious). In this case type is obvious, it just seems like there is more text in this case.

if (!o.endpoint_override.empty()) {
std::string scheme = o.scheme;
if (scheme.empty()) scheme = "https";
options.set<gcs::RestEndpointOption>(scheme + "://" + o.endpoint_override);
}
return options;
}

class GcsFileSystem::Impl {
public:
explicit Impl(GcsOptions o)
: options_(std::move(o)), client_(AsGoogleCloudOptions(options_)) {}

GcsOptions const& options() const { return options_; }

private:
GcsOptions options_;
gcs::Client client_;
};

bool GcsOptions::Equals(const GcsOptions& other) const {
return endpoint_override == other.endpoint_override && scheme == other.scheme;
}

std::string GcsFileSystem::type_name() const { return "gcs"; }

bool GcsFileSystem::Equals(const FileSystem& other) const {
if (this == &other) {
return true;
}
if (other.type_name() != type_name()) {
return false;
}
const auto& fs = ::arrow::internal::checked_cast<const GcsFileSystem&>(other);
return impl_->options().Equals(fs.impl_->options());
}

Result<FileInfo> GcsFileSystem::GetFileInfo(const std::string& path) {
return Status::NotImplemented("The GCS FileSystem is not fully implemented");
}

Result<FileInfoVector> GcsFileSystem::GetFileInfo(const FileSelector& select) {
return Status::NotImplemented("The GCS FileSystem is not fully implemented");
}

Status GcsFileSystem::CreateDir(const std::string& path, bool recursive) {
return Status::NotImplemented("The GCS FileSystem is not fully implemented");
}

Status GcsFileSystem::DeleteDir(const std::string& path) {
return Status::NotImplemented("The GCS FileSystem is not fully implemented");
}

Status GcsFileSystem::DeleteDirContents(const std::string& path) {
return Status::NotImplemented("The GCS FileSystem is not fully implemented");
}

Status GcsFileSystem::DeleteRootDirContents() {
return Status::NotImplemented("The GCS FileSystem is not fully implemented");
}

Status GcsFileSystem::DeleteFile(const std::string& path) {
return Status::NotImplemented("The GCS FileSystem is not fully implemented");
}

Status GcsFileSystem::Move(const std::string& src, const std::string& dest) {
return Status::NotImplemented("The GCS FileSystem is not fully implemented");
}

Status GcsFileSystem::CopyFile(const std::string& src, const std::string& dest) {
return Status::NotImplemented("The GCS FileSystem is not fully implemented");
}

Result<std::shared_ptr<io::InputStream>> GcsFileSystem::OpenInputStream(
const std::string& path) {
return Status::NotImplemented("The GCS FileSystem is not fully implemented");
}

Result<std::shared_ptr<io::InputStream>> GcsFileSystem::OpenInputStream(
const FileInfo& info) {
return Status::NotImplemented("The GCS FileSystem is not fully implemented");
}

Result<std::shared_ptr<io::RandomAccessFile>> GcsFileSystem::OpenInputFile(
const std::string& path) {
return Status::NotImplemented("The GCS FileSystem is not fully implemented");
}

Result<std::shared_ptr<io::RandomAccessFile>> GcsFileSystem::OpenInputFile(
const FileInfo& info) {
return Status::NotImplemented("The GCS FileSystem is not fully implemented");
}

Result<std::shared_ptr<io::OutputStream>> GcsFileSystem::OpenOutputStream(
const std::string& path, const std::shared_ptr<const KeyValueMetadata>& metadata) {
return Status::NotImplemented("The GCS FileSystem is not fully implemented");
}

Result<std::shared_ptr<io::OutputStream>> GcsFileSystem::OpenAppendStream(
const std::string&, const std::shared_ptr<const KeyValueMetadata>&) {
return Status::NotImplemented("Append is not supported in GCS");
}

GcsFileSystem::GcsFileSystem(const GcsOptions& options, const io::IOContext& context)
: FileSystem(context), impl_(std::make_shared<Impl>(options)) {}

namespace internal {

std::shared_ptr<GcsFileSystem> MakeGcsFileSystemForTest(const GcsOptions& options) {
// Cannot use `std::make_shared<>` as the constructor is private.
return std::shared_ptr<GcsFileSystem>(
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

maybe comment on why std::make_shared isn't used.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done.

new GcsFileSystem(options, io::default_io_context()));
}

} // namespace internal

} // namespace fs
} // namespace arrow
118 changes: 118 additions & 0 deletions cpp/src/arrow/filesystem/gcsfs.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,118 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#pragma once

#include <memory>
#include <string>
#include <vector>

#include "arrow/filesystem/filesystem.h"

namespace arrow {
namespace fs {
class GcsFileSystem;
struct GcsOptions;
namespace internal {
// TODO(ARROW-1231) - remove, and provide a public API (static GcsFileSystem::Make()).
std::shared_ptr<GcsFileSystem> MakeGcsFileSystemForTest(const GcsOptions& options);
Copy link
Contributor

@emkornfield emkornfield Oct 7, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The TODO covers make a static method Make on GcsFileSystem?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Fixed to make the TODO more explicit

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I hope that is what you had in mind.

} // namespace internal

/// Options for the GcsFileSystem implementation.
struct ARROW_EXPORT GcsOptions {
std::string endpoint_override;
std::string scheme;

bool Equals(const GcsOptions& other) const;
};

/// \brief GCS-backed FileSystem implementation.
///
/// Some implementation notes:
/// - TODO(ARROW-1231) - review all the notes once completed.
/// - buckets are treated as top-level directories on a "root".
/// - GCS buckets are in a global namespace, only one bucket
/// named `foo` exists in Google Cloud.
/// - Creating new top-level directories is implemented by creating
/// a bucket, this may be a slower operation than usual.
/// - A principal (service account, user, etc) can only list the
/// buckets for a single project, but can access the buckets
/// for many projects. It is possible that listing "all"
/// the buckets returns fewer buckets than you have access to.
/// - GCS does not have directories, they are emulated in this
/// library by listing objects with a common prefix.
/// - In general, GCS has much higher latency than local filesystems.
/// The throughput of GCS is comparable to the throughput of
/// a local file system.
class ARROW_EXPORT GcsFileSystem : public FileSystem {
public:
~GcsFileSystem() override = default;

std::string type_name() const override;

bool Equals(const FileSystem& other) const override;

Result<FileInfo> GetFileInfo(const std::string& path) override;
Result<FileInfoVector> GetFileInfo(const FileSelector& select) override;

Status CreateDir(const std::string& path, bool recursive) override;

Status DeleteDir(const std::string& path) override;

Status DeleteDirContents(const std::string& path) override;

Status DeleteRootDirContents() override;

Status DeleteFile(const std::string& path) override;

Status Move(const std::string& src, const std::string& dest) override;

Status CopyFile(const std::string& src, const std::string& dest) override;

Result<std::shared_ptr<io::InputStream>> OpenInputStream(
const std::string& path) override;
Result<std::shared_ptr<io::InputStream>> OpenInputStream(const FileInfo& info) override;

Result<std::shared_ptr<io::RandomAccessFile>> OpenInputFile(
const std::string& path) override;
Result<std::shared_ptr<io::RandomAccessFile>> OpenInputFile(
const FileInfo& info) override;

Result<std::shared_ptr<io::OutputStream>> OpenOutputStream(
const std::string& path,
const std::shared_ptr<const KeyValueMetadata>& metadata) override;

ARROW_DEPRECATED(
"Deprecated. "
"OpenAppendStream is unsupported on the GCS FileSystem.")
Result<std::shared_ptr<io::OutputStream>> OpenAppendStream(
const std::string& path,
const std::shared_ptr<const KeyValueMetadata>& metadata) override;

private:
/// Create a GcsFileSystem instance from the given options.
friend std::shared_ptr<GcsFileSystem> internal::MakeGcsFileSystemForTest(
const GcsOptions& options);

explicit GcsFileSystem(const GcsOptions& options, const io::IOContext& io_context);

class Impl;
std::shared_ptr<Impl> impl_;
};

} // namespace fs
} // namespace arrow
Loading