-
Notifications
You must be signed in to change notification settings - Fork 4k
ARROW-14222: [C++] implement GCSFileSystem skeleton #11331
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
927f6c6
ffc316c
a49580b
2a10238
81ad939
44e77cb
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,152 @@ | ||
| // Licensed to the Apache Software Foundation (ASF) under one | ||
| // or more contributor license agreements. See the NOTICE file | ||
| // distributed with this work for additional information | ||
| // regarding copyright ownership. The ASF licenses this file | ||
| // to you under the Apache License, Version 2.0 (the | ||
| // "License"); you may not use this file except in compliance | ||
| // with the License. You may obtain a copy of the License at | ||
| // | ||
| // http://www.apache.org/licenses/LICENSE-2.0 | ||
| // | ||
| // Unless required by applicable law or agreed to in writing, | ||
| // software distributed under the License is distributed on an | ||
| // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | ||
| // KIND, either express or implied. See the License for the | ||
| // specific language governing permissions and limitations | ||
| // under the License. | ||
|
|
||
| #include "arrow/filesystem/gcsfs.h" | ||
|
|
||
| #include <google/cloud/storage/client.h> | ||
|
|
||
| #include <sstream> | ||
|
|
||
| #include "arrow/filesystem/path_util.h" | ||
| #include "arrow/result.h" | ||
| #include "arrow/util/checked_cast.h" | ||
|
|
||
| namespace arrow { | ||
| namespace fs { | ||
|
|
||
| namespace gcs = google::cloud::storage; | ||
|
|
||
| namespace { | ||
|
|
||
| /// \brief Translate Arrow's GcsOptions into google-cloud-cpp client options. | ||
| /// | ||
| /// Only `endpoint_override` and `scheme` are consulted. When no endpoint | ||
| /// override is set, the returned options are default-constructed and `scheme` | ||
| /// is ignored. | ||
| /// | ||
| /// This helper is an implementation detail of this translation unit, so it | ||
| /// lives in an unnamed namespace to give it internal linkage. | ||
| google::cloud::Options AsGoogleCloudOptions(const GcsOptions& o) { | ||
| auto options = google::cloud::Options{}; | ||
| if (!o.endpoint_override.empty()) { | ||
| // An empty scheme falls back to "https". | ||
| std::string scheme = o.scheme; | ||
| if (scheme.empty()) scheme = "https"; | ||
| options.set<gcs::RestEndpointOption>(scheme + "://" + o.endpoint_override); | ||
| } | ||
| return options; | ||
| } | ||
|
|
||
| } // namespace | ||
|
|
||
| /// Private implementation of GcsFileSystem: holds the options and the GCS client. | ||
| class GcsFileSystem::Impl { | ||
| public: | ||
| /// Stores `opts` and builds the client from the stored copy. | ||
| explicit Impl(GcsOptions opts) | ||
| : options_(std::move(opts)), client_(AsGoogleCloudOptions(options_)) {} | ||
|
|
||
| /// The options this instance was created with. | ||
| const GcsOptions& options() const { return options_; } | ||
|
|
||
| private: | ||
| // Declaration order matters: `options_` must precede `client_` because the | ||
| // initializer of `client_` reads `options_`. | ||
| GcsOptions options_; | ||
| gcs::Client client_; | ||
| }; | ||
|
|
||
| /// Two option sets are equal when both `endpoint_override` and `scheme` match. | ||
| /// | ||
| /// NOTE(review): the comparison is on the raw strings, so an empty scheme and | ||
| /// "https" compare unequal even though AsGoogleCloudOptions() treats an empty | ||
| /// scheme as "https" -- confirm this is intended. | ||
| bool GcsOptions::Equals(const GcsOptions& other) const { | ||
| if (endpoint_override != other.endpoint_override) { | ||
| return false; | ||
| } | ||
| return scheme == other.scheme; | ||
| } | ||
|
|
||
| /// Identifier string for this filesystem implementation. | ||
| std::string GcsFileSystem::type_name() const { | ||
| return "gcs"; | ||
| } | ||
|
|
||
| /// True when `other` is this very object, or is a GcsFileSystem whose options | ||
| /// compare equal to ours. | ||
| bool GcsFileSystem::Equals(const FileSystem& other) const { | ||
| const bool same_object = (this == &other); | ||
| if (same_object) return true; | ||
| // The type-name check guards the downcast below. | ||
| if (type_name() != other.type_name()) return false; | ||
| const auto& rhs = ::arrow::internal::checked_cast<const GcsFileSystem&>(other); | ||
| const GcsOptions& mine = impl_->options(); | ||
| return mine.Equals(rhs.impl_->options()); | ||
| } | ||
|
|
||
| /// Single-path overload. Not yet implemented: always returns | ||
| /// Status::NotImplemented, and `path` is unused. | ||
| Result<FileInfo> GcsFileSystem::GetFileInfo(const std::string& path) { | ||
| return Status::NotImplemented("The GCS FileSystem is not fully implemented"); | ||
| } | ||
|
|
||
| /// Selector overload. Not yet implemented: always returns | ||
| /// Status::NotImplemented, and `select` is unused. | ||
| Result<FileInfoVector> GcsFileSystem::GetFileInfo(const FileSelector& select) { | ||
| return Status::NotImplemented("The GCS FileSystem is not fully implemented"); | ||
| } | ||
|
|
||
| /// Not yet implemented: always returns Status::NotImplemented; both | ||
| /// arguments are unused. | ||
| Status GcsFileSystem::CreateDir(const std::string& path, bool recursive) { | ||
| return Status::NotImplemented("The GCS FileSystem is not fully implemented"); | ||
| } | ||
|
|
||
| /// Not yet implemented: always returns Status::NotImplemented; `path` is unused. | ||
| Status GcsFileSystem::DeleteDir(const std::string& path) { | ||
| return Status::NotImplemented("The GCS FileSystem is not fully implemented"); | ||
| } | ||
|
|
||
| /// Not yet implemented: always returns Status::NotImplemented; `path` is unused. | ||
| Status GcsFileSystem::DeleteDirContents(const std::string& path) { | ||
| return Status::NotImplemented("The GCS FileSystem is not fully implemented"); | ||
| } | ||
|
|
||
| /// Not yet implemented: always returns Status::NotImplemented. | ||
| Status GcsFileSystem::DeleteRootDirContents() { | ||
| return Status::NotImplemented("The GCS FileSystem is not fully implemented"); | ||
| } | ||
|
|
||
| /// Not yet implemented: always returns Status::NotImplemented; `path` is unused. | ||
| Status GcsFileSystem::DeleteFile(const std::string& path) { | ||
| return Status::NotImplemented("The GCS FileSystem is not fully implemented"); | ||
| } | ||
|
|
||
| /// Not yet implemented: always returns Status::NotImplemented; `src` and | ||
| /// `dest` are unused. | ||
| Status GcsFileSystem::Move(const std::string& src, const std::string& dest) { | ||
| return Status::NotImplemented("The GCS FileSystem is not fully implemented"); | ||
| } | ||
|
|
||
| /// Not yet implemented: always returns Status::NotImplemented; `src` and | ||
| /// `dest` are unused. | ||
| Status GcsFileSystem::CopyFile(const std::string& src, const std::string& dest) { | ||
| return Status::NotImplemented("The GCS FileSystem is not fully implemented"); | ||
| } | ||
|
|
||
| /// Path overload. Not yet implemented: always returns Status::NotImplemented; | ||
| /// `path` is unused. | ||
| Result<std::shared_ptr<io::InputStream>> GcsFileSystem::OpenInputStream( | ||
| const std::string& path) { | ||
| return Status::NotImplemented("The GCS FileSystem is not fully implemented"); | ||
| } | ||
|
|
||
| /// FileInfo overload. Not yet implemented: always returns | ||
| /// Status::NotImplemented; `info` is unused. | ||
| Result<std::shared_ptr<io::InputStream>> GcsFileSystem::OpenInputStream( | ||
| const FileInfo& info) { | ||
| return Status::NotImplemented("The GCS FileSystem is not fully implemented"); | ||
| } | ||
|
|
||
| /// Path overload. Not yet implemented: always returns Status::NotImplemented; | ||
| /// `path` is unused. | ||
| Result<std::shared_ptr<io::RandomAccessFile>> GcsFileSystem::OpenInputFile( | ||
| const std::string& path) { | ||
| return Status::NotImplemented("The GCS FileSystem is not fully implemented"); | ||
| } | ||
|
|
||
| /// FileInfo overload. Not yet implemented: always returns | ||
| /// Status::NotImplemented; `info` is unused. | ||
| Result<std::shared_ptr<io::RandomAccessFile>> GcsFileSystem::OpenInputFile( | ||
| const FileInfo& info) { | ||
| return Status::NotImplemented("The GCS FileSystem is not fully implemented"); | ||
| } | ||
|
|
||
| /// Not yet implemented: always returns Status::NotImplemented; `path` and | ||
| /// `metadata` are unused. | ||
| Result<std::shared_ptr<io::OutputStream>> GcsFileSystem::OpenOutputStream( | ||
| const std::string& path, const std::shared_ptr<const KeyValueMetadata>& metadata) { | ||
| return Status::NotImplemented("The GCS FileSystem is not fully implemented"); | ||
| } | ||
|
|
||
| /// Always returns Status::NotImplemented with a message saying that append is | ||
| /// not supported in GCS. Both parameters are intentionally unnamed because | ||
| /// they are never read. | ||
| Result<std::shared_ptr<io::OutputStream>> GcsFileSystem::OpenAppendStream( | ||
| const std::string&, const std::shared_ptr<const KeyValueMetadata>&) { | ||
| return Status::NotImplemented("Append is not supported in GCS"); | ||
| } | ||
|
|
||
| /// Construct the filesystem on top of `io_context`, with a new Impl built from | ||
| /// a copy of `options`. | ||
| /// | ||
| /// The parameter is named `io_context` to match the declaration in the header. | ||
| GcsFileSystem::GcsFileSystem(const GcsOptions& options, const io::IOContext& io_context) | ||
| : FileSystem(io_context), impl_(std::make_shared<Impl>(options)) {} | ||
|
|
||
| namespace internal { | ||
|
|
||
| /// Test-only factory: builds a GcsFileSystem from `options` using the default | ||
| /// I/O context. | ||
| std::shared_ptr<GcsFileSystem> MakeGcsFileSystemForTest(const GcsOptions& options) { | ||
| // The constructor is private, so `std::make_shared<>` cannot reach it; | ||
| // allocate directly and hand ownership to the shared_ptr. | ||
| auto* raw = new GcsFileSystem(options, io::default_io_context()); | ||
| return std::shared_ptr<GcsFileSystem>(raw); | ||
| } | ||
|
|
||
| } // namespace internal | ||
|
|
||
| } // namespace fs | ||
| } // namespace arrow | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,118 @@ | ||
| // Licensed to the Apache Software Foundation (ASF) under one | ||
| // or more contributor license agreements. See the NOTICE file | ||
| // distributed with this work for additional information | ||
| // regarding copyright ownership. The ASF licenses this file | ||
| // to you under the Apache License, Version 2.0 (the | ||
| // "License"); you may not use this file except in compliance | ||
| // with the License. You may obtain a copy of the License at | ||
| // | ||
| // http://www.apache.org/licenses/LICENSE-2.0 | ||
| // | ||
| // Unless required by applicable law or agreed to in writing, | ||
| // software distributed under the License is distributed on an | ||
| // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | ||
| // KIND, either express or implied. See the License for the | ||
| // specific language governing permissions and limitations | ||
| // under the License. | ||
|
|
||
| #pragma once | ||
|
|
||
| #include <memory> | ||
| #include <string> | ||
| #include <vector> | ||
|
|
||
| #include "arrow/filesystem/filesystem.h" | ||
|
|
||
| namespace arrow { | ||
| namespace fs { | ||
| class GcsFileSystem; | ||
| struct GcsOptions; | ||
| namespace internal { | ||
| // TODO(ARROW-1231) - remove, and provide a public API (static GcsFileSystem::Make()). | ||
| std::shared_ptr<GcsFileSystem> MakeGcsFileSystemForTest(const GcsOptions& options); | ||
|
||
| } // namespace internal | ||
|
|
||
| /// Options for the GcsFileSystem implementation. | ||
| struct ARROW_EXPORT GcsOptions { | ||
| /// If non-empty, the client's REST endpoint is set to | ||
| /// `<scheme>://<endpoint_override>`. If empty, no endpoint is configured. | ||
| std::string endpoint_override; | ||
| /// URL scheme combined with `endpoint_override`; an empty value is treated | ||
| /// as "https". Not consulted when `endpoint_override` is empty. | ||
| std::string scheme; | ||
|
|
||
| /// True when both fields compare equal, as raw strings, to those of `other`. | ||
| bool Equals(const GcsOptions& other) const; | ||
| }; | ||
|
|
||
| /// \brief GCS-backed FileSystem implementation. | ||
| /// | ||
| /// Some implementation notes: | ||
| /// - TODO(ARROW-1231) - review all the notes once completed. | ||
| /// - buckets are treated as top-level directories on a "root". | ||
| /// - GCS buckets are in a global namespace, only one bucket | ||
| /// named `foo` exists in Google Cloud. | ||
| /// - Creating new top-level directories is implemented by creating | ||
| /// a bucket, this may be a slower operation than usual. | ||
| /// - A principal (service account, user, etc) can only list the | ||
| /// buckets for a single project, but can access the buckets | ||
| /// for many projects. It is possible that listing "all" | ||
| /// the buckets returns fewer buckets than you have access to. | ||
| /// - GCS does not have directories, they are emulated in this | ||
| /// library by listing objects with a common prefix. | ||
| /// - In general, GCS has much higher latency than local filesystems. | ||
| /// The throughput of GCS is comparable to the throughput of | ||
| /// a local file system. | ||
| class ARROW_EXPORT GcsFileSystem : public FileSystem { | ||
| public: | ||
| ~GcsFileSystem() override = default; | ||
|
|
||
| /// Returns the string "gcs". | ||
| std::string type_name() const override; | ||
|
|
||
| /// True if `other` is this object, or a GcsFileSystem with equal options. | ||
| bool Equals(const FileSystem& other) const override; | ||
|
|
||
| // NOTE: in this skeleton every operation from here through OpenOutputStream | ||
| // currently returns Status::NotImplemented. | ||
| Result<FileInfo> GetFileInfo(const std::string& path) override; | ||
| Result<FileInfoVector> GetFileInfo(const FileSelector& select) override; | ||
|
|
||
| Status CreateDir(const std::string& path, bool recursive) override; | ||
|
|
||
| Status DeleteDir(const std::string& path) override; | ||
|
|
||
| Status DeleteDirContents(const std::string& path) override; | ||
|
|
||
| Status DeleteRootDirContents() override; | ||
|
|
||
| Status DeleteFile(const std::string& path) override; | ||
|
|
||
| Status Move(const std::string& src, const std::string& dest) override; | ||
|
|
||
| Status CopyFile(const std::string& src, const std::string& dest) override; | ||
|
|
||
| Result<std::shared_ptr<io::InputStream>> OpenInputStream( | ||
| const std::string& path) override; | ||
| Result<std::shared_ptr<io::InputStream>> OpenInputStream(const FileInfo& info) override; | ||
|
|
||
| Result<std::shared_ptr<io::RandomAccessFile>> OpenInputFile( | ||
| const std::string& path) override; | ||
| Result<std::shared_ptr<io::RandomAccessFile>> OpenInputFile( | ||
| const FileInfo& info) override; | ||
|
|
||
| Result<std::shared_ptr<io::OutputStream>> OpenOutputStream( | ||
| const std::string& path, | ||
| const std::shared_ptr<const KeyValueMetadata>& metadata) override; | ||
|
|
||
| /// Always fails with Status::NotImplemented ("Append is not supported in GCS"). | ||
| ARROW_DEPRECATED( | ||
| "Deprecated. " | ||
| "OpenAppendStream is unsupported on the GCS FileSystem.") | ||
| Result<std::shared_ptr<io::OutputStream>> OpenAppendStream( | ||
| const std::string& path, | ||
| const std::shared_ptr<const KeyValueMetadata>& metadata) override; | ||
|
|
||
| private: | ||
| /// Friend factory that creates a GcsFileSystem instance from the given options; | ||
| /// it needs access to the private constructor below. | ||
| friend std::shared_ptr<GcsFileSystem> internal::MakeGcsFileSystemForTest( | ||
| const GcsOptions& options); | ||
|
|
||
| /// Private constructor; within this header only the friend factory above can call it. | ||
| explicit GcsFileSystem(const GcsOptions& options, const io::IOContext& io_context); | ||
|
|
||
| // Implementation class; only declared here, defined in the implementation file. | ||
| class Impl; | ||
| std::shared_ptr<Impl> impl_; | ||
| }; | ||
|
|
||
| } // namespace fs | ||
| } // namespace arrow | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Not sure if the mac build is a transient error or somehow due to this?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I do not think it is this change, as the build has
-DARROW_GCS=OFF: https://github.com/apache/arrow/pull/11331/checks?check_run_id=3830637295#step:9:62
and the change is (modulo reformatting) changing
(ARROW_FLIGHT OR ARROW_S3) to (ARROW_FLIGHT OR ARROW_S3 OR ARROW_GCS). In addition, this change has been there since the first commit in the branch. At a guess, this is either a transient failure or the point in
master where I based the branch has a problem; I can rebase if you think that would help.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yeah, let's try a rebase.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Looking at the builds at
master, I do not think a rebase would help, and it seems even less likely that these changes caused the build breaks: https://github.com/apache/arrow/actions/runs/1317260919
https://github.com/apache/arrow/actions?query=event%3Apush+branch%3Amaster