-
Notifications
You must be signed in to change notification settings - Fork 4k
ARROW-14345: [C++] Implement streaming reads #11436
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -19,6 +19,7 @@ | |
|
|
||
| #include <google/cloud/storage/client.h> | ||
|
|
||
| #include "arrow/buffer.h" | ||
| #include "arrow/filesystem/gcsfs_internal.h" | ||
| #include "arrow/filesystem/path_util.h" | ||
| #include "arrow/result.h" | ||
|
|
@@ -28,6 +29,8 @@ namespace arrow { | |
| namespace fs { | ||
| namespace { | ||
|
|
||
| namespace gcs = google::cloud::storage; | ||
|
|
||
| auto constexpr kSep = '/'; | ||
|
|
||
| struct GcsPath { | ||
|
|
@@ -58,9 +61,48 @@ struct GcsPath { | |
| } | ||
| }; | ||
|
|
||
| } // namespace | ||
| class GcsInputStream : public arrow::io::InputStream { | ||
| public: | ||
| explicit GcsInputStream(gcs::ObjectReadStream stream) : stream_(std::move(stream)) {} | ||
|
|
||
| namespace gcs = google::cloud::storage; | ||
| ~GcsInputStream() override = default; | ||
|
|
||
| Status Close() override { | ||
| stream_.Close(); | ||
| return Status::OK(); | ||
| } | ||
|
|
||
| Result<int64_t> Tell() const override { | ||
| if (!stream_) { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. is there a reason for this check here, and not in the other methods?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. In the other methods an invalid stream would result in an error anyway. Happy to add the additional checks if you think they make the code easier to grok. |
||
| return Status::IOError("invalid stream"); | ||
| } | ||
| return stream_.tellg(); | ||
| } | ||
|
|
||
| bool closed() const override { return !stream_.IsOpen(); } | ||
|
|
||
| Result<int64_t> Read(int64_t nbytes, void* out) override { | ||
| stream_.read(static_cast<char*>(out), nbytes); | ||
| if (!stream_.status().ok()) { | ||
| return internal::ToArrowStatus(stream_.status()); | ||
| } | ||
| return stream_.gcount(); | ||
| } | ||
|
|
||
| Result<std::shared_ptr<Buffer>> Read(int64_t nbytes) override { | ||
| ARROW_ASSIGN_OR_RAISE(auto buffer, arrow::AllocateResizableBuffer(nbytes)); | ||
| stream_.read(reinterpret_cast<char*>(buffer->mutable_data()), nbytes); | ||
| if (!stream_.status().ok()) { | ||
| return internal::ToArrowStatus(stream_.status()); | ||
| } | ||
| return arrow::SliceMutableBufferSafe(std::move(buffer), 0, stream_.gcount()); | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Hmm. This is merely slicing the output buffer, but not resizing it. If the actual size is much smaller than
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. True. On the other hand, a short read from a large buffer could require a lot of copying. I am not sure which is more likely to be a problem. I am happy to make a change if you have some data or intuition as to which one is worse.
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Does GCS return short reads except for the EOF case?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. For all practical purposes, only the EOF case. The exception would be some kind of transient error that is stubborn enough to exhaust the retry policy.
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Ok. In any case, other implementations (such as S3) do resize the buffer, and there doesn't seem to be a reason to act differently here.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Created https://issues.apache.org/jira/browse/ARROW-14559 to track that change. |
||
| } | ||
|
|
||
| private: | ||
| mutable gcs::ObjectReadStream stream_; | ||
| }; | ||
|
|
||
| } // namespace | ||
|
|
||
| google::cloud::Options AsGoogleCloudOptions(const GcsOptions& o) { | ||
| auto options = google::cloud::Options{}; | ||
|
|
@@ -95,6 +137,14 @@ class GcsFileSystem::Impl { | |
| return GetFileInfoImpl(path, std::move(meta).status(), FileType::Directory); | ||
| } | ||
|
|
||
| Result<std::shared_ptr<io::InputStream>> OpenInputStream(const GcsPath& path) { | ||
| auto stream = client_.ReadObject(path.bucket, path.object); | ||
| if (!stream.status().ok()) { | ||
| return internal::ToArrowStatus(stream.status()); | ||
| } | ||
| return std::make_shared<GcsInputStream>(std::move(stream)); | ||
| } | ||
|
|
||
| private: | ||
| static Result<FileInfo> GetFileInfoImpl(const GcsPath& path, | ||
| const google::cloud::Status& status, | ||
|
|
@@ -169,12 +219,17 @@ Status GcsFileSystem::CopyFile(const std::string& src, const std::string& dest) | |
|
|
||
| Result<std::shared_ptr<io::InputStream>> GcsFileSystem::OpenInputStream( | ||
| const std::string& path) { | ||
| return Status::NotImplemented("The GCS FileSystem is not fully implemented"); | ||
| ARROW_ASSIGN_OR_RAISE(auto p, GcsPath::FromString(path)); | ||
| return impl_->OpenInputStream(p); | ||
| } | ||
|
|
||
| Result<std::shared_ptr<io::InputStream>> GcsFileSystem::OpenInputStream( | ||
| const FileInfo& info) { | ||
| return Status::NotImplemented("The GCS FileSystem is not fully implemented"); | ||
| if (!info.IsFile()) { | ||
| return Status::IOError("Only files can be opened as input streams"); | ||
| } | ||
| ARROW_ASSIGN_OR_RAISE(auto p, GcsPath::FromString(info.path())); | ||
| return impl_->OpenInputStream(p); | ||
| } | ||
|
|
||
| Result<std::shared_ptr<io::RandomAccessFile>> GcsFileSystem::OpenInputFile( | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Just wanted to check if stream_.Close() is idempotent (not sure if this is a requirement but I believe all of the other interface implementations allow it). e.g. S3
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Starting with v1.32.0 they are idempotent. Created ARROW-14385 to remind myself of updating any dependencies.