From 1569b3862abc35d2999bc458dbb2a28d4502dfc3 Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Fri, 27 Mar 2020 17:21:48 +0900 Subject: [PATCH 01/12] [C++] Add push style stream format reader This change adds the following push style reader classes: * ipc::MessageEmitter * ipc::RecordBatchStreamEmitter Push style readers don't read data from stream directly. They receive already read data by users. This style is useful with event driven style IO API. We can't read data from stream directly in event driven style IO API. We just receive already read data from event driven style IO API like: void on_read(const uint8_t* data, size_t data_size) { process_data(data, data_size); } register_read_event(on_read); run_event_loop(); We can't use the current reader API with event driven style IO API but we can use this push style reader with event driven style IO API. The current Message reader is changed to use ipc::MessageEmitter internally. So we don't have duplicated reader implementation. And no performance regression with our benchmark. Before: Running release/arrow-ipc-read-write-benchmark Run on (12 X 4600 MHz CPU s) CPU Caches: L1 Data 32K (x6) L1 Instruction 32K (x6) L2 Unified 256K (x6) L3 Unified 12288K (x1) Load Average: 0.85, 0.84, 0.65 ----------------------------------------------------------------------------------------- Benchmark Time CPU Iterations UserCounters... ----------------------------------------------------------------------------------------- ReadRecordBatch/1/real_time 886 ns 886 ns 774286 bytes_per_second=1102.15G/s ReadRecordBatch/4/real_time 1601 ns 1601 ns 436258 bytes_per_second=610.078G/s ReadRecordBatch/16/real_time 4819 ns 4820 ns 143568 bytes_per_second=202.663G/s ReadRecordBatch/64/real_time 18291 ns 18296 ns 38586 bytes_per_second=53.3893G/s ReadRecordBatch/256/real_time 84852 ns 84872 ns 8317 bytes_per_second=11.5091G/s ReadRecordBatch/1024/real_time 341091 ns 341168 ns 2049 bytes_per_second=2.86306G/s ReadRecordBatch/4096/real_time 1368049 ns 1368361 ns 511 bytes_per_second=730.968M/s ReadRecordBatch/8192/real_time 2676778 ns 2677341 ns 265 bytes_per_second=373.584M/s After: Running release/arrow-ipc-read-write-benchmark Run on (12 X 4600 MHz CPU s) CPU Caches: L1 Data 32K (x6) L1 Instruction 32K (x6) L2 Unified 256K (x6) L3 Unified 12288K (x1) Load Average: 0.88, 0.85, 0.66 ----------------------------------------------------------------------------------------- Benchmark Time CPU Iterations UserCounters... ----------------------------------------------------------------------------------------- ReadRecordBatch/1/real_time 891 ns 891 ns 769579 bytes_per_second=1095.57G/s ReadRecordBatch/4/real_time 1599 ns 1599 ns 435756 bytes_per_second=610.746G/s ReadRecordBatch/16/real_time 4834 ns 4835 ns 144374 bytes_per_second=202.027G/s ReadRecordBatch/64/real_time 18204 ns 18206 ns 38190 bytes_per_second=53.6465G/s ReadRecordBatch/256/real_time 84142 ns 84154 ns 8309 bytes_per_second=11.6061G/s ReadRecordBatch/1024/real_time 343105 ns 343148 ns 2035 bytes_per_second=2.84625G/s ReadRecordBatch/4096/real_time 1399287 ns 1399484 ns 511 bytes_per_second=714.65M/s ReadRecordBatch/8192/real_time 2641529 ns 2641845 ns 263 bytes_per_second=378.569M/s Fix format Fix lint errors Fix lint errors Fix sanitizer errors Use AllocateBuffer to create empty 64-bit aligned buffer Introduce general Receiver API Add missing include Fix error type Use new Receiver API Fix format Split MessageReceiver again Remove duplicated comments Fix style Don't use deprecated API Don't use deprecated API Add missing slice for non CPU buffer Fix next_required_size parameter description Use ABORT_NOT_OK() Remove needless forward declaration Use different test suite name Fix include location Fix a bug that next_required_size() doesn't care buffered_size_ Use std::shared_ptr Add SchemaReceived Add more documentation for next_required_size() Use EosReceived() instead of is_eos() Rebase Remove unused variable from test --- cpp/src/arrow/CMakeLists.txt | 1 + cpp/src/arrow/ipc/message.cc | 546 ++++++++++++++++++---- cpp/src/arrow/ipc/message.h | 189 ++++++++ cpp/src/arrow/ipc/read_write_benchmark.cc | 86 +++- cpp/src/arrow/ipc/read_write_test.cc | 197 +++++++- cpp/src/arrow/ipc/reader.cc | 196 +++++++- cpp/src/arrow/ipc/reader.h | 147 ++++++ cpp/src/arrow/util/receiver.cc | 30 ++ cpp/src/arrow/util/receiver.h | 69 +++ 9 files changed, 1319 insertions(+), 142 deletions(-) create mode 100644 cpp/src/arrow/util/receiver.cc create mode 100644 cpp/src/arrow/util/receiver.h diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt index 3454cd0c87d..19e28e275e9 100644 --- a/cpp/src/arrow/CMakeLists.txt +++ b/cpp/src/arrow/CMakeLists.txt @@ -180,6 +180,7 @@ set(ARROW_SRCS util/key_value_metadata.cc util/memory.cc util/parsing.cc + util/receiver.cc util/string.cc util/string_builder.cc util/task_group.cc diff --git a/cpp/src/arrow/ipc/message.cc b/cpp/src/arrow/ipc/message.cc index b089ab25e94..19fda228d79 100644 --- a/cpp/src/arrow/ipc/message.cc +++ b/cpp/src/arrow/ipc/message.cc @@ -23,6 +23,7 @@ #include #include #include +#include #include "arrow/buffer.h" #include "arrow/device.h" @@ -183,33 +184,35 @@ Status CheckMetadataAndGetBodyLength(const Buffer& metadata, int64_t* body_lengt Result> Message::ReadFrom(std::shared_ptr metadata, io::InputStream* stream) { - RETURN_NOT_OK(MaybeAlignMetadata(&metadata)); - int64_t body_length = -1; - RETURN_NOT_OK(CheckMetadataAndGetBodyLength(*metadata, &body_length)); - - ARROW_ASSIGN_OR_RAISE(auto body, stream->Read(body_length)); - if (body->size() < body_length) { - return Status::IOError("Expected to be able to read ", body_length, + std::unique_ptr result; + auto receiver = std::make_shared(&result); + MessageEmitter emitter(receiver, MessageEmitter::State::METADATA, metadata->size()); + ARROW_RETURN_NOT_OK(emitter.Consume(metadata)); + + ARROW_ASSIGN_OR_RAISE(auto body, stream->Read(emitter.next_required_size())); + if (body->size() < emitter.next_required_size()) { + return Status::IOError("Expected to be able to read ", emitter.next_required_size(), " bytes for message body, got ", body->size()); } - - return Message::Open(metadata, body); + RETURN_NOT_OK(emitter.Consume(body)); + return result; } Result> Message::ReadFrom(const int64_t offset, std::shared_ptr metadata, io::RandomAccessFile* file) { - RETURN_NOT_OK(MaybeAlignMetadata(&metadata)); - int64_t body_length = -1; - RETURN_NOT_OK(CheckMetadataAndGetBodyLength(*metadata, &body_length)); - - ARROW_ASSIGN_OR_RAISE(auto body, file->ReadAt(offset, body_length)); - if (body->size() < body_length) { - return Status::IOError("Expected to be able to read ", body_length, + std::unique_ptr result; + auto receiver = std::make_shared(&result); + MessageEmitter emitter(receiver, MessageEmitter::State::METADATA, metadata->size()); + ARROW_RETURN_NOT_OK(emitter.Consume(metadata)); + + ARROW_ASSIGN_OR_RAISE(auto body, file->ReadAt(offset, emitter.next_required_size())); + if (body->size() < emitter.next_required_size()) { + return Status::IOError("Expected to be able to read ", emitter.next_required_size(), " bytes for message body, got ", body->size()); } - - return Message::Open(metadata, body); + RETURN_NOT_OK(emitter.Consume(body)); + return result; } Status WritePadding(io::OutputStream* stream, int64_t nbytes) { @@ -263,53 +266,48 @@ std::string FormatMessageType(Message::Type type) { Result> ReadMessage(int64_t offset, int32_t metadata_length, io::RandomAccessFile* file) { - if (static_cast(metadata_length) < sizeof(int32_t)) { - return Status::Invalid("metadata_length should be at least 4"); - } + std::unique_ptr result; + auto receiver = std::make_shared(&result); + MessageEmitter emitter(receiver); - ARROW_ASSIGN_OR_RAISE(auto buffer, file->ReadAt(offset, metadata_length)); + if (metadata_length < emitter.next_required_size()) { + return Status::Invalid("metadata_length should be at least ", + emitter.next_required_size()); + } - if (buffer->size() < metadata_length) { + ARROW_ASSIGN_OR_RAISE(auto metadata, file->ReadAt(offset, metadata_length)); + if (metadata->size() < metadata_length) { return Status::Invalid("Expected to read ", metadata_length, - " metadata bytes but got ", buffer->size()); + " metadata bytes but got ", metadata->size()); } - - const int32_t continuation = util::SafeLoadAs(buffer->data()); - - // The size of the Flatbuffer including padding - int32_t flatbuffer_length = -1; - int32_t prefix_size = -1; - if (continuation == internal::kIpcContinuationToken) { - if (metadata_length < 8) { - return Status::Invalid( - "Corrupted IPC message, had continuation token " - " but length ", - metadata_length); + ARROW_RETURN_NOT_OK(emitter.Consume(metadata)); + + switch (emitter.state()) { + case MessageEmitter::State::INITIAL: + return result; + case MessageEmitter::State::METADATA_LENGTH: + return Status::Invalid("metadata length is missing. File offset: ", offset, + ", metadata length: ", metadata_length); + case MessageEmitter::State::METADATA: + return Status::Invalid("flatbuffer size ", emitter.next_required_size(), + " invalid. File offset: ", offset, + ", metadata length: ", metadata_length); + case MessageEmitter::State::BODY: { + ARROW_ASSIGN_OR_RAISE(auto body, file->ReadAt(offset + metadata_length, + emitter.next_required_size())); + if (body->size() < emitter.next_required_size()) { + return Status::IOError("Expected to be able to read ", + emitter.next_required_size(), + " bytes for message body, got ", body->size()); + } + RETURN_NOT_OK(emitter.Consume(body)); + return result; } - - // Valid IPC message, parse the message length now - flatbuffer_length = util::SafeLoadAs(buffer->data() + 4); - prefix_size = 8; - } else { - // ARROW-6314: Backwards compatibility for reading old IPC - // messages produced prior to version 0.15.0 - flatbuffer_length = continuation; - prefix_size = 4; - } - - if (flatbuffer_length == 0) { - return Status::Invalid("Unexpected empty message in IPC file format"); - } - - if (flatbuffer_length != metadata_length - prefix_size) { - return Status::Invalid("flatbuffer size ", flatbuffer_length, - " invalid. File offset: ", offset, - ", metadata length: ", metadata_length); + case MessageEmitter::State::EOS: + return Status::Invalid("Unexpected empty message in IPC file format"); + default: + return Status::Invalid("Unexpected state: ", emitter.state()); } - - std::shared_ptr metadata = - SliceBuffer(buffer, prefix_size, buffer->size() - prefix_size); - return Message::ReadFrom(offset + metadata_length, metadata, file); } Status AlignStream(io::InputStream* stream, int32_t alignment) { @@ -337,42 +335,59 @@ Status CheckAligned(io::FileInterface* stream, int32_t alignment) { } Result> ReadMessage(io::InputStream* file, MemoryPool* pool) { - int32_t continuation = 0; - ARROW_ASSIGN_OR_RAISE(int64_t bytes_read, file->Read(sizeof(int32_t), &continuation)); - - if (bytes_read == 0) { - // EOS without indication - return nullptr; - } else if (bytes_read != sizeof(int32_t)) { - return Status::Invalid("Corrupted message, only ", bytes_read, " bytes available"); + std::unique_ptr message; + auto receiver = std::make_shared(&message); + MessageEmitter emitter(receiver, pool); + + { + uint8_t continuation[sizeof(int32_t)]; + ARROW_ASSIGN_OR_RAISE(int64_t bytes_read, file->Read(sizeof(int32_t), &continuation)); + if (bytes_read == 0) { + // EOS without indication + return nullptr; + } else if (bytes_read != emitter.next_required_size()) { + return Status::Invalid("Corrupted message, only ", bytes_read, " bytes available"); + } + ARROW_RETURN_NOT_OK(emitter.Consume(continuation, bytes_read)); } - int32_t flatbuffer_length = -1; - if (continuation == internal::kIpcContinuationToken) { + if (emitter.state() == MessageEmitter::State::METADATA_LENGTH) { // Valid IPC message, read the message length now - ARROW_ASSIGN_OR_RAISE(bytes_read, file->Read(sizeof(int32_t), &flatbuffer_length)); - } else { - // ARROW-6314: Backwards compatibility for reading old IPC - // messages produced prior to version 0.15.0 - flatbuffer_length = continuation; + uint8_t metadata_length[sizeof(int32_t)]; + ARROW_ASSIGN_OR_RAISE(int64_t bytes_read, + file->Read(sizeof(int32_t), &metadata_length)); + if (bytes_read != emitter.next_required_size()) { + return Status::Invalid("Corrupted metadata length, only ", bytes_read, + " bytes available"); + } + ARROW_RETURN_NOT_OK(emitter.Consume(metadata_length, bytes_read)); } - if (flatbuffer_length == 0) { - // EOS + if (emitter.state() == MessageEmitter::State::EOS) { return nullptr; } - ARROW_ASSIGN_OR_RAISE(auto metadata, file->Read(flatbuffer_length)); - bytes_read = metadata->size(); - if (bytes_read != flatbuffer_length) { - return Status::Invalid("Expected to read ", flatbuffer_length, - " metadata bytes, but ", "only read ", bytes_read); + auto metadata_length = emitter.next_required_size(); + ARROW_ASSIGN_OR_RAISE(auto metadata, file->Read(metadata_length)); + if (metadata->size() != metadata_length) { + return Status::Invalid("Expected to read ", metadata_length, " metadata bytes, but ", + "only read ", metadata->size()); } - // The buffer could be a non-CPU buffer (e.g. CUDA) - ARROW_ASSIGN_OR_RAISE(metadata, - Buffer::ViewOrCopy(metadata, CPUDevice::memory_manager(pool))); + RETURN_NOT_OK(emitter.Consume(metadata)); - return Message::ReadFrom(metadata, file); + if (emitter.state() == MessageEmitter::State::BODY) { + ARROW_ASSIGN_OR_RAISE(auto body, file->Read(emitter.next_required_size())); + if (body->size() < emitter.next_required_size()) { + return Status::IOError("Expected to be able to read ", emitter.next_required_size(), + " bytes for message body, got ", body->size()); + } + ARROW_RETURN_NOT_OK(emitter.Consume(body)); + } + + if (!message) { + return Status::Invalid("Failed to read message"); + } + return message; } Status WriteMessage(const Buffer& message, const IpcWriteOptions& options, @@ -407,6 +422,367 @@ Status WriteMessage(const Buffer& message, const IpcWriteOptions& options, return Status::OK(); } +// ---------------------------------------------------------------------- +// Implement MessageEmitter + +static constexpr auto kMessageEmitterNextRequiredSizeInitial = sizeof(int32_t); +static constexpr auto kMessageEmitterNextRequiredSizeMetadataLength = sizeof(int32_t); + +class MessageEmitter::MessageEmitterImpl { + public: + explicit MessageEmitterImpl(std::shared_ptr receiver, + State initial_state, int64_t initial_next_required_size, + MemoryPool* pool) + : receiver_(std::move(receiver)), + pool_(pool), + state_(initial_state), + next_required_size_(initial_next_required_size), + chunks_(), + buffered_size_(0), + metadata_(nullptr) {} + + Status ConsumeData(const uint8_t* data, int64_t size) { + if (buffered_size_ == 0) { + while (size > 0 && size >= next_required_size_) { + auto used_size = next_required_size_; + switch (state_) { + case State::INITIAL: + RETURN_NOT_OK(ConsumeInitialData(data, next_required_size_)); + break; + case State::METADATA_LENGTH: + RETURN_NOT_OK(ConsumeMetadataLengthData(data, next_required_size_)); + break; + case State::METADATA: { + auto buffer = std::make_shared(data, next_required_size_); + RETURN_NOT_OK(ConsumeMetadataBuffer(&buffer)); + } break; + case State::BODY: { + auto buffer = std::make_shared(data, next_required_size_); + RETURN_NOT_OK(ConsumeBodyBuffer(&buffer)); + } break; + case State::EOS: + return Status::OK(); + } + data += used_size; + size -= used_size; + } + } + + if (size == 0) { + return Status::OK(); + } + + chunks_.push_back(std::make_shared(data, size)); + buffered_size_ += size; + return ConsumeChunks(); + } + + Status ConsumeBuffer(std::shared_ptr* buffer) { + if (buffered_size_ == 0) { + while ((*buffer)->size() >= next_required_size_) { + auto used_size = next_required_size_; + switch (state_) { + case State::INITIAL: + RETURN_NOT_OK(ConsumeInitialBuffer(buffer)); + break; + case State::METADATA_LENGTH: + RETURN_NOT_OK(ConsumeMetadataLengthBuffer(buffer)); + break; + case State::METADATA: + if ((*buffer)->size() == next_required_size_) { + return ConsumeMetadataBuffer(buffer); + } else { + auto sliced_buffer = SliceBuffer(*buffer, 0, next_required_size_); + RETURN_NOT_OK(ConsumeMetadataBuffer(&sliced_buffer)); + } + break; + case State::BODY: + if ((*buffer)->size() == next_required_size_) { + return ConsumeBodyBuffer(buffer); + } else { + auto sliced_buffer = SliceBuffer(*buffer, 0, next_required_size_); + RETURN_NOT_OK(ConsumeBodyBuffer(&sliced_buffer)); + } + break; + case State::EOS: + return Status::OK(); + } + if ((*buffer)->size() == used_size) { + return Status::OK(); + } + *buffer = SliceBuffer(*buffer, used_size); + } + } + + if ((*buffer)->size() == 0) { + return Status::OK(); + } + + buffered_size_ += (*buffer)->size(); + chunks_.push_back(std::move(*buffer)); + return ConsumeChunks(); + } + + int64_t next_required_size() const { return next_required_size_ - buffered_size_; } + + MessageEmitter::State state() const { return state_; } + + private: + Status ConsumeChunks() { + while (state_ != State::EOS) { + if (buffered_size_ < next_required_size_) { + return Status::OK(); + } + + switch (state_) { + case State::INITIAL: + RETURN_NOT_OK(ConsumeInitialChunks()); + break; + case State::METADATA_LENGTH: + RETURN_NOT_OK(ConsumeMetadataLengthChunks()); + break; + case State::METADATA: + RETURN_NOT_OK(ConsumeMetadataChunks()); + break; + case State::BODY: + RETURN_NOT_OK(ConsumeBodyChunks()); + break; + case State::EOS: + return Status::OK(); + } + } + + return Status::OK(); + } + + Status ConsumeInitialData(const uint8_t* data, int64_t size) { + return ConsumeInitial(util::SafeLoadAs(data)); + } + + Status ConsumeInitialBuffer(std::shared_ptr* buffer) { + ARROW_ASSIGN_OR_RAISE(auto continuation, ConsumeDataBufferInt32(buffer)); + return ConsumeInitial(continuation); + } + + Status ConsumeInitialChunks() { + int32_t continuation = 0; + RETURN_NOT_OK(ConsumeDataChunks(sizeof(int32_t), &continuation)); + return ConsumeInitial(continuation); + } + + Status ConsumeInitial(int32_t continuation) { + if (continuation == internal::kIpcContinuationToken) { + state_ = State::METADATA_LENGTH; + next_required_size_ = kMessageEmitterNextRequiredSizeMetadataLength; + // Valid IPC message, read the message length now + return Status::OK(); + } else if (continuation == 0) { + state_ = State::EOS; + next_required_size_ = 0; + return Status::OK(); + } else { + state_ = State::METADATA; + // ARROW-6314: Backwards compatibility for reading old IPC + // messages produced prior to version 0.15.0 + next_required_size_ = continuation; + return Status::OK(); + } + } + + Status ConsumeMetadataLengthData(const uint8_t* data, int64_t size) { + return ConsumeMetadataLength(util::SafeLoadAs(data)); + } + + Status ConsumeMetadataLengthBuffer(std::shared_ptr* buffer) { + ARROW_ASSIGN_OR_RAISE(auto metadata_length, ConsumeDataBufferInt32(buffer)); + return ConsumeMetadataLength(metadata_length); + } + + Status ConsumeMetadataLengthChunks() { + int32_t metadata_length = 0; + RETURN_NOT_OK(ConsumeDataChunks(sizeof(int32_t), &metadata_length)); + return ConsumeMetadataLength(metadata_length); + } + + Status ConsumeMetadataLength(int32_t metadata_length) { + if (metadata_length == 0) { + state_ = State::EOS; + next_required_size_ = 0; + return Status::OK(); + } else { + state_ = State::METADATA; + next_required_size_ = metadata_length; + return Status::OK(); + } + } + + Status ConsumeMetadataBuffer(std::shared_ptr* buffer) { + if ((*buffer)->is_cpu()) { + metadata_ = std::move(*buffer); + } else { + ARROW_ASSIGN_OR_RAISE( + metadata_, Buffer::ViewOrCopy(*buffer, CPUDevice::memory_manager(pool_))); + } + return ConsumeMetadata(); + } + + Status ConsumeMetadataChunks() { + if (chunks_[0]->size() >= next_required_size_) { + if (chunks_[0]->size() == next_required_size_) { + if (chunks_[0]->is_cpu()) { + metadata_ = std::move(chunks_[0]); + } else { + ARROW_ASSIGN_OR_RAISE( + metadata_, + Buffer::ViewOrCopy(chunks_[0], CPUDevice::memory_manager(pool_))); + } + chunks_.erase(chunks_.begin()); + } else { + metadata_ = SliceBuffer(chunks_[0], 0, next_required_size_); + if (!chunks_[0]->is_cpu()) { + ARROW_ASSIGN_OR_RAISE( + metadata_, Buffer::ViewOrCopy(metadata_, CPUDevice::memory_manager(pool_))); + } + chunks_[0] = SliceBuffer(chunks_[0], next_required_size_); + } + buffered_size_ -= next_required_size_; + } else { + ARROW_ASSIGN_OR_RAISE(auto metadata, AllocateBuffer(next_required_size_, pool_)); + metadata_ = std::shared_ptr(metadata.release()); + RETURN_NOT_OK(ConsumeDataChunks(next_required_size_, metadata_->mutable_data())); + } + return ConsumeMetadata(); + } + + Status ConsumeMetadata() { + RETURN_NOT_OK(MaybeAlignMetadata(&metadata_)); + int64_t body_length = -1; + RETURN_NOT_OK(CheckMetadataAndGetBodyLength(*metadata_, &body_length)); + + state_ = State::BODY; + next_required_size_ = body_length; + if (next_required_size_ == 0) { + ARROW_ASSIGN_OR_RAISE(auto body, AllocateBuffer(0, pool_)); + std::shared_ptr shared_body(body.release()); + return ConsumeBody(&shared_body); + } else { + return Status::OK(); + } + } + + Status ConsumeBodyBuffer(std::shared_ptr* buffer) { + return ConsumeBody(buffer); + } + + Status ConsumeBodyChunks() { + if (chunks_[0]->size() >= next_required_size_) { + auto used_size = next_required_size_; + if (chunks_[0]->size() == next_required_size_) { + RETURN_NOT_OK(ConsumeBody(&chunks_[0])); + chunks_.erase(chunks_.begin()); + } else { + auto body = SliceBuffer(chunks_[0], 0, next_required_size_); + RETURN_NOT_OK(ConsumeBody(&body)); + chunks_[0] = SliceBuffer(chunks_[0], used_size); + } + buffered_size_ -= used_size; + return Status::OK(); + } else { + ARROW_ASSIGN_OR_RAISE(auto body, AllocateBuffer(next_required_size_, pool_)); + RETURN_NOT_OK(ConsumeDataChunks(next_required_size_, body->mutable_data())); + std::shared_ptr shared_body(body.release()); + return ConsumeBody(&shared_body); + } + } + + Status ConsumeBody(std::shared_ptr* buffer) { + ARROW_ASSIGN_OR_RAISE(std::unique_ptr message, + Message::Open(metadata_, *buffer)); + + state_ = State::INITIAL; + next_required_size_ = kMessageEmitterNextRequiredSizeInitial; + RETURN_NOT_OK(receiver_->Received(std::move(message))); + return Status::OK(); + } + + Result ConsumeDataBufferInt32(std::shared_ptr* buffer) { + if ((*buffer)->is_cpu()) { + return util::SafeLoadAs((*buffer)->data()); + } else { + ARROW_ASSIGN_OR_RAISE( + auto cpu_buffer, Buffer::ViewOrCopy(*buffer, CPUDevice::memory_manager(pool_))); + return util::SafeLoadAs(cpu_buffer->data()); + } + } + + Status ConsumeDataChunks(int64_t nbytes, void* out) { + size_t offset = 0; + size_t n_used_chunks = 0; + auto required_size = nbytes; + std::shared_ptr last_chunk; + for (auto& chunk : chunks_) { + if (!chunk->is_cpu()) { + ARROW_ASSIGN_OR_RAISE( + chunk, Buffer::ViewOrCopy(chunk, CPUDevice::memory_manager(pool_))); + } + auto data = chunk->data(); + auto data_size = chunk->size(); + auto copy_size = std::min(required_size, data_size); + memcpy(static_cast(out) + offset, data, copy_size); + n_used_chunks++; + offset += copy_size; + required_size -= copy_size; + if (required_size == 0) { + if (data_size != copy_size) { + last_chunk = SliceBuffer(chunk, copy_size); + } + break; + } + } + chunks_.erase(chunks_.begin(), chunks_.begin() + n_used_chunks); + if (last_chunk.get() != nullptr) { + chunks_.insert(chunks_.begin(), std::move(last_chunk)); + } + buffered_size_ -= offset; + return Status::OK(); + } + + std::shared_ptr receiver_; + MemoryPool* pool_; + State state_; + int64_t next_required_size_; + std::vector> chunks_; + int64_t buffered_size_; + std::shared_ptr metadata_; // Must be CPU buffer +}; + +MessageEmitter::MessageEmitter(std::shared_ptr receiver, + MemoryPool* pool) { + impl_.reset(new MessageEmitterImpl(std::move(receiver), State::INITIAL, + kMessageEmitterNextRequiredSizeInitial, pool)); +} + +MessageEmitter::MessageEmitter(std::shared_ptr receiver, + State initial_state, int64_t initial_next_required_size, + MemoryPool* pool) { + impl_.reset(new MessageEmitterImpl(std::move(receiver), initial_state, + initial_next_required_size, pool)); +} + +MessageEmitter::~MessageEmitter() {} + +Status MessageEmitter::Consume(const uint8_t* data, int64_t size) { + return impl_->ConsumeData(data, size); +} + +Status MessageEmitter::Consume(std::shared_ptr buffer) { + return impl_->ConsumeBuffer(&buffer); +} + +int64_t MessageEmitter::next_required_size() const { return impl_->next_required_size(); } + +MessageEmitter::State MessageEmitter::state() const { return impl_->state(); } + // ---------------------------------------------------------------------- // Implement InputStream message reader diff --git a/cpp/src/arrow/ipc/message.h b/cpp/src/arrow/ipc/message.h index 9698806fd21..49c7eada8e2 100644 --- a/cpp/src/arrow/ipc/message.h +++ b/cpp/src/arrow/ipc/message.h @@ -180,6 +180,195 @@ class ARROW_EXPORT Message { ARROW_EXPORT std::string FormatMessageType(Message::Type type); +/// \class MessageReceiver +/// \brief An abstract class to receive read messages from +/// MessageEmitter. +class ARROW_EXPORT MessageReceiver { + public: + virtual ~MessageReceiver() = default; + + /// \brief Receive a message. + /// + /// MessageEmitter calls this method when it read a message. This + /// method is called multiple times when the target stream stream + /// has multiple messages. + /// + /// \param[in] message a read message + /// \return Status + virtual Status Received(std::unique_ptr message) = 0; +}; + +/// \class MessageReceiverAssign +/// \brief Assign a message read by MessageEmitter. +class ARROW_EXPORT MessageReceiverAssign : public MessageReceiver { + public: + /// \brief Construct a message receiver that assign a read message + /// to the specified location. + /// + /// \param[in] message a location to store the received message + explicit MessageReceiverAssign(std::unique_ptr* message) : message_(message) {} + + virtual ~MessageReceiverAssign() = default; + + Status Received(std::unique_ptr message) override { + *message_ = std::move(message); + return Status::OK(); + } + + private: + std::unique_ptr* message_; + + ARROW_DISALLOW_COPY_AND_ASSIGN(MessageReceiverAssign); +}; + +/// \class MessageEmitter +/// \brief Push style message reader that receives data from user. +class ARROW_EXPORT MessageEmitter { + public: + /// \brief State for reading a message + enum State { + /// The initial state. It requires one of the followings as the next data: + /// + /// * int32_t continuation token + /// * int32_t end-of-stream mark (== 0) + /// * int32_t metadata length (backward compatibility for + /// reading old IPC messages produced prior to version 0.15.0 + INITIAL, + + /// It requires int32_t metadata length. + METADATA_LENGTH, + + /// It requires metadata. + METADATA, + + /// It requires message body. + BODY, + + /// The end-of-stream state. No more data is processed. + EOS, + }; + + /// \brief Construct a message emitter. + /// + /// \param[in] receiver a MessageReceiver that receives read messages + /// \param[in] pool an optional MemoryPool to copy metadata on the + /// CPU, if required + explicit MessageEmitter(std::shared_ptr receiver, + MemoryPool* pool = default_memory_pool()); + + /// \brief Construct a message emitter with the specified state. + /// + /// This is a construct for advanced users that know how to read + /// Message. + /// + /// \param[in] receiver a MessageReceiver that receives read messages + /// \param[in] initial_state an initial state of the emitter + /// \param[in] initial_next_required_size the number of bytes needed + /// to run the next action + /// \param[in] pool an optional MemoryPool to copy metadata on the + /// CPU, if required + MessageEmitter(std::shared_ptr receiver, State initial_state, + int64_t initial_next_required_size, + MemoryPool* pool = default_memory_pool()); + + virtual ~MessageEmitter(); + + /// \brief Feed data to the emitter as a raw data. + /// + /// If the emitter can read one or more messages by the data, the + /// emitter emits read batches by calling receiver->Receive() + /// multiple times. + /// + /// \param[in] data a raw data to be processed. This data isn't + /// copied. The passed memory must be kept alive through message + /// processing. + /// \param[in] size raw data size. + /// \return Status + Status Consume(const uint8_t* data, int64_t size); + + /// \brief Feed data to the emitter as a Buffer. + /// + /// If the emitter can read one or more messages by the Buffer, the + /// emitter emits read messages by calling receiver->Receive() + /// multiple times. + /// + /// \param[in] buffer a Buffer to be processed. + /// \return Status + Status Consume(std::shared_ptr buffer); + + /// \brief Return the number of bytes needed to advance the state of + /// the emitter. + /// + /// This method is provided for users who want to optimize performance. + /// Normal users don't need to use this method. + /// + /// Here is an example usage for normal users: + /// + /// ~~~{.cpp} + /// emitter.Consume(buffer1); + /// emitter.Consume(buffer2); + /// emitter.Consume(buffer3); + /// ~~~ + /// + /// Emitter has internal buffer. If consumed data isn't enough to + /// advance the state of the emitter, consumed data is buffered to + /// the internal buffer. It causes performance overhead. + /// + /// If you pass next_required_size() size data to each Consume() + /// call, the emitter doesn't use its internal buffer. It improves + /// performance. + /// + /// Here is an example usage to avoid using internal buffer: + /// + /// ~~~{.cpp} + /// buffer1 = get_data(emitter.next_required_size()); + /// emitter.Consume(buffer1); + /// buffer2 = get_data(emitter.next_required_size()); + /// emitter.Consume(buffer2); + /// ~~~ + /// + /// Users can use this method to avoid creating small + /// chunks. Message body must be contiguous data. If users pass + /// small chunks to the emitter, the emitter needs concatenate small + /// chunks internally. It causes performance overhead. + /// + /// Here is an example usage to reduce small chunks: + /// + /// ~~~{.cpp} + /// buffer = AllocateResizableBuffer(); + /// while ((small_chunk = get_data(&small_chunk_size))) { + /// auto current_buffer_size = buffer->size(); + /// buffer->Resize(current_buffer_size + small_chunk_size); + /// memcpy(buffer->mutable_data() + current_buffer_size, + /// small_chunk, + /// small_chunk_size); + /// if (buffer->size() < emitter.next_requied_size()) { + /// continue; + /// } + /// std::shared_ptr chunk(buffer.release()); + /// emitter.Consume(chunk); + /// buffer = AllocateResizableBuffer(); + /// } + /// if (buffer->size() > 0) { + /// std::shared_ptr chunk(buffer.release()); + /// emitter.Consume(chunk); + /// } + /// ~~~ + /// + /// \return the number of bytes needed to advance the state of the + /// emitter + int64_t next_required_size() const; + + /// \return the current state + State state() const; + + private: + class MessageEmitterImpl; + std::unique_ptr impl_; + + ARROW_DISALLOW_COPY_AND_ASSIGN(MessageEmitter); +}; + /// \brief Abstract interface for a sequence of messages /// \since 0.5.0 class ARROW_EXPORT MessageReader { diff --git a/cpp/src/arrow/ipc/read_write_benchmark.cc b/cpp/src/arrow/ipc/read_write_benchmark.cc index c76c166dfcd..bc971e278df 100644 --- a/cpp/src/arrow/ipc/read_write_benchmark.cc +++ b/cpp/src/arrow/ipc/read_write_benchmark.cc @@ -59,11 +59,8 @@ static void WriteRecordBatch(benchmark::State& state) { // NOLINT non-const ref io::BufferOutputStream stream(buffer); int32_t metadata_length; int64_t body_length; - if (!ipc::WriteRecordBatch(*record_batch, 0, &stream, &metadata_length, &body_length, - options) - .ok()) { - state.SkipWithError("Failed to write!"); - } + ABORT_NOT_OK(ipc::WriteRecordBatch(*record_batch, 0, &stream, &metadata_length, + &body_length, options)); } state.SetBytesProcessed(int64_t(state.iterations()) * kTotalSize); } @@ -80,25 +77,86 @@ static void ReadRecordBatch(benchmark::State& state) { // NOLINT non-const refe int32_t metadata_length; int64_t body_length; - if (!ipc::WriteRecordBatch(*record_batch, 0, &stream, &metadata_length, &body_length, - options) - .ok()) { - state.SkipWithError("Failed to write!"); - } + ABORT_NOT_OK(ipc::WriteRecordBatch(*record_batch, 0, &stream, &metadata_length, + &body_length, options)); ipc::DictionaryMemo empty_memo; while (state.KeepRunning()) { io::BufferReader reader(buffer); - if (!ipc::ReadRecordBatch(record_batch->schema(), &empty_memo, - ipc::IpcReadOptions::Defaults(), &reader) - .ok()) { - state.SkipWithError("Failed to read!"); + ABORT_NOT_OK(ipc::ReadRecordBatch(record_batch->schema(), &empty_memo, + ipc::IpcReadOptions::Defaults(), &reader)); + } + state.SetBytesProcessed(int64_t(state.iterations()) * kTotalSize); +} + +static void ReadStream(benchmark::State& state) { // NOLINT non-const reference + // 1MB + constexpr int64_t kTotalSize = 1 << 20; + auto options = ipc::IpcWriteOptions::Defaults(); + + std::shared_ptr buffer = *AllocateResizableBuffer(kTotalSize & 2); + auto record_batch = MakeRecordBatch(kTotalSize, state.range(0)); + + io::BufferOutputStream stream(buffer); + + auto writer_result = ipc::NewStreamWriter(&stream, record_batch->schema(), options); + ABORT_NOT_OK(writer_result); + auto writer = *writer_result; + ABORT_NOT_OK(writer->WriteRecordBatch(*record_batch)); + ABORT_NOT_OK(writer->Close()); + + ipc::DictionaryMemo empty_memo; + while (state.KeepRunning()) { + io::BufferReader input(buffer); + auto reader_result = + ipc::RecordBatchStreamReader::Open(&input, ipc::IpcReadOptions::Defaults()); + ABORT_NOT_OK(reader_result); + auto reader = *reader_result; + while (true) { + std::shared_ptr batch; + ABORT_NOT_OK(reader->ReadNext(&batch)); + if (batch.get() == nullptr) { + break; + } } } state.SetBytesProcessed(int64_t(state.iterations()) * kTotalSize); } +static void EmitStream(benchmark::State& state) { // NOLINT non-const reference + // 1MB + constexpr int64_t kTotalSize = 1 << 20; + auto options = ipc::IpcWriteOptions::Defaults(); + + std::shared_ptr buffer = *AllocateResizableBuffer(kTotalSize & 2); + auto record_batch = MakeRecordBatch(kTotalSize, state.range(0)); + + io::BufferOutputStream stream(buffer); + + auto writer_result = ipc::NewStreamWriter(&stream, record_batch->schema(), options); + ABORT_NOT_OK(writer_result); + auto writer = *writer_result; + ABORT_NOT_OK(writer->WriteRecordBatch(*record_batch)); + ABORT_NOT_OK(writer->Close()); + + ipc::DictionaryMemo empty_memo; + while (state.KeepRunning()) { + class RecordBatchReceiverNull : public Receiver { + Status RecordBatchReceived(std::shared_ptr batch) override { + return Status::OK(); + } + } receiver; + ipc::RecordBatchStreamEmitter emitter( + std::shared_ptr(&receiver), + ipc::IpcReadOptions::Defaults()); + ABORT_NOT_OK(emitter.Consume(buffer)); + } + state.SetBytesProcessed(int64_t(state.iterations()) * kTotalSize); +} + BENCHMARK(WriteRecordBatch)->RangeMultiplier(4)->Range(1, 1 << 13)->UseRealTime(); BENCHMARK(ReadRecordBatch)->RangeMultiplier(4)->Range(1, 1 << 13)->UseRealTime(); +BENCHMARK(ReadStream)->RangeMultiplier(4)->Range(1, 1 << 13)->UseRealTime(); +BENCHMARK(EmitStream)->RangeMultiplier(4)->Range(1, 1 << 13)->UseRealTime(); } // namespace arrow diff --git a/cpp/src/arrow/ipc/read_write_test.cc b/cpp/src/arrow/ipc/read_write_test.cc index f9500be40af..5629df795ee 100644 --- a/cpp/src/arrow/ipc/read_write_test.cc +++ b/cpp/src/arrow/ipc/read_write_test.cc @@ -884,6 +884,84 @@ struct StreamWriterHelper { std::shared_ptr writer_; }; +struct StreamEmitterDataWriterHelper : public StreamWriterHelper { + Status ReadBatches(const IpcReadOptions& options, BatchVector* out_batches) { + auto receiver = std::make_shared(); + RecordBatchStreamEmitter emitter(receiver, options); + ARROW_RETURN_NOT_OK(emitter.Consume(buffer_->data(), buffer_->size())); + *out_batches = receiver->record_batches(); + return Status::OK(); + } + + Status ReadSchema(std::shared_ptr* out) { + auto receiver = std::make_shared(); + RecordBatchStreamEmitter emitter(receiver); + ARROW_RETURN_NOT_OK(emitter.Consume(buffer_->data(), buffer_->size())); + *out = receiver->schema(); + return Status::OK(); + } +}; + +struct StreamEmitterBufferWriterHelper : public StreamWriterHelper { + Status ReadBatches(const IpcReadOptions& options, BatchVector* out_batches) { + auto receiver = std::make_shared(); + RecordBatchStreamEmitter emitter(receiver, options); + ARROW_RETURN_NOT_OK(emitter.Consume(buffer_)); + *out_batches = receiver->record_batches(); + return Status::OK(); + } + + Status ReadSchema(std::shared_ptr* out) { + auto receiver = std::make_shared(); + RecordBatchStreamEmitter emitter(receiver); + ARROW_RETURN_NOT_OK(emitter.Consume(buffer_)); + *out = receiver->schema(); + return Status::OK(); + } +}; + +struct StreamEmitterSmallChunksWriterHelper : public StreamWriterHelper { + Status ReadBatches(const IpcReadOptions& options, BatchVector* out_batches) { + auto receiver = std::make_shared(); + RecordBatchStreamEmitter emitter(receiver, options); + for (int64_t offset = 0; offset < buffer_->size() - 1; ++offset) { + ARROW_RETURN_NOT_OK(emitter.Consume(buffer_->data() + offset, 1)); + } + *out_batches = receiver->record_batches(); + return Status::OK(); + } + + Status ReadSchema(std::shared_ptr* out) { + auto receiver = std::make_shared(); + RecordBatchStreamEmitter emitter(receiver); + for (int64_t offset = 0; offset < buffer_->size() - 1; ++offset) { + ARROW_RETURN_NOT_OK(emitter.Consume(buffer_->data() + offset, 1)); + } + *out = receiver->schema(); + return Status::OK(); + } +}; + +struct StreamEmitterLargeChunksWriterHelper : public StreamWriterHelper { + Status ReadBatches(const IpcReadOptions& options, BatchVector* out_batches) { + auto receiver = std::make_shared(); + RecordBatchStreamEmitter emitter(receiver, options); + ARROW_RETURN_NOT_OK(emitter.Consume(SliceBuffer(buffer_, 0, 1))); + ARROW_RETURN_NOT_OK(emitter.Consume(SliceBuffer(buffer_, 1))); + *out_batches = receiver->record_batches(); + return Status::OK(); + } + + Status ReadSchema(std::shared_ptr* out) { + auto receiver = std::make_shared(); + RecordBatchStreamEmitter emitter(receiver); + ARROW_RETURN_NOT_OK(emitter.Consume(SliceBuffer(buffer_, 0, 1))); + ARROW_RETURN_NOT_OK(emitter.Consume(SliceBuffer(buffer_, 1))); + *out = receiver->schema(); + return Status::OK(); + } +}; + // Parameterized mixin with tests for stream / file writer template @@ -902,8 +980,9 @@ class ReaderWriterMixin { BatchVector in_batches = {batch1, batch2}; BatchVector out_batches; - ASSERT_OK( - RoundTripHelper(in_batches, options, IpcReadOptions::Defaults(), &out_batches)); + WriterHelper writer_helper; + ASSERT_OK(RoundTripHelper(writer_helper, in_batches, options, + IpcReadOptions::Defaults(), &out_batches)); ASSERT_EQ(out_batches.size(), in_batches.size()); // Compare batches @@ -924,8 +1003,9 @@ class ReaderWriterMixin { BatchVector in_batches = {batch1, batch2}; BatchVector out_batches; - ASSERT_OK( - RoundTripHelper(in_batches, options, IpcReadOptions::Defaults(), &out_batches)); + WriterHelper writer_helper; + ASSERT_OK(RoundTripHelper(writer_helper, in_batches, options, + IpcReadOptions::Defaults(), &out_batches)); ASSERT_EQ(out_batches.size(), in_batches.size()); // Compare batches @@ -938,8 +1018,9 @@ class ReaderWriterMixin { std::shared_ptr batch; ASSERT_OK(MakeDictionary(&batch)); + WriterHelper writer_helper; BatchVector out_batches; - ASSERT_OK(RoundTripHelper({batch}, IpcWriteOptions::Defaults(), + ASSERT_OK(RoundTripHelper(writer_helper, {batch}, IpcWriteOptions::Defaults(), IpcReadOptions::Defaults(), &out_batches)); ASSERT_EQ(out_batches.size(), 1); @@ -967,22 +1048,35 @@ class ReaderWriterMixin { options.included_fields = {1, 3}; - BatchVector out_batches; - ASSERT_OK( - RoundTripHelper({batch}, IpcWriteOptions::Defaults(), options, &out_batches)); + { + WriterHelper writer_helper; + BatchVector out_batches; + ASSERT_OK(RoundTripHelper(writer_helper, {batch}, IpcWriteOptions::Defaults(), + options, &out_batches)); - auto ex_schema = schema({field("a1", utf8()), field("a3", utf8())}, - key_value_metadata({"key1"}, {"value1"})); - auto ex_batch = RecordBatch::Make(ex_schema, a0->length(), {a1, a3}); - AssertBatchesEqual(*ex_batch, *out_batches[0], /*check_metadata=*/true); + auto ex_schema = schema({field("a1", utf8()), field("a3", utf8())}, + key_value_metadata({"key1"}, {"value1"})); + auto ex_batch = RecordBatch::Make(ex_schema, a0->length(), {a1, a3}); + AssertBatchesEqual(*ex_batch, *out_batches[0], /*check_metadata=*/true); + } // Out of bounds cases options.included_fields = {1, 3, 5}; - ASSERT_RAISES(Invalid, RoundTripHelper({batch}, IpcWriteOptions::Defaults(), options, - &out_batches)); + { + WriterHelper writer_helper; + BatchVector out_batches; + ASSERT_RAISES(Invalid, + RoundTripHelper(writer_helper, {batch}, IpcWriteOptions::Defaults(), + options, &out_batches)); + } options.included_fields = {1, 3, -1}; - ASSERT_RAISES(Invalid, RoundTripHelper({batch}, IpcWriteOptions::Defaults(), options, - &out_batches)); + { + WriterHelper writer_helper; + BatchVector out_batches; + ASSERT_RAISES(Invalid, + RoundTripHelper(writer_helper, {batch}, IpcWriteOptions::Defaults(), + options, &out_batches)); + } } void TestWriteDifferentSchema() { @@ -1031,10 +1125,9 @@ class ReaderWriterMixin { } private: - Status RoundTripHelper(const BatchVector& in_batches, + Status RoundTripHelper(WriterHelper& writer_helper, const BatchVector& in_batches, const IpcWriteOptions& write_options, const IpcReadOptions& read_options, BatchVector* out_batches) { - WriterHelper writer_helper; RETURN_NOT_OK(writer_helper.Init(in_batches[0]->schema(), write_options)); for (const auto& batch : in_batches) { RETURN_NOT_OK(writer_helper.WriteBatch(batch)); @@ -1069,6 +1162,17 @@ class TestFileFormat : public ReaderWriterMixin, class TestStreamFormat : public ReaderWriterMixin, public ::testing::TestWithParam {}; +class TestStreamEmitterData : public ReaderWriterMixin, + public ::testing::TestWithParam {}; +class TestStreamEmitterBuffer : public ReaderWriterMixin, + public ::testing::TestWithParam {}; +class TestStreamEmitterSmallChunks + : public ReaderWriterMixin, + public ::testing::TestWithParam {}; +class TestStreamEmitterLargeChunks + : public ReaderWriterMixin, + public ::testing::TestWithParam {}; + TEST_P(TestFileFormat, RoundTrip) { TestRoundTrip(*GetParam(), IpcWriteOptions::Defaults()); TestZeroLengthRoundTrip(*GetParam(), IpcWriteOptions::Defaults()); @@ -1089,9 +1193,57 @@ TEST_P(TestStreamFormat, RoundTrip) { TestZeroLengthRoundTrip(*GetParam(), options); } +TEST_P(TestStreamEmitterData, RoundTrip) { + TestRoundTrip(*GetParam(), IpcWriteOptions::Defaults()); + TestZeroLengthRoundTrip(*GetParam(), IpcWriteOptions::Defaults()); + + IpcWriteOptions options; + options.write_legacy_ipc_format = true; + TestRoundTrip(*GetParam(), options); + TestZeroLengthRoundTrip(*GetParam(), options); +} + +TEST_P(TestStreamEmitterBuffer, RoundTrip) { + TestRoundTrip(*GetParam(), IpcWriteOptions::Defaults()); + TestZeroLengthRoundTrip(*GetParam(), IpcWriteOptions::Defaults()); + + IpcWriteOptions options; + options.write_legacy_ipc_format = true; + TestRoundTrip(*GetParam(), options); + TestZeroLengthRoundTrip(*GetParam(), options); +} + +TEST_P(TestStreamEmitterSmallChunks, RoundTrip) { + TestRoundTrip(*GetParam(), IpcWriteOptions::Defaults()); + TestZeroLengthRoundTrip(*GetParam(), IpcWriteOptions::Defaults()); + + IpcWriteOptions options; + options.write_legacy_ipc_format = true; + TestRoundTrip(*GetParam(), options); + TestZeroLengthRoundTrip(*GetParam(), options); +} + +TEST_P(TestStreamEmitterLargeChunks, RoundTrip) { + TestRoundTrip(*GetParam(), IpcWriteOptions::Defaults()); + TestZeroLengthRoundTrip(*GetParam(), IpcWriteOptions::Defaults()); + + IpcWriteOptions options; + options.write_legacy_ipc_format = true; + TestRoundTrip(*GetParam(), options); + TestZeroLengthRoundTrip(*GetParam(), options); +} + INSTANTIATE_TEST_SUITE_P(GenericIpcRoundTripTests, TestIpcRoundTrip, BATCH_CASES()); INSTANTIATE_TEST_SUITE_P(FileRoundTripTests, TestFileFormat, BATCH_CASES()); INSTANTIATE_TEST_SUITE_P(StreamRoundTripTests, TestStreamFormat, BATCH_CASES()); +INSTANTIATE_TEST_SUITE_P(StreamEmitterDataRoundTripTests, TestStreamEmitterData, + BATCH_CASES()); +INSTANTIATE_TEST_SUITE_P(StreamEmitterBufferRoundTripTests, TestStreamEmitterBuffer, + BATCH_CASES()); +INSTANTIATE_TEST_SUITE_P(StreamEmitterSmallChunksRoundTripTests, + TestStreamEmitterSmallChunks, BATCH_CASES()); +INSTANTIATE_TEST_SUITE_P(StreamEmitterLargeChunksRoundTripTests, + TestStreamEmitterLargeChunks, BATCH_CASES()); TEST(TestIpcFileFormat, FooterMetaData) { // ARROW-6837 @@ -1692,6 +1844,15 @@ TEST(TestRecordBatchStreamReader, MalformedInput) { ASSERT_RAISES(Invalid, RecordBatchStreamReader::Open(&garbage_reader)); } +TEST(TestRecordBatchStreamEmitter, NextRequiredSize) { + auto receiver = std::make_shared(); + RecordBatchStreamEmitter emitter(receiver); + auto next_required_size = emitter.next_required_size(); + const uint8_t data[1] = {0}; + ASSERT_OK(emitter.Consume(data, 1)); + ASSERT_EQ(next_required_size - 1, emitter.next_required_size()); +} + // ---------------------------------------------------------------------- // DictionaryMemo miscellanea diff --git a/cpp/src/arrow/ipc/reader.cc b/cpp/src/arrow/ipc/reader.cc index 484d648bbd0..9ae5a292906 100644 --- a/cpp/src/arrow/ipc/reader.cc +++ b/cpp/src/arrow/ipc/reader.cc @@ -502,7 +502,7 @@ Result> ReadRecordBatchInternal( const Buffer& metadata, const std::shared_ptr& schema, const std::vector& inclusion_mask, const DictionaryMemo* dictionary_memo, const IpcReadOptions& options, io::RandomAccessFile* file) { - const flatbuf::Message* message; + const flatbuf::Message* message = nullptr; RETURN_NOT_OK(internal::VerifyMessage(metadata.data(), metadata.size(), &message)); auto batch = message->header_as_RecordBatch(); if (batch == nullptr) { @@ -528,6 +528,24 @@ Status PopulateInclusionMask(const std::vector& included_indices, return Status::OK(); } +Status PrepareSchemaMessage(const Message& message, DictionaryMemo* dictionary_memo, + std::shared_ptr* schema, + const IpcReadOptions& options, + std::vector* field_inclusion_mask) { + CHECK_MESSAGE_TYPE(Message::SCHEMA, message.type()); + CHECK_HAS_NO_BODY(message); + + RETURN_NOT_OK(internal::GetSchema(message.header(), dictionary_memo, schema)); + + // If we are selecting only certain fields, populate the inclusion mask now + // for fast lookups + if (options.included_fields) { + RETURN_NOT_OK(PopulateInclusionMask(*options.included_fields, (*schema)->num_fields(), + field_inclusion_mask)); + } + return Status::OK(); +} + Result> ReadRecordBatch( const Buffer& metadata, const std::shared_ptr& schema, const DictionaryMemo* dictionary_memo, const IpcReadOptions& options, @@ -544,7 +562,7 @@ Result> ReadRecordBatch( Status ReadDictionary(const Buffer& metadata, DictionaryMemo* dictionary_memo, const IpcReadOptions& options, io::RandomAccessFile* file) { - const flatbuf::Message* message; + const flatbuf::Message* message = nullptr; RETURN_NOT_OK(internal::VerifyMessage(metadata.data(), metadata.size(), &message)); auto dictionary_batch = message->header_as_DictionaryBatch(); if (dictionary_batch == nullptr) { @@ -580,6 +598,21 @@ Status ReadDictionary(const Buffer& metadata, DictionaryMemo* dictionary_memo, return dictionary_memo->AddDictionary(id, dictionary); } +Status ParseDictionary(const Message& message, DictionaryMemo* dictionary_memo, + const IpcReadOptions& options) { + // Only invoke this method if we already know we have a dictionary message + DCHECK_EQ(message.type(), Message::DICTIONARY_BATCH); + CHECK_HAS_BODY(message); + ARROW_ASSIGN_OR_RAISE(auto reader, Buffer::GetReader(message.body())); + return ReadDictionary(*message.metadata(), dictionary_memo, options, reader.get()); +} + +Status UpdateDictionaries(const Message& message, DictionaryMemo* dictionary_memo, + const IpcReadOptions& options) { + // TODO(wesm): implement delta dictionaries + return Status::NotImplemented("Delta dictionaries not yet implemented"); +} + // ---------------------------------------------------------------------- // RecordBatchStreamReader implementation @@ -596,18 +629,8 @@ class RecordBatchStreamReaderImpl : public RecordBatchStreamReader { if (!message) { return Status::Invalid("Tried reading schema message, was null or length 0"); } - CHECK_MESSAGE_TYPE(Message::SCHEMA, message->type()); - CHECK_HAS_NO_BODY(*message); - - RETURN_NOT_OK(internal::GetSchema(message->header(), &dictionary_memo_, &schema_)); - - // If we are selecting only certain fields, populate the inclusion mask now - // for fast lookups - if (options.included_fields) { - RETURN_NOT_OK(PopulateInclusionMask(*options.included_fields, schema_->num_fields(), - &field_inclusion_mask_)); - } - return Status::OK(); + return PrepareSchemaMessage(*message, &dictionary_memo_, &schema_, options, + &field_inclusion_mask_); } Status ReadNext(std::shared_ptr* batch) override { @@ -631,8 +654,7 @@ class RecordBatchStreamReaderImpl : public RecordBatchStreamReader { } if (message->type() == Message::DICTIONARY_BATCH) { - // TODO(wesm): implement delta dictionaries - return Status::NotImplemented("Delta dictionaries not yet implemented"); + return UpdateDictionaries(*message, &dictionary_memo_, options_); } else { CHECK_HAS_BODY(*message); ARROW_ASSIGN_OR_RAISE(auto reader, Buffer::GetReader(message->body())); @@ -645,14 +667,6 @@ class RecordBatchStreamReaderImpl : public RecordBatchStreamReader { std::shared_ptr schema() const override { return schema_; } private: - Status ParseDictionary(const Message& message) { - // Only invoke this method if we already know we have a dictionary message - DCHECK_EQ(message.type(), Message::DICTIONARY_BATCH); - CHECK_HAS_BODY(message); - ARROW_ASSIGN_OR_RAISE(auto reader, Buffer::GetReader(message.body())); - return ReadDictionary(*message.metadata(), &dictionary_memo_, options_, reader.get()); - } - Status ReadInitialDictionaries() { // We must receive all dictionaries before reconstructing the // first record batch. Subsequent dictionary deltas modify the memo @@ -684,7 +698,7 @@ class RecordBatchStreamReaderImpl : public RecordBatchStreamReader { dictionary_memo_.num_fields(), ") of dictionaries at the start of the stream"); } - RETURN_NOT_OK(ParseDictionary(*message)); + RETURN_NOT_OK(ParseDictionary(*message, &dictionary_memo_, options_)); } have_read_initial_dictionaries_ = true; @@ -927,6 +941,138 @@ Result> RecordBatchFileReader::Open( return result; } +class RecordBatchStreamEmitter::RecordBatchStreamEmitterImpl : public MessageReceiver { + private: + enum State { + SCHEMA, + INITIAL_DICTIONARIES, + RECORD_BATCHES, + EOS, + }; + + public: + explicit RecordBatchStreamEmitterImpl(std::shared_ptr receiver, + const IpcReadOptions& options) + : MessageReceiver(), + receiver_(std::move(receiver)), + options_(options), + state_(State::SCHEMA), + message_emitter_( + std::shared_ptr(this, [](void*) {}), + options_.memory_pool), + field_inclusion_mask_(), + n_required_dictionaries_(0), + dictionary_memo_(), + schema_() {} + + Status Received(std::unique_ptr message) override { + switch (state_) { + case State::SCHEMA: + ARROW_RETURN_NOT_OK(SchemaMessageReceived(std::move(message))); + break; + case State::INITIAL_DICTIONARIES: + ARROW_RETURN_NOT_OK(InitialDictionaryMessageReceived(std::move(message))); + break; + case State::RECORD_BATCHES: + ARROW_RETURN_NOT_OK(RecordBatchMessageReceived(std::move(message))); + break; + case State::EOS: + break; + } + if (message_emitter_.state() == MessageEmitter::State::EOS) { + state_ = State::EOS; + ARROW_RETURN_NOT_OK(receiver_->EosReceived()); + } + return Status::OK(); + } + + Status Consume(const uint8_t* data, int64_t size) { + return message_emitter_.Consume(data, size); + } + + Status Consume(std::shared_ptr buffer) { + return message_emitter_.Consume(std::move(buffer)); + } + + std::shared_ptr schema() const { return schema_; } + + int64_t next_required_size() const { return message_emitter_.next_required_size(); } + + private: + Status SchemaMessageReceived(std::unique_ptr message) { + RETURN_NOT_OK(PrepareSchemaMessage(*message, &dictionary_memo_, &schema_, options_, + &field_inclusion_mask_)); + n_required_dictionaries_ = dictionary_memo_.num_fields(); + if (n_required_dictionaries_ == 0) { + state_ = State::RECORD_BATCHES; + ARROW_RETURN_NOT_OK(receiver_->SchemaReceived(schema_)); + } else { + state_ = State::INITIAL_DICTIONARIES; + } + return Status::OK(); + } + + Status InitialDictionaryMessageReceived(std::unique_ptr message) { + if (message->type() != Message::DICTIONARY_BATCH) { + return Status::Invalid("IPC stream did not have the expected number (", + dictionary_memo_.num_fields(), + ") of dictionaries at the start of the stream"); + } + RETURN_NOT_OK(ParseDictionary(*message, &dictionary_memo_, options_)); + n_required_dictionaries_--; + if (n_required_dictionaries_ == 0) { + state_ = State::RECORD_BATCHES; + ARROW_RETURN_NOT_OK(receiver_->SchemaReceived(schema_)); + } + return Status::OK(); + } + + Status RecordBatchMessageReceived(std::unique_ptr message) { + if (message->type() == Message::DICTIONARY_BATCH) { + return UpdateDictionaries(*message, &dictionary_memo_, options_); + } else { + CHECK_HAS_BODY(*message); + ARROW_ASSIGN_OR_RAISE(auto reader, Buffer::GetReader(message->body())); + ARROW_ASSIGN_OR_RAISE( + auto batch, + ReadRecordBatchInternal(*message->metadata(), schema_, field_inclusion_mask_, + &dictionary_memo_, options_, reader.get())); + return receiver_->RecordBatchReceived(std::move(batch)); + } + } + + std::shared_ptr receiver_; + IpcReadOptions options_; + State state_; + MessageEmitter message_emitter_; + std::vector field_inclusion_mask_; + int n_required_dictionaries_; + DictionaryMemo dictionary_memo_; + std::shared_ptr schema_; +}; + +RecordBatchStreamEmitter::RecordBatchStreamEmitter(std::shared_ptr receiver, + const IpcReadOptions& options) { + impl_.reset(new RecordBatchStreamEmitterImpl(std::move(receiver), options)); +} + +RecordBatchStreamEmitter::~RecordBatchStreamEmitter() {} + +Status RecordBatchStreamEmitter::Consume(const uint8_t* data, int64_t size) { + return impl_->Consume(data, size); +} +Status RecordBatchStreamEmitter::Consume(std::shared_ptr buffer) { + return impl_->Consume(std::move(buffer)); +} + +std::shared_ptr RecordBatchStreamEmitter::schema() const { + return impl_->schema(); +} + +int64_t RecordBatchStreamEmitter::next_required_size() const { + return impl_->next_required_size(); +} + Result> ReadSchema(io::InputStream* stream, DictionaryMemo* dictionary_memo) { std::unique_ptr reader = MessageReader::Open(stream); diff --git a/cpp/src/arrow/ipc/reader.h b/cpp/src/arrow/ipc/reader.h index b691e0df122..17a61db171b 100644 --- a/cpp/src/arrow/ipc/reader.h +++ b/cpp/src/arrow/ipc/reader.h @@ -22,12 +22,15 @@ #include #include #include +#include +#include #include "arrow/ipc/message.h" #include "arrow/ipc/options.h" #include "arrow/record_batch.h" #include "arrow/result.h" #include "arrow/util/macros.h" +#include "arrow/util/receiver.h" #include "arrow/util/visibility.h" namespace arrow { @@ -194,6 +197,150 @@ class ARROW_EXPORT RecordBatchFileReader { } }; +/// \class RecordBatchReceiverCollect +/// \brief Collect record batches read by RecordBatchStreamEmitter. +class ARROW_EXPORT RecordBatchReceiverCollect : public Receiver { + public: + RecordBatchReceiverCollect() : schema_(), record_batches_() {} + virtual ~RecordBatchReceiverCollect() = default; + + Status SchemaReceived(std::shared_ptr schema) override { + schema_ = std::move(schema); + return Status::OK(); + } + + Status RecordBatchReceived(std::shared_ptr record_batch) override { + record_batches_.push_back(std::move(record_batch)); + return Status::OK(); + } + + /// \return the read schema + std::shared_ptr schema() const { return schema_; } + + /// \return the all read record batches + std::vector> record_batches() const { + return record_batches_; + } + + private: + std::shared_ptr schema_; + std::vector> record_batches_; +}; + +/// \class RecordBatchStreamEmitter +/// \brief Push style record batch stream reader that receives data +/// from user. +/// +/// This class reads the schema (plus any dictionaries) as the first +/// messages in the fed data, followed by record batches. +class ARROW_EXPORT RecordBatchStreamEmitter { + public: + /// \brief Construct a record batch emitter. + /// + /// \param[in] receiver a Receiver that must implement + /// Receiver::MessageReceived to receive read record batches + /// \param[in] options any IPC reading options (optional) + RecordBatchStreamEmitter(std::shared_ptr receiver, + const IpcReadOptions& options = IpcReadOptions::Defaults()); + + virtual ~RecordBatchStreamEmitter(); + + /// \brief Feed data to the emitter as a raw data. + /// + /// If the emitter can read one or more record batches by the data, + /// the emitter emits read record batches by calling + /// receiver->Receive() multiple times. + /// + /// \param[in] data a raw data to be processed. This data isn't + /// copied. The passed memory must be kept alive through record + /// batch processing. + /// \param[in] size raw data size. + /// \return Status + Status Consume(const uint8_t* data, int64_t size); + + /// \brief Feed data to the emitter as a Buffer. + /// + /// If the emitter can read one or more record batches by the + /// Buffer, the emitter emits read record batches by calling + /// receiver->Receive() multiple times. + /// + /// \param[in] buffer a Buffer to be processed. + /// \return Status + Status Consume(std::shared_ptr buffer); + + /// \return the shared schema of the record batches in the stream + std::shared_ptr schema() const; + + /// \brief Return the number of bytes needed to advance the state of + /// the emitter. + /// + /// This method is provided for users who want to optimize performance. + /// Normal users don't need to use this method. + /// + /// Here is an example usage for normal users: + /// + /// ~~~{.cpp} + /// emitter.Consume(buffer1); + /// emitter.Consume(buffer2); + /// emitter.Consume(buffer3); + /// ~~~ + /// + /// Emitter has internal buffer. If consumed data isn't enough to + /// advance the state of the emitter, consumed data is buffered to + /// the internal buffer. It causes performance overhead. + /// + /// If you pass next_required_size() size data to each Consume() + /// call, the emitter doesn't use its internal buffer. It improves + /// performance. + /// + /// Here is an example usage to avoid using internal buffer: + /// + /// ~~~{.cpp} + /// buffer1 = get_data(emitter.next_required_size()); + /// emitter.Consume(buffer1); + /// buffer2 = get_data(emitter.next_required_size()); + /// emitter.Consume(buffer2); + /// ~~~ + /// + /// Users can use this method to avoid creating small chunks. Record + /// batch data must be contiguous data. If users pass small chunks + /// to the emitter, the emitter needs concatenate small chunks + /// internally. It causes performance overhead. + /// + /// Here is an example usage to reduce small chunks: + /// + /// ~~~{.cpp} + /// buffer = AllocateResizableBuffer(); + /// while ((small_chunk = get_data(&small_chunk_size))) { + /// auto current_buffer_size = buffer->size(); + /// buffer->Resize(current_buffer_size + small_chunk_size); + /// memcpy(buffer->mutable_data() + current_buffer_size, + /// small_chunk, + /// small_chunk_size); + /// if (buffer->size() < emitter.next_requied_size()) { + /// continue; + /// } + /// std::shared_ptr chunk(buffer.release()); + /// emitter.Consume(chunk); + /// buffer = AllocateResizableBuffer(); + /// } + /// if (buffer->size() > 0) { + /// std::shared_ptr chunk(buffer.release()); + /// emitter.Consume(chunk); + /// } + /// ~~~ + /// + /// \return the number of bytes needed to advance the state of the + /// emitter + int64_t next_required_size() const; + + private: + class RecordBatchStreamEmitterImpl; + std::unique_ptr impl_; + + ARROW_DISALLOW_COPY_AND_ASSIGN(RecordBatchStreamEmitter); +}; + // Generic read functions; does not copy data if the input supports zero copy reads /// \brief Read Schema from stream serialized as a single IPC message diff --git a/cpp/src/arrow/util/receiver.cc b/cpp/src/arrow/util/receiver.cc new file mode 100644 index 00000000000..539bf90e070 --- /dev/null +++ b/cpp/src/arrow/util/receiver.cc @@ -0,0 +1,30 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "arrow/util/receiver.h" + +namespace arrow { + +Status Receiver::EosReceived() { return Status::OK(); } + +Status Receiver::SchemaReceived(std::shared_ptr schema) { return Status::OK(); } + +Status Receiver::RecordBatchReceived(std::shared_ptr record_batch) { + return Status::NotImplemented("RecordBatch receiver isn't implemented"); +} + +} // namespace arrow diff --git a/cpp/src/arrow/util/receiver.h b/cpp/src/arrow/util/receiver.h new file mode 100644 index 00000000000..f577ff5822d --- /dev/null +++ b/cpp/src/arrow/util/receiver.h @@ -0,0 +1,69 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include + +#include "arrow/status.h" +#include "arrow/util/visibility.h" + +namespace arrow { + +class RecordBatch; +class Schema; + +/// \class Receiver +/// \brief A general receiver class to receive objects. +/// +/// You must implement receiver methods for objects you want to receive. +class ARROW_EXPORT Receiver { + public: + virtual ~Receiver() = default; + + /// \brief Called when end-of-stream is received. + /// + /// The default implementation just returns arrow::Status::OK(). + /// + /// \return Status + /// + /// \see RecordBatchStreamEmitter + virtual Status EosReceived(); + + /// \brief Called when a record batch is received. + /// + /// The default implementation just returns + /// arrow::Status::NotImplemented(). + /// + /// \param[in] record_batch a record batch received + /// \return Status + /// + /// \see RecordBatchStreamEmitter + virtual Status RecordBatchReceived(std::shared_ptr record_batch); + + /// \brief Called when a schema is received. + /// + /// The default implementation just returns arrow::Status::OK(). + /// + /// \param[in] schema a schema received + /// \return Status + /// + /// \see RecordBatchStreamEmitter + virtual Status SchemaReceived(std::shared_ptr schema); +}; + +} // namespace arrow From 20b0aac125e5ec8a76c3e85994b6d57ecb053593 Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Fri, 10 Apr 2020 05:37:47 +0900 Subject: [PATCH 02/12] Add explicit std::move() --- cpp/src/arrow/ipc/message.cc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cpp/src/arrow/ipc/message.cc b/cpp/src/arrow/ipc/message.cc index 19fda228d79..b96c5efbbc9 100644 --- a/cpp/src/arrow/ipc/message.cc +++ b/cpp/src/arrow/ipc/message.cc @@ -195,7 +195,7 @@ Result> Message::ReadFrom(std::shared_ptr metad " bytes for message body, got ", body->size()); } RETURN_NOT_OK(emitter.Consume(body)); - return result; + return std::move(result); } Result> Message::ReadFrom(const int64_t offset, @@ -212,7 +212,7 @@ Result> Message::ReadFrom(const int64_t offset, " bytes for message body, got ", body->size()); } RETURN_NOT_OK(emitter.Consume(body)); - return result; + return std::move(result); } Status WritePadding(io::OutputStream* stream, int64_t nbytes) { @@ -387,7 +387,7 @@ Result> ReadMessage(io::InputStream* file, MemoryPool* if (!message) { return Status::Invalid("Failed to read message"); } - return message; + return std::move(message); } Status WriteMessage(const Buffer& message, const IpcWriteOptions& options, From 76e7aba0c104f3c85ac3a97fd016fd1dfad125de Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Fri, 10 Apr 2020 06:16:46 +0900 Subject: [PATCH 03/12] Rename to "decoder" from "emitter" --- cpp/src/arrow/ipc/message.cc | 120 +++++++++++----------- cpp/src/arrow/ipc/message.h | 104 ++++++++++++------- cpp/src/arrow/ipc/read_write_benchmark.cc | 4 +- cpp/src/arrow/ipc/read_write_test.cc | 86 ++++++++-------- cpp/src/arrow/ipc/reader.cc | 32 +++--- cpp/src/arrow/ipc/reader.h | 76 +++++++------- 6 files changed, 226 insertions(+), 196 deletions(-) diff --git a/cpp/src/arrow/ipc/message.cc b/cpp/src/arrow/ipc/message.cc index b96c5efbbc9..5f73522897b 100644 --- a/cpp/src/arrow/ipc/message.cc +++ b/cpp/src/arrow/ipc/message.cc @@ -186,15 +186,15 @@ Result> Message::ReadFrom(std::shared_ptr metad io::InputStream* stream) { std::unique_ptr result; auto receiver = std::make_shared(&result); - MessageEmitter emitter(receiver, MessageEmitter::State::METADATA, metadata->size()); - ARROW_RETURN_NOT_OK(emitter.Consume(metadata)); + MessageDecoder decoder(receiver, MessageDecoder::State::METADATA, metadata->size()); + ARROW_RETURN_NOT_OK(decoder.Consume(metadata)); - ARROW_ASSIGN_OR_RAISE(auto body, stream->Read(emitter.next_required_size())); - if (body->size() < emitter.next_required_size()) { - return Status::IOError("Expected to be able to read ", emitter.next_required_size(), + ARROW_ASSIGN_OR_RAISE(auto body, stream->Read(decoder.next_required_size())); + if (body->size() < decoder.next_required_size()) { + return Status::IOError("Expected to be able to read ", decoder.next_required_size(), " bytes for message body, got ", body->size()); } - RETURN_NOT_OK(emitter.Consume(body)); + RETURN_NOT_OK(decoder.Consume(body)); return std::move(result); } @@ -203,15 +203,15 @@ Result> Message::ReadFrom(const int64_t offset, io::RandomAccessFile* file) { std::unique_ptr result; auto receiver = std::make_shared(&result); - MessageEmitter emitter(receiver, MessageEmitter::State::METADATA, metadata->size()); - ARROW_RETURN_NOT_OK(emitter.Consume(metadata)); + MessageDecoder decoder(receiver, MessageDecoder::State::METADATA, metadata->size()); + ARROW_RETURN_NOT_OK(decoder.Consume(metadata)); - ARROW_ASSIGN_OR_RAISE(auto body, file->ReadAt(offset, emitter.next_required_size())); - if (body->size() < emitter.next_required_size()) { - return Status::IOError("Expected to be able to read ", emitter.next_required_size(), + ARROW_ASSIGN_OR_RAISE(auto body, file->ReadAt(offset, decoder.next_required_size())); + if (body->size() < decoder.next_required_size()) { + return Status::IOError("Expected to be able to read ", decoder.next_required_size(), " bytes for message body, got ", body->size()); } - RETURN_NOT_OK(emitter.Consume(body)); + RETURN_NOT_OK(decoder.Consume(body)); return std::move(result); } @@ -268,11 +268,11 @@ Result> ReadMessage(int64_t offset, int32_t metadata_le io::RandomAccessFile* file) { std::unique_ptr result; auto receiver = std::make_shared(&result); - MessageEmitter emitter(receiver); + MessageDecoder decoder(receiver); - if (metadata_length < emitter.next_required_size()) { + if (metadata_length < decoder.next_required_size()) { return Status::Invalid("metadata_length should be at least ", - emitter.next_required_size()); + decoder.next_required_size()); } ARROW_ASSIGN_OR_RAISE(auto metadata, file->ReadAt(offset, metadata_length)); @@ -280,33 +280,33 @@ Result> ReadMessage(int64_t offset, int32_t metadata_le return Status::Invalid("Expected to read ", metadata_length, " metadata bytes but got ", metadata->size()); } - ARROW_RETURN_NOT_OK(emitter.Consume(metadata)); + ARROW_RETURN_NOT_OK(decoder.Consume(metadata)); - switch (emitter.state()) { - case MessageEmitter::State::INITIAL: + switch (decoder.state()) { + case MessageDecoder::State::INITIAL: return result; - case MessageEmitter::State::METADATA_LENGTH: + case MessageDecoder::State::METADATA_LENGTH: return Status::Invalid("metadata length is missing. File offset: ", offset, ", metadata length: ", metadata_length); - case MessageEmitter::State::METADATA: - return Status::Invalid("flatbuffer size ", emitter.next_required_size(), + case MessageDecoder::State::METADATA: + return Status::Invalid("flatbuffer size ", decoder.next_required_size(), " invalid. File offset: ", offset, ", metadata length: ", metadata_length); - case MessageEmitter::State::BODY: { + case MessageDecoder::State::BODY: { ARROW_ASSIGN_OR_RAISE(auto body, file->ReadAt(offset + metadata_length, - emitter.next_required_size())); - if (body->size() < emitter.next_required_size()) { + decoder.next_required_size())); + if (body->size() < decoder.next_required_size()) { return Status::IOError("Expected to be able to read ", - emitter.next_required_size(), + decoder.next_required_size(), " bytes for message body, got ", body->size()); } - RETURN_NOT_OK(emitter.Consume(body)); + RETURN_NOT_OK(decoder.Consume(body)); return result; } - case MessageEmitter::State::EOS: + case MessageDecoder::State::EOS: return Status::Invalid("Unexpected empty message in IPC file format"); default: - return Status::Invalid("Unexpected state: ", emitter.state()); + return Status::Invalid("Unexpected state: ", decoder.state()); } } @@ -337,7 +337,7 @@ Status CheckAligned(io::FileInterface* stream, int32_t alignment) { Result> ReadMessage(io::InputStream* file, MemoryPool* pool) { std::unique_ptr message; auto receiver = std::make_shared(&message); - MessageEmitter emitter(receiver, pool); + MessageDecoder decoder(receiver, pool); { uint8_t continuation[sizeof(int32_t)]; @@ -345,43 +345,43 @@ Result> ReadMessage(io::InputStream* file, MemoryPool* if (bytes_read == 0) { // EOS without indication return nullptr; - } else if (bytes_read != emitter.next_required_size()) { + } else if (bytes_read != decoder.next_required_size()) { return Status::Invalid("Corrupted message, only ", bytes_read, " bytes available"); } - ARROW_RETURN_NOT_OK(emitter.Consume(continuation, bytes_read)); + ARROW_RETURN_NOT_OK(decoder.Consume(continuation, bytes_read)); } - if (emitter.state() == MessageEmitter::State::METADATA_LENGTH) { + if (decoder.state() == MessageDecoder::State::METADATA_LENGTH) { // Valid IPC message, read the message length now uint8_t metadata_length[sizeof(int32_t)]; ARROW_ASSIGN_OR_RAISE(int64_t bytes_read, file->Read(sizeof(int32_t), &metadata_length)); - if (bytes_read != emitter.next_required_size()) { + if (bytes_read != decoder.next_required_size()) { return Status::Invalid("Corrupted metadata length, only ", bytes_read, " bytes available"); } - ARROW_RETURN_NOT_OK(emitter.Consume(metadata_length, bytes_read)); + ARROW_RETURN_NOT_OK(decoder.Consume(metadata_length, bytes_read)); } - if (emitter.state() == MessageEmitter::State::EOS) { + if (decoder.state() == MessageDecoder::State::EOS) { return nullptr; } - auto metadata_length = emitter.next_required_size(); + auto metadata_length = decoder.next_required_size(); ARROW_ASSIGN_OR_RAISE(auto metadata, file->Read(metadata_length)); if (metadata->size() != metadata_length) { return Status::Invalid("Expected to read ", metadata_length, " metadata bytes, but ", "only read ", metadata->size()); } - RETURN_NOT_OK(emitter.Consume(metadata)); + RETURN_NOT_OK(decoder.Consume(metadata)); - if (emitter.state() == MessageEmitter::State::BODY) { - ARROW_ASSIGN_OR_RAISE(auto body, file->Read(emitter.next_required_size())); - if (body->size() < emitter.next_required_size()) { - return Status::IOError("Expected to be able to read ", emitter.next_required_size(), + if (decoder.state() == MessageDecoder::State::BODY) { + ARROW_ASSIGN_OR_RAISE(auto body, file->Read(decoder.next_required_size())); + if (body->size() < decoder.next_required_size()) { + return Status::IOError("Expected to be able to read ", decoder.next_required_size(), " bytes for message body, got ", body->size()); } - ARROW_RETURN_NOT_OK(emitter.Consume(body)); + ARROW_RETURN_NOT_OK(decoder.Consume(body)); } if (!message) { @@ -423,14 +423,14 @@ Status WriteMessage(const Buffer& message, const IpcWriteOptions& options, } // ---------------------------------------------------------------------- -// Implement MessageEmitter +// Implement MessageDecoder -static constexpr auto kMessageEmitterNextRequiredSizeInitial = sizeof(int32_t); -static constexpr auto kMessageEmitterNextRequiredSizeMetadataLength = sizeof(int32_t); +static constexpr auto kMessageDecoderNextRequiredSizeInitial = sizeof(int32_t); +static constexpr auto kMessageDecoderNextRequiredSizeMetadataLength = sizeof(int32_t); -class MessageEmitter::MessageEmitterImpl { +class MessageDecoder::MessageDecoderImpl { public: - explicit MessageEmitterImpl(std::shared_ptr receiver, + explicit MessageDecoderImpl(std::shared_ptr receiver, State initial_state, int64_t initial_next_required_size, MemoryPool* pool) : receiver_(std::move(receiver)), @@ -525,7 +525,7 @@ class MessageEmitter::MessageEmitterImpl { int64_t next_required_size() const { return next_required_size_ - buffered_size_; } - MessageEmitter::State state() const { return state_; } + MessageDecoder::State state() const { return state_; } private: Status ConsumeChunks() { @@ -573,7 +573,7 @@ class MessageEmitter::MessageEmitterImpl { Status ConsumeInitial(int32_t continuation) { if (continuation == internal::kIpcContinuationToken) { state_ = State::METADATA_LENGTH; - next_required_size_ = kMessageEmitterNextRequiredSizeMetadataLength; + next_required_size_ = kMessageDecoderNextRequiredSizeMetadataLength; // Valid IPC message, read the message length now return Status::OK(); } else if (continuation == 0) { @@ -700,7 +700,7 @@ class MessageEmitter::MessageEmitterImpl { Message::Open(metadata_, *buffer)); state_ = State::INITIAL; - next_required_size_ = kMessageEmitterNextRequiredSizeInitial; + next_required_size_ = kMessageDecoderNextRequiredSizeInitial; RETURN_NOT_OK(receiver_->Received(std::move(message))); return Status::OK(); } @@ -756,32 +756,32 @@ class MessageEmitter::MessageEmitterImpl { std::shared_ptr metadata_; // Must be CPU buffer }; -MessageEmitter::MessageEmitter(std::shared_ptr receiver, +MessageDecoder::MessageDecoder(std::shared_ptr receiver, MemoryPool* pool) { - impl_.reset(new MessageEmitterImpl(std::move(receiver), State::INITIAL, - kMessageEmitterNextRequiredSizeInitial, pool)); + impl_.reset(new MessageDecoderImpl(std::move(receiver), State::INITIAL, + kMessageDecoderNextRequiredSizeInitial, pool)); } -MessageEmitter::MessageEmitter(std::shared_ptr receiver, +MessageDecoder::MessageDecoder(std::shared_ptr receiver, State initial_state, int64_t initial_next_required_size, MemoryPool* pool) { - impl_.reset(new MessageEmitterImpl(std::move(receiver), initial_state, + impl_.reset(new MessageDecoderImpl(std::move(receiver), initial_state, initial_next_required_size, pool)); } -MessageEmitter::~MessageEmitter() {} +MessageDecoder::~MessageDecoder() {} -Status MessageEmitter::Consume(const uint8_t* data, int64_t size) { +Status MessageDecoder::Consume(const uint8_t* data, int64_t size) { return impl_->ConsumeData(data, size); } -Status MessageEmitter::Consume(std::shared_ptr buffer) { +Status MessageDecoder::Consume(std::shared_ptr buffer) { return impl_->ConsumeBuffer(&buffer); } -int64_t MessageEmitter::next_required_size() const { return impl_->next_required_size(); } +int64_t MessageDecoder::next_required_size() const { return impl_->next_required_size(); } -MessageEmitter::State MessageEmitter::state() const { return impl_->state(); } +MessageDecoder::State MessageDecoder::state() const { return impl_->state(); } // ---------------------------------------------------------------------- // Implement InputStream message reader diff --git a/cpp/src/arrow/ipc/message.h b/cpp/src/arrow/ipc/message.h index 49c7eada8e2..8d4a0c1e11c 100644 --- a/cpp/src/arrow/ipc/message.h +++ b/cpp/src/arrow/ipc/message.h @@ -221,9 +221,9 @@ class ARROW_EXPORT MessageReceiverAssign : public MessageReceiver { ARROW_DISALLOW_COPY_AND_ASSIGN(MessageReceiverAssign); }; -/// \class MessageEmitter -/// \brief Push style message reader that receives data from user. -class ARROW_EXPORT MessageEmitter { +/// \class MessageDecoder +/// \brief Push style message decoder that receives data from user. +class ARROW_EXPORT MessageDecoder { public: /// \brief State for reading a message enum State { @@ -248,35 +248,35 @@ class ARROW_EXPORT MessageEmitter { EOS, }; - /// \brief Construct a message emitter. + /// \brief Construct a message decoder. /// - /// \param[in] receiver a MessageReceiver that receives read messages + /// \param[in] receiver a MessageReceiver that receives decoded messages /// \param[in] pool an optional MemoryPool to copy metadata on the /// CPU, if required - explicit MessageEmitter(std::shared_ptr receiver, + explicit MessageDecoder(std::shared_ptr receiver, MemoryPool* pool = default_memory_pool()); - /// \brief Construct a message emitter with the specified state. + /// \brief Construct a message decoder with the specified state. /// - /// This is a construct for advanced users that know how to read + /// This is a construct for advanced users that know how to decode /// Message. /// - /// \param[in] receiver a MessageReceiver that receives read messages - /// \param[in] initial_state an initial state of the emitter + /// \param[in] receiver a MessageReceiver that receives decoded messages + /// \param[in] initial_state an initial state of the decode /// \param[in] initial_next_required_size the number of bytes needed /// to run the next action /// \param[in] pool an optional MemoryPool to copy metadata on the /// CPU, if required - MessageEmitter(std::shared_ptr receiver, State initial_state, + MessageDecoder(std::shared_ptr receiver, State initial_state, int64_t initial_next_required_size, MemoryPool* pool = default_memory_pool()); - virtual ~MessageEmitter(); + virtual ~MessageDecoder(); - /// \brief Feed data to the emitter as a raw data. + /// \brief Feed data to the decoder as a raw data. /// - /// If the emitter can read one or more messages by the data, the - /// emitter emits read batches by calling receiver->Receive() + /// If the decoder can decode one or more messages by the data, the + /// decoder calls receiver->Receive() with a decoded message /// multiple times. /// /// \param[in] data a raw data to be processed. This data isn't @@ -286,10 +286,10 @@ class ARROW_EXPORT MessageEmitter { /// \return Status Status Consume(const uint8_t* data, int64_t size); - /// \brief Feed data to the emitter as a Buffer. + /// \brief Feed data to the decoder as a Buffer. /// - /// If the emitter can read one or more messages by the Buffer, the - /// emitter emits read messages by calling receiver->Receive() + /// If the decoder can decode one or more messages by the Buffer, + /// the decoder calls receiver->Receive() with a decoded message /// multiple times. /// /// \param[in] buffer a Buffer to be processed. @@ -297,7 +297,7 @@ class ARROW_EXPORT MessageEmitter { Status Consume(std::shared_ptr buffer); /// \brief Return the number of bytes needed to advance the state of - /// the emitter. + /// the decoder. /// /// This method is provided for users who want to optimize performance. /// Normal users don't need to use this method. @@ -305,31 +305,31 @@ class ARROW_EXPORT MessageEmitter { /// Here is an example usage for normal users: /// /// ~~~{.cpp} - /// emitter.Consume(buffer1); - /// emitter.Consume(buffer2); - /// emitter.Consume(buffer3); + /// decoder.Consume(buffer1); + /// decoder.Consume(buffer2); + /// decoder.Consume(buffer3); /// ~~~ /// - /// Emitter has internal buffer. If consumed data isn't enough to - /// advance the state of the emitter, consumed data is buffered to + /// Decoder has internal buffer. If consumed data isn't enough to + /// advance the state of the decoder, consumed data is buffered to /// the internal buffer. It causes performance overhead. /// /// If you pass next_required_size() size data to each Consume() - /// call, the emitter doesn't use its internal buffer. It improves + /// call, the decoder doesn't use its internal buffer. It improves /// performance. /// /// Here is an example usage to avoid using internal buffer: /// /// ~~~{.cpp} - /// buffer1 = get_data(emitter.next_required_size()); - /// emitter.Consume(buffer1); - /// buffer2 = get_data(emitter.next_required_size()); - /// emitter.Consume(buffer2); + /// buffer1 = get_data(decoder.next_required_size()); + /// decoder.Consume(buffer1); + /// buffer2 = get_data(decoder.next_required_size()); + /// decoder.Consume(buffer2); /// ~~~ /// /// Users can use this method to avoid creating small /// chunks. Message body must be contiguous data. If users pass - /// small chunks to the emitter, the emitter needs concatenate small + /// small chunks to the decoder, the decoder needs concatenate small /// chunks internally. It causes performance overhead. /// /// Here is an example usage to reduce small chunks: @@ -342,31 +342,61 @@ class ARROW_EXPORT MessageEmitter { /// memcpy(buffer->mutable_data() + current_buffer_size, /// small_chunk, /// small_chunk_size); - /// if (buffer->size() < emitter.next_requied_size()) { + /// if (buffer->size() < decoder.next_requied_size()) { /// continue; /// } /// std::shared_ptr chunk(buffer.release()); - /// emitter.Consume(chunk); + /// decoder.Consume(chunk); /// buffer = AllocateResizableBuffer(); /// } /// if (buffer->size() > 0) { /// std::shared_ptr chunk(buffer.release()); - /// emitter.Consume(chunk); + /// decoder.Consume(chunk); /// } /// ~~~ /// /// \return the number of bytes needed to advance the state of the - /// emitter + /// decoder int64_t next_required_size() const; + /// \brief Return the current state of the decoder. + /// + /// This method is provided for users who want to optimize performance. + /// Normal users don't need to use this method. + /// + /// Decoder doesn't need Buffer to process data on the + /// MessageDecoder::State::INITIAL state and the + /// MessageDecoder::State::METADATA_LENGTH. Creating Buffer has + /// performance overhead. Advanced users can avoid creating Buffer + /// by checking the current state of the decoder: + /// + /// ~~~{.cpp} + /// switch (decoder.state()) { + /// MessageDecoder::State::INITIAL: + /// MessageDecoder::State::METADATA_LENGTH: + /// { + /// uint8_t data[sizeof(int32_t)]; + /// auto data_size = input->Read(decoder.next_required_size(), data); + /// decoder.Consume(data, data_size); + /// } + /// break; + /// default: + /// { + /// auto buffer = input->Read(decoder.next_required_size()); + /// decoder.Consume(buffer); + /// } + /// break; + /// } + /// ~~~ + /// /// \return the current state State state() const; private: - class MessageEmitterImpl; - std::unique_ptr impl_; + class MessageDecoderImpl; + std::unique_ptr impl_; - ARROW_DISALLOW_COPY_AND_ASSIGN(MessageEmitter); + ARROW_DISALLOW_COPY_AND_ASSIGN(MessageDecoder); }; /// \brief Abstract interface for a sequence of messages diff --git a/cpp/src/arrow/ipc/read_write_benchmark.cc b/cpp/src/arrow/ipc/read_write_benchmark.cc index bc971e278df..11c1253089b 100644 --- a/cpp/src/arrow/ipc/read_write_benchmark.cc +++ b/cpp/src/arrow/ipc/read_write_benchmark.cc @@ -146,10 +146,10 @@ static void EmitStream(benchmark::State& state) { // NOLINT non-const reference return Status::OK(); } } receiver; - ipc::RecordBatchStreamEmitter emitter( + ipc::StreamDecoder decoder( std::shared_ptr(&receiver), ipc::IpcReadOptions::Defaults()); - ABORT_NOT_OK(emitter.Consume(buffer)); + ABORT_NOT_OK(decoder.Consume(buffer)); } state.SetBytesProcessed(int64_t(state.iterations()) * kTotalSize); } diff --git a/cpp/src/arrow/ipc/read_write_test.cc b/cpp/src/arrow/ipc/read_write_test.cc index 5629df795ee..cbe3ba81c5a 100644 --- a/cpp/src/arrow/ipc/read_write_test.cc +++ b/cpp/src/arrow/ipc/read_write_test.cc @@ -884,48 +884,48 @@ struct StreamWriterHelper { std::shared_ptr writer_; }; -struct StreamEmitterDataWriterHelper : public StreamWriterHelper { +struct StreamDecoderDataWriterHelper : public StreamWriterHelper { Status ReadBatches(const IpcReadOptions& options, BatchVector* out_batches) { auto receiver = std::make_shared(); - RecordBatchStreamEmitter emitter(receiver, options); - ARROW_RETURN_NOT_OK(emitter.Consume(buffer_->data(), buffer_->size())); + StreamDecoder decoder(receiver, options); + ARROW_RETURN_NOT_OK(decoder.Consume(buffer_->data(), buffer_->size())); *out_batches = receiver->record_batches(); return Status::OK(); } Status ReadSchema(std::shared_ptr* out) { auto receiver = std::make_shared(); - RecordBatchStreamEmitter emitter(receiver); - ARROW_RETURN_NOT_OK(emitter.Consume(buffer_->data(), buffer_->size())); + StreamDecoder decoder(receiver); + ARROW_RETURN_NOT_OK(decoder.Consume(buffer_->data(), buffer_->size())); *out = receiver->schema(); return Status::OK(); } }; -struct StreamEmitterBufferWriterHelper : public StreamWriterHelper { +struct StreamDecoderBufferWriterHelper : public StreamWriterHelper { Status ReadBatches(const IpcReadOptions& options, BatchVector* out_batches) { auto receiver = std::make_shared(); - RecordBatchStreamEmitter emitter(receiver, options); - ARROW_RETURN_NOT_OK(emitter.Consume(buffer_)); + StreamDecoder decoder(receiver, options); + ARROW_RETURN_NOT_OK(decoder.Consume(buffer_)); *out_batches = receiver->record_batches(); return Status::OK(); } Status ReadSchema(std::shared_ptr* out) { auto receiver = std::make_shared(); - RecordBatchStreamEmitter emitter(receiver); - ARROW_RETURN_NOT_OK(emitter.Consume(buffer_)); + StreamDecoder decoder(receiver); + ARROW_RETURN_NOT_OK(decoder.Consume(buffer_)); *out = receiver->schema(); return Status::OK(); } }; -struct StreamEmitterSmallChunksWriterHelper : public StreamWriterHelper { +struct StreamDecoderSmallChunksWriterHelper : public StreamWriterHelper { Status ReadBatches(const IpcReadOptions& options, BatchVector* out_batches) { auto receiver = std::make_shared(); - RecordBatchStreamEmitter emitter(receiver, options); + StreamDecoder decoder(receiver, options); for (int64_t offset = 0; offset < buffer_->size() - 1; ++offset) { - ARROW_RETURN_NOT_OK(emitter.Consume(buffer_->data() + offset, 1)); + ARROW_RETURN_NOT_OK(decoder.Consume(buffer_->data() + offset, 1)); } *out_batches = receiver->record_batches(); return Status::OK(); @@ -933,30 +933,30 @@ struct StreamEmitterSmallChunksWriterHelper : public StreamWriterHelper { Status ReadSchema(std::shared_ptr* out) { auto receiver = std::make_shared(); - RecordBatchStreamEmitter emitter(receiver); + StreamDecoder decoder(receiver); for (int64_t offset = 0; offset < buffer_->size() - 1; ++offset) { - ARROW_RETURN_NOT_OK(emitter.Consume(buffer_->data() + offset, 1)); + ARROW_RETURN_NOT_OK(decoder.Consume(buffer_->data() + offset, 1)); } *out = receiver->schema(); return Status::OK(); } }; -struct StreamEmitterLargeChunksWriterHelper : public StreamWriterHelper { +struct StreamDecoderLargeChunksWriterHelper : public StreamWriterHelper { Status ReadBatches(const IpcReadOptions& options, BatchVector* out_batches) { auto receiver = std::make_shared(); - RecordBatchStreamEmitter emitter(receiver, options); - ARROW_RETURN_NOT_OK(emitter.Consume(SliceBuffer(buffer_, 0, 1))); - ARROW_RETURN_NOT_OK(emitter.Consume(SliceBuffer(buffer_, 1))); + StreamDecoder decoder(receiver, options); + ARROW_RETURN_NOT_OK(decoder.Consume(SliceBuffer(buffer_, 0, 1))); + ARROW_RETURN_NOT_OK(decoder.Consume(SliceBuffer(buffer_, 1))); *out_batches = receiver->record_batches(); return Status::OK(); } Status ReadSchema(std::shared_ptr* out) { auto receiver = std::make_shared(); - RecordBatchStreamEmitter emitter(receiver); - ARROW_RETURN_NOT_OK(emitter.Consume(SliceBuffer(buffer_, 0, 1))); - ARROW_RETURN_NOT_OK(emitter.Consume(SliceBuffer(buffer_, 1))); + StreamDecoder decoder(receiver); + ARROW_RETURN_NOT_OK(decoder.Consume(SliceBuffer(buffer_, 0, 1))); + ARROW_RETURN_NOT_OK(decoder.Consume(SliceBuffer(buffer_, 1))); *out = receiver->schema(); return Status::OK(); } @@ -1162,15 +1162,15 @@ class TestFileFormat : public ReaderWriterMixin, class TestStreamFormat : public ReaderWriterMixin, public ::testing::TestWithParam {}; -class TestStreamEmitterData : public ReaderWriterMixin, +class TestStreamDecoderData : public ReaderWriterMixin, public ::testing::TestWithParam {}; -class TestStreamEmitterBuffer : public ReaderWriterMixin, +class TestStreamDecoderBuffer : public ReaderWriterMixin, public ::testing::TestWithParam {}; -class TestStreamEmitterSmallChunks - : public ReaderWriterMixin, +class TestStreamDecoderSmallChunks + : public ReaderWriterMixin, public ::testing::TestWithParam {}; -class TestStreamEmitterLargeChunks - : public ReaderWriterMixin, +class TestStreamDecoderLargeChunks + : public ReaderWriterMixin, public ::testing::TestWithParam {}; TEST_P(TestFileFormat, RoundTrip) { @@ -1193,7 +1193,7 @@ TEST_P(TestStreamFormat, RoundTrip) { TestZeroLengthRoundTrip(*GetParam(), options); } -TEST_P(TestStreamEmitterData, RoundTrip) { +TEST_P(TestStreamDecoderData, RoundTrip) { TestRoundTrip(*GetParam(), IpcWriteOptions::Defaults()); TestZeroLengthRoundTrip(*GetParam(), IpcWriteOptions::Defaults()); @@ -1203,7 +1203,7 @@ TEST_P(TestStreamEmitterData, RoundTrip) { TestZeroLengthRoundTrip(*GetParam(), options); } -TEST_P(TestStreamEmitterBuffer, RoundTrip) { +TEST_P(TestStreamDecoderBuffer, RoundTrip) { TestRoundTrip(*GetParam(), IpcWriteOptions::Defaults()); TestZeroLengthRoundTrip(*GetParam(), IpcWriteOptions::Defaults()); @@ -1213,7 +1213,7 @@ TEST_P(TestStreamEmitterBuffer, RoundTrip) { TestZeroLengthRoundTrip(*GetParam(), options); } -TEST_P(TestStreamEmitterSmallChunks, RoundTrip) { +TEST_P(TestStreamDecoderSmallChunks, RoundTrip) { TestRoundTrip(*GetParam(), IpcWriteOptions::Defaults()); TestZeroLengthRoundTrip(*GetParam(), IpcWriteOptions::Defaults()); @@ -1223,7 +1223,7 @@ TEST_P(TestStreamEmitterSmallChunks, RoundTrip) { TestZeroLengthRoundTrip(*GetParam(), options); } -TEST_P(TestStreamEmitterLargeChunks, RoundTrip) { +TEST_P(TestStreamDecoderLargeChunks, RoundTrip) { TestRoundTrip(*GetParam(), IpcWriteOptions::Defaults()); TestZeroLengthRoundTrip(*GetParam(), IpcWriteOptions::Defaults()); @@ -1236,14 +1236,14 @@ TEST_P(TestStreamEmitterLargeChunks, RoundTrip) { INSTANTIATE_TEST_SUITE_P(GenericIpcRoundTripTests, TestIpcRoundTrip, BATCH_CASES()); INSTANTIATE_TEST_SUITE_P(FileRoundTripTests, TestFileFormat, BATCH_CASES()); INSTANTIATE_TEST_SUITE_P(StreamRoundTripTests, TestStreamFormat, BATCH_CASES()); -INSTANTIATE_TEST_SUITE_P(StreamEmitterDataRoundTripTests, TestStreamEmitterData, +INSTANTIATE_TEST_SUITE_P(StreamDecoderDataRoundTripTests, TestStreamDecoderData, BATCH_CASES()); -INSTANTIATE_TEST_SUITE_P(StreamEmitterBufferRoundTripTests, TestStreamEmitterBuffer, +INSTANTIATE_TEST_SUITE_P(StreamDecoderBufferRoundTripTests, TestStreamDecoderBuffer, BATCH_CASES()); -INSTANTIATE_TEST_SUITE_P(StreamEmitterSmallChunksRoundTripTests, - TestStreamEmitterSmallChunks, BATCH_CASES()); -INSTANTIATE_TEST_SUITE_P(StreamEmitterLargeChunksRoundTripTests, - TestStreamEmitterLargeChunks, BATCH_CASES()); +INSTANTIATE_TEST_SUITE_P(StreamDecoderSmallChunksRoundTripTests, + TestStreamDecoderSmallChunks, BATCH_CASES()); +INSTANTIATE_TEST_SUITE_P(StreamDecoderLargeChunksRoundTripTests, + TestStreamDecoderLargeChunks, BATCH_CASES()); TEST(TestIpcFileFormat, FooterMetaData) { // ARROW-6837 @@ -1844,13 +1844,13 @@ TEST(TestRecordBatchStreamReader, MalformedInput) { ASSERT_RAISES(Invalid, RecordBatchStreamReader::Open(&garbage_reader)); } -TEST(TestRecordBatchStreamEmitter, NextRequiredSize) { +TEST(TestStreamDecoder, NextRequiredSize) { auto receiver = std::make_shared(); - RecordBatchStreamEmitter emitter(receiver); - auto next_required_size = emitter.next_required_size(); + StreamDecoder decoder(receiver); + auto next_required_size = decoder.next_required_size(); const uint8_t data[1] = {0}; - ASSERT_OK(emitter.Consume(data, 1)); - ASSERT_EQ(next_required_size - 1, emitter.next_required_size()); + ASSERT_OK(decoder.Consume(data, 1)); + ASSERT_EQ(next_required_size - 1, decoder.next_required_size()); } // ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/ipc/reader.cc b/cpp/src/arrow/ipc/reader.cc index 9ae5a292906..c0231c0a8da 100644 --- a/cpp/src/arrow/ipc/reader.cc +++ b/cpp/src/arrow/ipc/reader.cc @@ -941,7 +941,7 @@ Result> RecordBatchFileReader::Open( return result; } -class RecordBatchStreamEmitter::RecordBatchStreamEmitterImpl : public MessageReceiver { +class StreamDecoder::StreamDecoderImpl : public MessageReceiver { private: enum State { SCHEMA, @@ -951,14 +951,14 @@ class RecordBatchStreamEmitter::RecordBatchStreamEmitterImpl : public MessageRec }; public: - explicit RecordBatchStreamEmitterImpl(std::shared_ptr receiver, + explicit StreamDecoderImpl(std::shared_ptr receiver, const IpcReadOptions& options) : MessageReceiver(), receiver_(std::move(receiver)), options_(options), state_(State::SCHEMA), - message_emitter_( - std::shared_ptr(this, [](void*) {}), + message_decoder_( + std::shared_ptr(this, [](void*) {}), options_.memory_pool), field_inclusion_mask_(), n_required_dictionaries_(0), @@ -979,7 +979,7 @@ class RecordBatchStreamEmitter::RecordBatchStreamEmitterImpl : public MessageRec case State::EOS: break; } - if (message_emitter_.state() == MessageEmitter::State::EOS) { + if (message_decoder_.state() == MessageDecoder::State::EOS) { state_ = State::EOS; ARROW_RETURN_NOT_OK(receiver_->EosReceived()); } @@ -987,16 +987,16 @@ class RecordBatchStreamEmitter::RecordBatchStreamEmitterImpl : public MessageRec } Status Consume(const uint8_t* data, int64_t size) { - return message_emitter_.Consume(data, size); + return message_decoder_.Consume(data, size); } Status Consume(std::shared_ptr buffer) { - return message_emitter_.Consume(std::move(buffer)); + return message_decoder_.Consume(std::move(buffer)); } std::shared_ptr schema() const { return schema_; } - int64_t next_required_size() const { return message_emitter_.next_required_size(); } + int64_t next_required_size() const { return message_decoder_.next_required_size(); } private: Status SchemaMessageReceived(std::unique_ptr message) { @@ -1044,32 +1044,32 @@ class RecordBatchStreamEmitter::RecordBatchStreamEmitterImpl : public MessageRec std::shared_ptr receiver_; IpcReadOptions options_; State state_; - MessageEmitter message_emitter_; + MessageDecoder message_decoder_; std::vector field_inclusion_mask_; int n_required_dictionaries_; DictionaryMemo dictionary_memo_; std::shared_ptr schema_; }; -RecordBatchStreamEmitter::RecordBatchStreamEmitter(std::shared_ptr receiver, +StreamDecoder::StreamDecoder(std::shared_ptr receiver, const IpcReadOptions& options) { - impl_.reset(new RecordBatchStreamEmitterImpl(std::move(receiver), options)); + impl_.reset(new StreamDecoderImpl(std::move(receiver), options)); } -RecordBatchStreamEmitter::~RecordBatchStreamEmitter() {} +StreamDecoder::~StreamDecoder() {} -Status RecordBatchStreamEmitter::Consume(const uint8_t* data, int64_t size) { +Status StreamDecoder::Consume(const uint8_t* data, int64_t size) { return impl_->Consume(data, size); } -Status RecordBatchStreamEmitter::Consume(std::shared_ptr buffer) { +Status StreamDecoder::Consume(std::shared_ptr buffer) { return impl_->Consume(std::move(buffer)); } -std::shared_ptr RecordBatchStreamEmitter::schema() const { +std::shared_ptr StreamDecoder::schema() const { return impl_->schema(); } -int64_t RecordBatchStreamEmitter::next_required_size() const { +int64_t StreamDecoder::next_required_size() const { return impl_->next_required_size(); } diff --git a/cpp/src/arrow/ipc/reader.h b/cpp/src/arrow/ipc/reader.h index 17a61db171b..663a7a036e0 100644 --- a/cpp/src/arrow/ipc/reader.h +++ b/cpp/src/arrow/ipc/reader.h @@ -227,29 +227,29 @@ class ARROW_EXPORT RecordBatchReceiverCollect : public Receiver { std::vector> record_batches_; }; -/// \class RecordBatchStreamEmitter -/// \brief Push style record batch stream reader that receives data -/// from user. +/// \class StreamDecoder +/// \brief Push style stream decoder that receives data from user. /// -/// This class reads the schema (plus any dictionaries) as the first -/// messages in the fed data, followed by record batches. -class ARROW_EXPORT RecordBatchStreamEmitter { +/// This class decodes the Apache Arrow IPC streaming format data. +/// +/// \see https://arrow.apache.org/docs/format/Columnar.html#ipc-streaming-format +class ARROW_EXPORT StreamDecoder { public: - /// \brief Construct a record batch emitter. + /// \brief Construct a stream decoder. /// /// \param[in] receiver a Receiver that must implement - /// Receiver::MessageReceived to receive read record batches + /// Receiver::RecordBatchReceived() to receive decoded record batches /// \param[in] options any IPC reading options (optional) - RecordBatchStreamEmitter(std::shared_ptr receiver, - const IpcReadOptions& options = IpcReadOptions::Defaults()); + StreamDecoder(std::shared_ptr receiver, + const IpcReadOptions& options = IpcReadOptions::Defaults()); - virtual ~RecordBatchStreamEmitter(); + virtual ~StreamDecoder(); - /// \brief Feed data to the emitter as a raw data. + /// \brief Feed data to the decoder as a raw data. /// - /// If the emitter can read one or more record batches by the data, - /// the emitter emits read record batches by calling - /// receiver->Receive() multiple times. + /// If the decoder can read one or more record batches by the data, + /// the decoder calls receiver->RecordBatchReceived() with a decoded + /// record batch multiple times. /// /// \param[in] data a raw data to be processed. This data isn't /// copied. The passed memory must be kept alive through record @@ -258,11 +258,11 @@ class ARROW_EXPORT RecordBatchStreamEmitter { /// \return Status Status Consume(const uint8_t* data, int64_t size); - /// \brief Feed data to the emitter as a Buffer. + /// \brief Feed data to the decoder as a Buffer. /// - /// If the emitter can read one or more record batches by the - /// Buffer, the emitter emits read record batches by calling - /// receiver->Receive() multiple times. + /// If the decoder can read one or more record batches by the + /// Buffer, the decoder calls receiver->RecordBatchReceived() with a + /// decoded record batch multiple times. /// /// \param[in] buffer a Buffer to be processed. /// \return Status @@ -272,7 +272,7 @@ class ARROW_EXPORT RecordBatchStreamEmitter { std::shared_ptr schema() const; /// \brief Return the number of bytes needed to advance the state of - /// the emitter. + /// the decoder. /// /// This method is provided for users who want to optimize performance. /// Normal users don't need to use this method. @@ -280,31 +280,31 @@ class ARROW_EXPORT RecordBatchStreamEmitter { /// Here is an example usage for normal users: /// /// ~~~{.cpp} - /// emitter.Consume(buffer1); - /// emitter.Consume(buffer2); - /// emitter.Consume(buffer3); + /// decoder.Consume(buffer1); + /// decoder.Consume(buffer2); + /// decoder.Consume(buffer3); /// ~~~ /// - /// Emitter has internal buffer. If consumed data isn't enough to - /// advance the state of the emitter, consumed data is buffered to + /// Decoder has internal buffer. If consumed data isn't enough to + /// advance the state of the decoder, consumed data is buffered to /// the internal buffer. It causes performance overhead. /// /// If you pass next_required_size() size data to each Consume() - /// call, the emitter doesn't use its internal buffer. It improves + /// call, the decoder doesn't use its internal buffer. It improves /// performance. /// /// Here is an example usage to avoid using internal buffer: /// /// ~~~{.cpp} - /// buffer1 = get_data(emitter.next_required_size()); - /// emitter.Consume(buffer1); - /// buffer2 = get_data(emitter.next_required_size()); - /// emitter.Consume(buffer2); + /// buffer1 = get_data(decoder.next_required_size()); + /// decoder.Consume(buffer1); + /// buffer2 = get_data(decoder.next_required_size()); + /// decoder.Consume(buffer2); /// ~~~ /// /// Users can use this method to avoid creating small chunks. Record /// batch data must be contiguous data. If users pass small chunks - /// to the emitter, the emitter needs concatenate small chunks + /// to the decoder, the decoder needs concatenate small chunks /// internally. It causes performance overhead. /// /// Here is an example usage to reduce small chunks: @@ -317,28 +317,28 @@ class ARROW_EXPORT RecordBatchStreamEmitter { /// memcpy(buffer->mutable_data() + current_buffer_size, /// small_chunk, /// small_chunk_size); - /// if (buffer->size() < emitter.next_requied_size()) { + /// if (buffer->size() < decoder.next_requied_size()) { /// continue; /// } /// std::shared_ptr chunk(buffer.release()); - /// emitter.Consume(chunk); + /// decoder.Consume(chunk); /// buffer = AllocateResizableBuffer(); /// } /// if (buffer->size() > 0) { /// std::shared_ptr chunk(buffer.release()); - /// emitter.Consume(chunk); + /// decoder.Consume(chunk); /// } /// ~~~ /// /// \return the number of bytes needed to advance the state of the - /// emitter + /// decoder int64_t next_required_size() const; private: - class RecordBatchStreamEmitterImpl; - std::unique_ptr impl_; + class StreamDecoderImpl; + std::unique_ptr impl_; - ARROW_DISALLOW_COPY_AND_ASSIGN(RecordBatchStreamEmitter); + ARROW_DISALLOW_COPY_AND_ASSIGN(StreamDecoder); }; // Generic read functions; does not copy data if the input supports zero copy reads From 230cd0be890115b52e53ba42ff3b2dd5164387e4 Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Fri, 10 Apr 2020 06:23:15 +0900 Subject: [PATCH 04/12] Move Receiver from arrow/util/ to arrow/ipc/ --- cpp/src/arrow/CMakeLists.txt | 1 - cpp/src/arrow/ipc/read_write_benchmark.cc | 2 +- cpp/src/arrow/ipc/reader.cc | 8 +++ cpp/src/arrow/ipc/reader.h | 40 ++++++++++++- cpp/src/arrow/util/receiver.cc | 30 ---------- cpp/src/arrow/util/receiver.h | 69 ----------------------- 6 files changed, 48 insertions(+), 102 deletions(-) delete mode 100644 cpp/src/arrow/util/receiver.cc delete mode 100644 cpp/src/arrow/util/receiver.h diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt index 19e28e275e9..3454cd0c87d 100644 --- a/cpp/src/arrow/CMakeLists.txt +++ b/cpp/src/arrow/CMakeLists.txt @@ -180,7 +180,6 @@ set(ARROW_SRCS util/key_value_metadata.cc util/memory.cc util/parsing.cc - util/receiver.cc util/string.cc util/string_builder.cc util/task_group.cc diff --git a/cpp/src/arrow/ipc/read_write_benchmark.cc b/cpp/src/arrow/ipc/read_write_benchmark.cc index 11c1253089b..f694591570a 100644 --- a/cpp/src/arrow/ipc/read_write_benchmark.cc +++ b/cpp/src/arrow/ipc/read_write_benchmark.cc @@ -141,7 +141,7 @@ static void EmitStream(benchmark::State& state) { // NOLINT non-const reference ipc::DictionaryMemo empty_memo; while (state.KeepRunning()) { - class RecordBatchReceiverNull : public Receiver { + class RecordBatchReceiverNull : public ipc::Receiver { Status RecordBatchReceived(std::shared_ptr batch) override { return Status::OK(); } diff --git a/cpp/src/arrow/ipc/reader.cc b/cpp/src/arrow/ipc/reader.cc index c0231c0a8da..290e26e8947 100644 --- a/cpp/src/arrow/ipc/reader.cc +++ b/cpp/src/arrow/ipc/reader.cc @@ -941,6 +941,14 @@ Result> RecordBatchFileReader::Open( return result; } +Status Receiver::EosReceived() { return Status::OK(); } + +Status Receiver::SchemaReceived(std::shared_ptr schema) { return Status::OK(); } + +Status Receiver::RecordBatchReceived(std::shared_ptr record_batch) { + return Status::NotImplemented("RecordBatch receiver isn't implemented"); +} + class StreamDecoder::StreamDecoderImpl : public MessageReceiver { private: enum State { diff --git a/cpp/src/arrow/ipc/reader.h b/cpp/src/arrow/ipc/reader.h index 663a7a036e0..232408309fd 100644 --- a/cpp/src/arrow/ipc/reader.h +++ b/cpp/src/arrow/ipc/reader.h @@ -30,7 +30,6 @@ #include "arrow/record_batch.h" #include "arrow/result.h" #include "arrow/util/macros.h" -#include "arrow/util/receiver.h" #include "arrow/util/visibility.h" namespace arrow { @@ -197,6 +196,45 @@ class ARROW_EXPORT RecordBatchFileReader { } }; +/// \class Receiver +/// \brief A general receiver class to receive objects. +/// +/// You must implement receiver methods for objects you want to receive. +class ARROW_EXPORT Receiver { + public: + virtual ~Receiver() = default; + + /// \brief Called when end-of-stream is received. + /// + /// The default implementation just returns arrow::Status::OK(). + /// + /// \return Status + /// + /// \see RecordBatchStreamEmitter + virtual Status EosReceived(); + + /// \brief Called when a record batch is received. + /// + /// The default implementation just returns + /// arrow::Status::NotImplemented(). + /// + /// \param[in] record_batch a record batch received + /// \return Status + /// + /// \see RecordBatchStreamEmitter + virtual Status RecordBatchReceived(std::shared_ptr record_batch); + + /// \brief Called when a schema is received. + /// + /// The default implementation just returns arrow::Status::OK(). + /// + /// \param[in] schema a schema received + /// \return Status + /// + /// \see RecordBatchStreamEmitter + virtual Status SchemaReceived(std::shared_ptr schema); +}; + /// \class RecordBatchReceiverCollect /// \brief Collect record batches read by RecordBatchStreamEmitter. class ARROW_EXPORT RecordBatchReceiverCollect : public Receiver { diff --git a/cpp/src/arrow/util/receiver.cc b/cpp/src/arrow/util/receiver.cc deleted file mode 100644 index 539bf90e070..00000000000 --- a/cpp/src/arrow/util/receiver.cc +++ /dev/null @@ -1,30 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#include "arrow/util/receiver.h" - -namespace arrow { - -Status Receiver::EosReceived() { return Status::OK(); } - -Status Receiver::SchemaReceived(std::shared_ptr schema) { return Status::OK(); } - -Status Receiver::RecordBatchReceived(std::shared_ptr record_batch) { - return Status::NotImplemented("RecordBatch receiver isn't implemented"); -} - -} // namespace arrow diff --git a/cpp/src/arrow/util/receiver.h b/cpp/src/arrow/util/receiver.h deleted file mode 100644 index f577ff5822d..00000000000 --- a/cpp/src/arrow/util/receiver.h +++ /dev/null @@ -1,69 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#pragma once - -#include - -#include "arrow/status.h" -#include "arrow/util/visibility.h" - -namespace arrow { - -class RecordBatch; -class Schema; - -/// \class Receiver -/// \brief A general receiver class to receive objects. -/// -/// You must implement receiver methods for objects you want to receive. -class ARROW_EXPORT Receiver { - public: - virtual ~Receiver() = default; - - /// \brief Called when end-of-stream is received. - /// - /// The default implementation just returns arrow::Status::OK(). - /// - /// \return Status - /// - /// \see RecordBatchStreamEmitter - virtual Status EosReceived(); - - /// \brief Called when a record batch is received. - /// - /// The default implementation just returns - /// arrow::Status::NotImplemented(). - /// - /// \param[in] record_batch a record batch received - /// \return Status - /// - /// \see RecordBatchStreamEmitter - virtual Status RecordBatchReceived(std::shared_ptr record_batch); - - /// \brief Called when a schema is received. - /// - /// The default implementation just returns arrow::Status::OK(). - /// - /// \param[in] schema a schema received - /// \return Status - /// - /// \see RecordBatchStreamEmitter - virtual Status SchemaReceived(std::shared_ptr schema); -}; - -} // namespace arrow From cc6da5614f333457cf5380347924a637360a11d1 Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Fri, 10 Apr 2020 06:26:16 +0900 Subject: [PATCH 05/12] Mark new APIs as experimental --- cpp/src/arrow/ipc/message.h | 12 ++++++++++++ cpp/src/arrow/ipc/reader.h | 12 ++++++++++++ 2 files changed, 24 insertions(+) diff --git a/cpp/src/arrow/ipc/message.h b/cpp/src/arrow/ipc/message.h index 8d4a0c1e11c..7bfe16931f0 100644 --- a/cpp/src/arrow/ipc/message.h +++ b/cpp/src/arrow/ipc/message.h @@ -183,6 +183,10 @@ ARROW_EXPORT std::string FormatMessageType(Message::Type type); /// \class MessageReceiver /// \brief An abstract class to receive read messages from /// MessageEmitter. +/// +/// This API is EXPERIMENTAL. +/// +/// \since 0.17.0 class ARROW_EXPORT MessageReceiver { public: virtual ~MessageReceiver() = default; @@ -200,6 +204,10 @@ class ARROW_EXPORT MessageReceiver { /// \class MessageReceiverAssign /// \brief Assign a message read by MessageEmitter. +/// +/// This API is EXPERIMENTAL. +/// +/// \since 0.17.0 class ARROW_EXPORT MessageReceiverAssign : public MessageReceiver { public: /// \brief Construct a message receiver that assign a read message @@ -223,6 +231,10 @@ class ARROW_EXPORT MessageReceiverAssign : public MessageReceiver { /// \class MessageDecoder /// \brief Push style message decoder that receives data from user. +/// +/// This API is EXPERIMENTAL. +/// +/// \since 0.17.0 class ARROW_EXPORT MessageDecoder { public: /// \brief State for reading a message diff --git a/cpp/src/arrow/ipc/reader.h b/cpp/src/arrow/ipc/reader.h index 232408309fd..f31476da7e2 100644 --- a/cpp/src/arrow/ipc/reader.h +++ b/cpp/src/arrow/ipc/reader.h @@ -200,6 +200,10 @@ class ARROW_EXPORT RecordBatchFileReader { /// \brief A general receiver class to receive objects. /// /// You must implement receiver methods for objects you want to receive. +/// +/// This API is EXPERIMENTAL. +/// +/// \since 0.17.0 class ARROW_EXPORT Receiver { public: virtual ~Receiver() = default; @@ -237,6 +241,10 @@ class ARROW_EXPORT Receiver { /// \class RecordBatchReceiverCollect /// \brief Collect record batches read by RecordBatchStreamEmitter. +/// +/// This API is EXPERIMENTAL. +/// +/// \since 0.17.0 class ARROW_EXPORT RecordBatchReceiverCollect : public Receiver { public: RecordBatchReceiverCollect() : schema_(), record_batches_() {} @@ -270,7 +278,11 @@ class ARROW_EXPORT RecordBatchReceiverCollect : public Receiver { /// /// This class decodes the Apache Arrow IPC streaming format data. /// +/// This API is EXPERIMENTAL. +/// /// \see https://arrow.apache.org/docs/format/Columnar.html#ipc-streaming-format +/// +/// \since 0.17.0 class ARROW_EXPORT StreamDecoder { public: /// \brief Construct a stream decoder. From a9629a2e29a1a629e27ee2134f7f3f971d882d84 Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Fri, 10 Apr 2020 06:50:31 +0900 Subject: [PATCH 06/12] Use decode --- cpp/src/arrow/ipc/read_write_benchmark.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/ipc/read_write_benchmark.cc b/cpp/src/arrow/ipc/read_write_benchmark.cc index f694591570a..c94efc242b0 100644 --- a/cpp/src/arrow/ipc/read_write_benchmark.cc +++ b/cpp/src/arrow/ipc/read_write_benchmark.cc @@ -123,7 +123,7 @@ static void ReadStream(benchmark::State& state) { // NOLINT non-const reference state.SetBytesProcessed(int64_t(state.iterations()) * kTotalSize); } -static void EmitStream(benchmark::State& state) { // NOLINT non-const reference +static void DecodeStream(benchmark::State& state) { // NOLINT non-const reference // 1MB constexpr int64_t kTotalSize = 1 << 20; auto options = ipc::IpcWriteOptions::Defaults(); @@ -157,6 +157,6 @@ static void EmitStream(benchmark::State& state) { // NOLINT non-const reference BENCHMARK(WriteRecordBatch)->RangeMultiplier(4)->Range(1, 1 << 13)->UseRealTime(); BENCHMARK(ReadRecordBatch)->RangeMultiplier(4)->Range(1, 1 << 13)->UseRealTime(); BENCHMARK(ReadStream)->RangeMultiplier(4)->Range(1, 1 << 13)->UseRealTime(); -BENCHMARK(EmitStream)->RangeMultiplier(4)->Range(1, 1 << 13)->UseRealTime(); +BENCHMARK(DecodeStream)->RangeMultiplier(4)->Range(1, 1 << 13)->UseRealTime(); } // namespace arrow From b4fd9cc14120e76aaeeb98475748700cbc7a3f42 Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Fri, 10 Apr 2020 06:58:08 +0900 Subject: [PATCH 07/12] Rename MessageReceiver to MessageDecoderListener --- cpp/src/arrow/ipc/message.cc | 65 +++++++++++++++++------- cpp/src/arrow/ipc/message.h | 96 +++++++++++++++++++++++++----------- cpp/src/arrow/ipc/reader.cc | 15 +++--- 3 files changed, 124 insertions(+), 52 deletions(-) diff --git a/cpp/src/arrow/ipc/message.cc b/cpp/src/arrow/ipc/message.cc index 5f73522897b..5109d3770f4 100644 --- a/cpp/src/arrow/ipc/message.cc +++ b/cpp/src/arrow/ipc/message.cc @@ -185,8 +185,8 @@ Status CheckMetadataAndGetBodyLength(const Buffer& metadata, int64_t* body_lengt Result> Message::ReadFrom(std::shared_ptr metadata, io::InputStream* stream) { std::unique_ptr result; - auto receiver = std::make_shared(&result); - MessageDecoder decoder(receiver, MessageDecoder::State::METADATA, metadata->size()); + auto listener = std::make_shared(&result); + MessageDecoder decoder(listener, MessageDecoder::State::METADATA, metadata->size()); ARROW_RETURN_NOT_OK(decoder.Consume(metadata)); ARROW_ASSIGN_OR_RAISE(auto body, stream->Read(decoder.next_required_size())); @@ -202,8 +202,8 @@ Result> Message::ReadFrom(const int64_t offset, std::shared_ptr metadata, io::RandomAccessFile* file) { std::unique_ptr result; - auto receiver = std::make_shared(&result); - MessageDecoder decoder(receiver, MessageDecoder::State::METADATA, metadata->size()); + auto listener = std::make_shared(&result); + MessageDecoder decoder(listener, MessageDecoder::State::METADATA, metadata->size()); ARROW_RETURN_NOT_OK(decoder.Consume(metadata)); ARROW_ASSIGN_OR_RAISE(auto body, file->ReadAt(offset, decoder.next_required_size())); @@ -267,8 +267,8 @@ std::string FormatMessageType(Message::Type type) { Result> ReadMessage(int64_t offset, int32_t metadata_length, io::RandomAccessFile* file) { std::unique_ptr result; - auto receiver = std::make_shared(&result); - MessageDecoder decoder(receiver); + auto listener = std::make_shared(&result); + MessageDecoder decoder(listener); if (metadata_length < decoder.next_required_size()) { return Status::Invalid("metadata_length should be at least ", @@ -336,8 +336,8 @@ Status CheckAligned(io::FileInterface* stream, int32_t alignment) { Result> ReadMessage(io::InputStream* file, MemoryPool* pool) { std::unique_ptr message; - auto receiver = std::make_shared(&message); - MessageDecoder decoder(receiver, pool); + auto listener = std::make_shared(&message); + MessageDecoder decoder(listener, pool); { uint8_t continuation[sizeof(int32_t)]; @@ -425,21 +425,45 @@ Status WriteMessage(const Buffer& message, const IpcWriteOptions& options, // ---------------------------------------------------------------------- // Implement MessageDecoder +Status MessageDecoderListener::OnInitial() {return Status::OK();} +Status MessageDecoderListener::OnMetadataLength() {return Status::OK();} +Status MessageDecoderListener::OnMetadata() {return Status::OK();} +Status MessageDecoderListener::OnBody() {return Status::OK();} +Status MessageDecoderListener::OnEOS() {return Status::OK();} + static constexpr auto kMessageDecoderNextRequiredSizeInitial = sizeof(int32_t); static constexpr auto kMessageDecoderNextRequiredSizeMetadataLength = sizeof(int32_t); class MessageDecoder::MessageDecoderImpl { public: - explicit MessageDecoderImpl(std::shared_ptr receiver, + explicit MessageDecoderImpl(std::shared_ptr listener, State initial_state, int64_t initial_next_required_size, MemoryPool* pool) - : receiver_(std::move(receiver)), + : listener_(std::move(listener)), pool_(pool), state_(initial_state), next_required_size_(initial_next_required_size), chunks_(), buffered_size_(0), - metadata_(nullptr) {} + metadata_(nullptr) { + switch (state_) { + case State::INITIAL: + listener_->OnInitial(); + break; + case State::METADATA_LENGTH: + listener_->OnMetadataLength(); + break; + case State::METADATA: + listener_->OnMetadata(); + break; + case State::BODY: + listener_->OnBody(); + break; + case State::EOS: + listener_->OnEOS(); + break; + } + } Status ConsumeData(const uint8_t* data, int64_t size) { if (buffered_size_ == 0) { @@ -574,17 +598,20 @@ class MessageDecoder::MessageDecoderImpl { if (continuation == internal::kIpcContinuationToken) { state_ = State::METADATA_LENGTH; next_required_size_ = kMessageDecoderNextRequiredSizeMetadataLength; + RETURN_NOT_OK(listener_->OnMetadataLength()); // Valid IPC message, read the message length now return Status::OK(); } else if (continuation == 0) { state_ = State::EOS; next_required_size_ = 0; + RETURN_NOT_OK(listener_->OnEOS()); return Status::OK(); } else { state_ = State::METADATA; // ARROW-6314: Backwards compatibility for reading old IPC // messages produced prior to version 0.15.0 next_required_size_ = continuation; + RETURN_NOT_OK(listener_->OnMetadata()); return Status::OK(); } } @@ -608,10 +635,12 @@ class MessageDecoder::MessageDecoderImpl { if (metadata_length == 0) { state_ = State::EOS; next_required_size_ = 0; + RETURN_NOT_OK(listener_->OnEOS()); return Status::OK(); } else { state_ = State::METADATA; next_required_size_ = metadata_length; + RETURN_NOT_OK(listener_->OnMetadata()); return Status::OK(); } } @@ -661,6 +690,7 @@ class MessageDecoder::MessageDecoderImpl { state_ = State::BODY; next_required_size_ = body_length; + RETURN_NOT_OK(listener_->OnBody()); if (next_required_size_ == 0) { ARROW_ASSIGN_OR_RAISE(auto body, AllocateBuffer(0, pool_)); std::shared_ptr shared_body(body.release()); @@ -699,9 +729,10 @@ class MessageDecoder::MessageDecoderImpl { ARROW_ASSIGN_OR_RAISE(std::unique_ptr message, Message::Open(metadata_, *buffer)); + RETURN_NOT_OK(listener_->OnMessageDecoded(std::move(message))); state_ = State::INITIAL; next_required_size_ = kMessageDecoderNextRequiredSizeInitial; - RETURN_NOT_OK(receiver_->Received(std::move(message))); + RETURN_NOT_OK(listener_->OnInitial()); return Status::OK(); } @@ -747,7 +778,7 @@ class MessageDecoder::MessageDecoderImpl { return Status::OK(); } - std::shared_ptr receiver_; + std::shared_ptr listener_; MemoryPool* pool_; State state_; int64_t next_required_size_; @@ -756,16 +787,16 @@ class MessageDecoder::MessageDecoderImpl { std::shared_ptr metadata_; // Must be CPU buffer }; -MessageDecoder::MessageDecoder(std::shared_ptr receiver, +MessageDecoder::MessageDecoder(std::shared_ptr listener, MemoryPool* pool) { - impl_.reset(new MessageDecoderImpl(std::move(receiver), State::INITIAL, + impl_.reset(new MessageDecoderImpl(std::move(listener), State::INITIAL, kMessageDecoderNextRequiredSizeInitial, pool)); } -MessageDecoder::MessageDecoder(std::shared_ptr receiver, +MessageDecoder::MessageDecoder(std::shared_ptr listener, State initial_state, int64_t initial_next_required_size, MemoryPool* pool) { - impl_.reset(new MessageDecoderImpl(std::move(receiver), initial_state, + impl_.reset(new MessageDecoderImpl(std::move(listener), initial_state, initial_next_required_size, pool)); } diff --git a/cpp/src/arrow/ipc/message.h b/cpp/src/arrow/ipc/message.h index 7bfe16931f0..dc71262abc7 100644 --- a/cpp/src/arrow/ipc/message.h +++ b/cpp/src/arrow/ipc/message.h @@ -180,45 +180,74 @@ class ARROW_EXPORT Message { ARROW_EXPORT std::string FormatMessageType(Message::Type type); -/// \class MessageReceiver -/// \brief An abstract class to receive read messages from -/// MessageEmitter. +/// \class MessageDecoderListener +/// \brief An abstract class to listen events from MessageDecoder. /// /// This API is EXPERIMENTAL. /// /// \since 0.17.0 -class ARROW_EXPORT MessageReceiver { +class ARROW_EXPORT MessageDecoderListener { public: - virtual ~MessageReceiver() = default; + virtual ~MessageDecoderListener() = default; - /// \brief Receive a message. + /// \brief Called when a message is decoded. /// - /// MessageEmitter calls this method when it read a message. This - /// method is called multiple times when the target stream stream - /// has multiple messages. + /// MessageDecoder calls this method when it decodes a message. This + /// method is called multiple times when the target stream has + /// multiple messages. /// - /// \param[in] message a read message + /// \param[in] message a decoded message /// \return Status - virtual Status Received(std::unique_ptr message) = 0; + virtual Status OnMessageDecoded(std::unique_ptr message) = 0; + + /// \brief Called when the decoder state is changed to + /// MessageDecoder::State::INITIAL. + /// + /// \return Status + virtual Status OnInitial(); + + /// \brief Called when the decoder state is changed to + /// MessageDecoder::State::METADATA_LENGTH. + /// + /// \return Status + virtual Status OnMetadataLength(); + + /// \brief Called when the decoder state is changed to + /// MessageDecoder::State::METADATA. + /// + /// \return Status + virtual Status OnMetadata(); + + /// \brief Called when the decoder state is changed to + /// MessageDecoder::State::BODY. + /// + /// \return Status + virtual Status OnBody(); + + /// \brief Called when the decoder state is changed to + /// MessageDecoder::State::EOS. + /// + /// \return Status + virtual Status OnEOS(); }; -/// \class MessageReceiverAssign -/// \brief Assign a message read by MessageEmitter. +/// \class MessageDecoderListenerAssign +/// \brief Assign a message decoded by MessageDecoder. /// /// This API is EXPERIMENTAL. /// /// \since 0.17.0 -class ARROW_EXPORT MessageReceiverAssign : public MessageReceiver { +class ARROW_EXPORT MessageDecoderListenerAssign : public MessageDecoderListener { public: - /// \brief Construct a message receiver that assign a read message - /// to the specified location. + /// \brief Construct a listener that assign a decoded message to the + /// specified location. /// /// \param[in] message a location to store the received message - explicit MessageReceiverAssign(std::unique_ptr* message) : message_(message) {} + explicit MessageDecoderListenerAssign(std::unique_ptr* message) : message_(message) {} - virtual ~MessageReceiverAssign() = default; + virtual ~MessageDecoderListenerAssign() = default; - Status Received(std::unique_ptr message) override { + Status OnMessageDecoded(std::unique_ptr message) override { *message_ = std::move(message); return Status::OK(); } @@ -226,7 +255,7 @@ class ARROW_EXPORT MessageReceiverAssign : public MessageReceiver { private: std::unique_ptr* message_; - ARROW_DISALLOW_COPY_AND_ASSIGN(MessageReceiverAssign); + ARROW_DISALLOW_COPY_AND_ASSIGN(MessageDecoderListenerAssign); }; /// \class MessageDecoder @@ -262,10 +291,11 @@ class ARROW_EXPORT MessageDecoder { /// \brief Construct a message decoder. /// - /// \param[in] receiver a MessageReceiver that receives decoded messages + /// \param[in] listener a MessageDecoderListener that responds events from + /// the decoder /// \param[in] pool an optional MemoryPool to copy metadata on the /// CPU, if required - explicit MessageDecoder(std::shared_ptr receiver, + explicit MessageDecoder(std::shared_ptr listener, MemoryPool* pool = default_memory_pool()); /// \brief Construct a message decoder with the specified state. @@ -273,13 +303,14 @@ class ARROW_EXPORT MessageDecoder { /// This is a construct for advanced users that know how to decode /// Message. /// - /// \param[in] receiver a MessageReceiver that receives decoded messages + /// \param[in] listener a MessageDecoderListener that responds events from + /// the decoder /// \param[in] initial_state an initial state of the decode /// \param[in] initial_next_required_size the number of bytes needed /// to run the next action /// \param[in] pool an optional MemoryPool to copy metadata on the /// CPU, if required - MessageDecoder(std::shared_ptr receiver, State initial_state, + MessageDecoder(std::shared_ptr listener, State initial_state, int64_t initial_next_required_size, MemoryPool* pool = default_memory_pool()); @@ -288,8 +319,17 @@ class ARROW_EXPORT MessageDecoder { /// \brief Feed data to the decoder as a raw data. /// /// If the decoder can decode one or more messages by the data, the - /// decoder calls receiver->Receive() with a decoded message - /// multiple times. + /// decoder calls listener->OnMessageDecoded() with a decoded + /// message multiple times. + /// + /// If the state of the decoder is changed, corresponding callbacks + /// on listener is called: + /// + /// * MessageDecoder::State::INITIAL: listener->OnInitial() + /// * MessageDecoder::State::METADATA_LENGTH: listener->OnMetadataLength() + /// * MessageDecoder::State::METADATA: listener->OnMetadata() + /// * MessageDecoder::State::BODY: listener->OnBody() + /// * MessageDecoder::State::EOS: listener->OnEOS() /// /// \param[in] data a raw data to be processed. This data isn't /// copied. The passed memory must be kept alive through message @@ -301,8 +341,8 @@ class ARROW_EXPORT MessageDecoder { /// \brief Feed data to the decoder as a Buffer. /// /// If the decoder can decode one or more messages by the Buffer, - /// the decoder calls receiver->Receive() with a decoded message - /// multiple times. + /// the decoder calls listener->OnMessageDecoded() with a decoded + /// message multiple times. /// /// \param[in] buffer a Buffer to be processed. /// \return Status diff --git a/cpp/src/arrow/ipc/reader.cc b/cpp/src/arrow/ipc/reader.cc index 290e26e8947..0a1343a1b9e 100644 --- a/cpp/src/arrow/ipc/reader.cc +++ b/cpp/src/arrow/ipc/reader.cc @@ -949,7 +949,7 @@ Status Receiver::RecordBatchReceived(std::shared_ptr record_batch) return Status::NotImplemented("RecordBatch receiver isn't implemented"); } -class StreamDecoder::StreamDecoderImpl : public MessageReceiver { +class StreamDecoder::StreamDecoderImpl : public MessageDecoderListener { private: enum State { SCHEMA, @@ -961,7 +961,7 @@ class StreamDecoder::StreamDecoderImpl : public MessageReceiver { public: explicit StreamDecoderImpl(std::shared_ptr receiver, const IpcReadOptions& options) - : MessageReceiver(), + : MessageDecoderListener(), receiver_(std::move(receiver)), options_(options), state_(State::SCHEMA), @@ -973,7 +973,7 @@ class StreamDecoder::StreamDecoderImpl : public MessageReceiver { dictionary_memo_(), schema_() {} - Status Received(std::unique_ptr message) override { + Status OnMessageDecoded(std::unique_ptr message) override { switch (state_) { case State::SCHEMA: ARROW_RETURN_NOT_OK(SchemaMessageReceived(std::move(message))); @@ -987,13 +987,14 @@ class StreamDecoder::StreamDecoderImpl : public MessageReceiver { case State::EOS: break; } - if (message_decoder_.state() == MessageDecoder::State::EOS) { - state_ = State::EOS; - ARROW_RETURN_NOT_OK(receiver_->EosReceived()); - } return Status::OK(); } + Status OnEOS() override { + state_ = State::EOS; + return receiver_->EosReceived(); + } + Status Consume(const uint8_t* data, int64_t size) { return message_decoder_.Consume(data, size); } From 9a04e969e64d2a2ed40e066c2734c85fc4a12525 Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Fri, 10 Apr 2020 08:22:24 +0900 Subject: [PATCH 08/12] Reuse decoder in MessageReader --- cpp/src/arrow/ipc/message.cc | 76 ++++++++++++++++++++++++------------ cpp/src/arrow/ipc/message.h | 13 ++++++ 2 files changed, 63 insertions(+), 26 deletions(-) diff --git a/cpp/src/arrow/ipc/message.cc b/cpp/src/arrow/ipc/message.cc index 5109d3770f4..d724e5b2e18 100644 --- a/cpp/src/arrow/ipc/message.cc +++ b/cpp/src/arrow/ipc/message.cc @@ -334,60 +334,70 @@ Status CheckAligned(io::FileInterface* stream, int32_t alignment) { } } -Result> ReadMessage(io::InputStream* file, MemoryPool* pool) { - std::unique_ptr message; - auto listener = std::make_shared(&message); - MessageDecoder decoder(listener, pool); - - { +Status DecodeMessage(MessageDecoder*decoder, io::InputStream* file) { + if (decoder->state() == MessageDecoder::State::INITIAL) { uint8_t continuation[sizeof(int32_t)]; ARROW_ASSIGN_OR_RAISE(int64_t bytes_read, file->Read(sizeof(int32_t), &continuation)); if (bytes_read == 0) { // EOS without indication - return nullptr; - } else if (bytes_read != decoder.next_required_size()) { + return Status::OK(); + } else if (bytes_read != decoder->next_required_size()) { return Status::Invalid("Corrupted message, only ", bytes_read, " bytes available"); } - ARROW_RETURN_NOT_OK(decoder.Consume(continuation, bytes_read)); + ARROW_RETURN_NOT_OK(decoder->Consume(continuation, bytes_read)); } - if (decoder.state() == MessageDecoder::State::METADATA_LENGTH) { + if (decoder->state() == MessageDecoder::State::METADATA_LENGTH) { // Valid IPC message, read the message length now uint8_t metadata_length[sizeof(int32_t)]; ARROW_ASSIGN_OR_RAISE(int64_t bytes_read, file->Read(sizeof(int32_t), &metadata_length)); - if (bytes_read != decoder.next_required_size()) { + if (bytes_read != decoder->next_required_size()) { return Status::Invalid("Corrupted metadata length, only ", bytes_read, " bytes available"); } - ARROW_RETURN_NOT_OK(decoder.Consume(metadata_length, bytes_read)); + ARROW_RETURN_NOT_OK(decoder->Consume(metadata_length, bytes_read)); } - if (decoder.state() == MessageDecoder::State::EOS) { - return nullptr; + if (decoder->state() == MessageDecoder::State::EOS) { + return Status::OK(); } - auto metadata_length = decoder.next_required_size(); + auto metadata_length = decoder->next_required_size(); ARROW_ASSIGN_OR_RAISE(auto metadata, file->Read(metadata_length)); if (metadata->size() != metadata_length) { return Status::Invalid("Expected to read ", metadata_length, " metadata bytes, but ", "only read ", metadata->size()); } - RETURN_NOT_OK(decoder.Consume(metadata)); + ARROW_RETURN_NOT_OK(decoder->Consume(metadata)); - if (decoder.state() == MessageDecoder::State::BODY) { - ARROW_ASSIGN_OR_RAISE(auto body, file->Read(decoder.next_required_size())); - if (body->size() < decoder.next_required_size()) { - return Status::IOError("Expected to be able to read ", decoder.next_required_size(), + if (decoder->state() == MessageDecoder::State::BODY) { + ARROW_ASSIGN_OR_RAISE(auto body, file->Read(decoder->next_required_size())); + if (body->size() < decoder->next_required_size()) { + return Status::IOError("Expected to be able to read ", decoder->next_required_size(), " bytes for message body, got ", body->size()); } - ARROW_RETURN_NOT_OK(decoder.Consume(body)); + ARROW_RETURN_NOT_OK(decoder->Consume(body)); } + if (decoder->state() == MessageDecoder::State::INITIAL || + decoder->state() == MessageDecoder::State::EOS) { + return Status::OK(); + } else { + return Status::Invalid("Failed to decode message"); + } +} + +Result> ReadMessage(io::InputStream* file, MemoryPool* pool) { + std::unique_ptr message; + auto listener = std::make_shared(&message); + MessageDecoder decoder(listener, pool); + ARROW_RETURN_NOT_OK(DecodeMessage(&decoder, file)); if (!message) { - return Status::Invalid("Failed to read message"); + return nullptr; + } else { + return std::move(message); } - return std::move(message); } Status WriteMessage(const Buffer& message, const IpcWriteOptions& options, @@ -818,9 +828,13 @@ MessageDecoder::State MessageDecoder::state() const { return impl_->state(); } // Implement InputStream message reader /// \brief Implementation of MessageReader that reads from InputStream -class InputStreamMessageReader : public MessageReader { +class InputStreamMessageReader : public MessageReader, public MessageDecoderListener { public: - explicit InputStreamMessageReader(io::InputStream* stream) : stream_(stream) {} + explicit InputStreamMessageReader(io::InputStream* stream) : + stream_(stream), + owned_stream_(), + message_(), + decoder_(std::shared_ptr(this, [](void*) {})) {} explicit InputStreamMessageReader(const std::shared_ptr& owned_stream) : InputStreamMessageReader(owned_stream.get()) { @@ -829,11 +843,21 @@ class InputStreamMessageReader : public MessageReader { ~InputStreamMessageReader() {} - Result> ReadNextMessage() { return ReadMessage(stream_); } + Status OnMessageDecoded(std::unique_ptr message) override { + message_ = std::move(message); + return Status::OK(); + } + + Result> ReadNextMessage() { + ARROW_RETURN_NOT_OK(DecodeMessage(&decoder_, stream_)); + return std::move(message_); + } private: io::InputStream* stream_; std::shared_ptr owned_stream_; + std::unique_ptr message_; + MessageDecoder decoder_; }; std::unique_ptr MessageReader::Open(io::InputStream* stream) { diff --git a/cpp/src/arrow/ipc/message.h b/cpp/src/arrow/ipc/message.h index dc71262abc7..f9bd7f1f8dc 100644 --- a/cpp/src/arrow/ipc/message.h +++ b/cpp/src/arrow/ipc/message.h @@ -528,6 +528,19 @@ ARROW_EXPORT Result> ReadMessage(io::InputStream* stream, MemoryPool* pool = default_memory_pool()); +/// \brief Feed data from InputStream to MessageDecoder to decode an +/// encapsulated IPC message (metadata and body) +/// +/// This API is EXPERIMENTAL. +/// +/// \param[in] decoder a decoder +/// \param[in] stream an input stream +/// \return Status +/// +/// \since 0.17.0 +ARROW_EXPORT +Status DecodeMessage(MessageDecoder* decoder, io::InputStream* stream); + /// Write encapsulated IPC message Does not make assumptions about /// whether the stream is aligned already. Can write legacy (pre /// version 0.15.0) IPC message if option set From a5d7320631c578a909f4c808ada335a58f18593f Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Fri, 10 Apr 2020 08:23:13 +0900 Subject: [PATCH 09/12] Format --- cpp/src/arrow/ipc/message.cc | 55 ++++++++++++----------- cpp/src/arrow/ipc/message.h | 3 +- cpp/src/arrow/ipc/read_write_benchmark.cc | 5 +-- cpp/src/arrow/ipc/reader.cc | 17 +++---- 4 files changed, 38 insertions(+), 42 deletions(-) diff --git a/cpp/src/arrow/ipc/message.cc b/cpp/src/arrow/ipc/message.cc index d724e5b2e18..6d2d32a7e65 100644 --- a/cpp/src/arrow/ipc/message.cc +++ b/cpp/src/arrow/ipc/message.cc @@ -334,7 +334,7 @@ Status CheckAligned(io::FileInterface* stream, int32_t alignment) { } } -Status DecodeMessage(MessageDecoder*decoder, io::InputStream* file) { +Status DecodeMessage(MessageDecoder* decoder, io::InputStream* file) { if (decoder->state() == MessageDecoder::State::INITIAL) { uint8_t continuation[sizeof(int32_t)]; ARROW_ASSIGN_OR_RAISE(int64_t bytes_read, file->Read(sizeof(int32_t), &continuation)); @@ -374,7 +374,8 @@ Status DecodeMessage(MessageDecoder*decoder, io::InputStream* file) { if (decoder->state() == MessageDecoder::State::BODY) { ARROW_ASSIGN_OR_RAISE(auto body, file->Read(decoder->next_required_size())); if (body->size() < decoder->next_required_size()) { - return Status::IOError("Expected to be able to read ", decoder->next_required_size(), + return Status::IOError("Expected to be able to read ", + decoder->next_required_size(), " bytes for message body, got ", body->size()); } ARROW_RETURN_NOT_OK(decoder->Consume(body)); @@ -435,11 +436,11 @@ Status WriteMessage(const Buffer& message, const IpcWriteOptions& options, // ---------------------------------------------------------------------- // Implement MessageDecoder -Status MessageDecoderListener::OnInitial() {return Status::OK();} -Status MessageDecoderListener::OnMetadataLength() {return Status::OK();} -Status MessageDecoderListener::OnMetadata() {return Status::OK();} -Status MessageDecoderListener::OnBody() {return Status::OK();} -Status MessageDecoderListener::OnEOS() {return Status::OK();} +Status MessageDecoderListener::OnInitial() { return Status::OK(); } +Status MessageDecoderListener::OnMetadataLength() { return Status::OK(); } +Status MessageDecoderListener::OnMetadata() { return Status::OK(); } +Status MessageDecoderListener::OnBody() { return Status::OK(); } +Status MessageDecoderListener::OnEOS() { return Status::OK(); } static constexpr auto kMessageDecoderNextRequiredSizeInitial = sizeof(int32_t); static constexpr auto kMessageDecoderNextRequiredSizeMetadataLength = sizeof(int32_t); @@ -457,21 +458,21 @@ class MessageDecoder::MessageDecoderImpl { buffered_size_(0), metadata_(nullptr) { switch (state_) { - case State::INITIAL: - listener_->OnInitial(); - break; - case State::METADATA_LENGTH: - listener_->OnMetadataLength(); - break; - case State::METADATA: - listener_->OnMetadata(); - break; - case State::BODY: - listener_->OnBody(); - break; - case State::EOS: - listener_->OnEOS(); - break; + case State::INITIAL: + listener_->OnInitial(); + break; + case State::METADATA_LENGTH: + listener_->OnMetadataLength(); + break; + case State::METADATA: + listener_->OnMetadata(); + break; + case State::BODY: + listener_->OnBody(); + break; + case State::EOS: + listener_->OnEOS(); + break; } } @@ -830,11 +831,11 @@ MessageDecoder::State MessageDecoder::state() const { return impl_->state(); } /// \brief Implementation of MessageReader that reads from InputStream class InputStreamMessageReader : public MessageReader, public MessageDecoderListener { public: - explicit InputStreamMessageReader(io::InputStream* stream) : - stream_(stream), - owned_stream_(), - message_(), - decoder_(std::shared_ptr(this, [](void*) {})) {} + explicit InputStreamMessageReader(io::InputStream* stream) + : stream_(stream), + owned_stream_(), + message_(), + decoder_(std::shared_ptr(this, [](void*) {})) {} explicit InputStreamMessageReader(const std::shared_ptr& owned_stream) : InputStreamMessageReader(owned_stream.get()) { diff --git a/cpp/src/arrow/ipc/message.h b/cpp/src/arrow/ipc/message.h index f9bd7f1f8dc..0fd5bf53468 100644 --- a/cpp/src/arrow/ipc/message.h +++ b/cpp/src/arrow/ipc/message.h @@ -243,7 +243,8 @@ class ARROW_EXPORT MessageDecoderListenerAssign : public MessageDecoderListener /// specified location. /// /// \param[in] message a location to store the received message - explicit MessageDecoderListenerAssign(std::unique_ptr* message) : message_(message) {} + explicit MessageDecoderListenerAssign(std::unique_ptr* message) + : message_(message) {} virtual ~MessageDecoderListenerAssign() = default; diff --git a/cpp/src/arrow/ipc/read_write_benchmark.cc b/cpp/src/arrow/ipc/read_write_benchmark.cc index c94efc242b0..cf7c9664f90 100644 --- a/cpp/src/arrow/ipc/read_write_benchmark.cc +++ b/cpp/src/arrow/ipc/read_write_benchmark.cc @@ -146,9 +146,8 @@ static void DecodeStream(benchmark::State& state) { // NOLINT non-const referen return Status::OK(); } } receiver; - ipc::StreamDecoder decoder( - std::shared_ptr(&receiver), - ipc::IpcReadOptions::Defaults()); + ipc::StreamDecoder decoder(std::shared_ptr(&receiver), + ipc::IpcReadOptions::Defaults()); ABORT_NOT_OK(decoder.Consume(buffer)); } state.SetBytesProcessed(int64_t(state.iterations()) * kTotalSize); diff --git a/cpp/src/arrow/ipc/reader.cc b/cpp/src/arrow/ipc/reader.cc index 0a1343a1b9e..a2d2f30685a 100644 --- a/cpp/src/arrow/ipc/reader.cc +++ b/cpp/src/arrow/ipc/reader.cc @@ -960,14 +960,13 @@ class StreamDecoder::StreamDecoderImpl : public MessageDecoderListener { public: explicit StreamDecoderImpl(std::shared_ptr receiver, - const IpcReadOptions& options) + const IpcReadOptions& options) : MessageDecoderListener(), receiver_(std::move(receiver)), options_(options), state_(State::SCHEMA), - message_decoder_( - std::shared_ptr(this, [](void*) {}), - options_.memory_pool), + message_decoder_(std::shared_ptr(this, [](void*) {}), + options_.memory_pool), field_inclusion_mask_(), n_required_dictionaries_(0), dictionary_memo_(), @@ -1061,7 +1060,7 @@ class StreamDecoder::StreamDecoderImpl : public MessageDecoderListener { }; StreamDecoder::StreamDecoder(std::shared_ptr receiver, - const IpcReadOptions& options) { + const IpcReadOptions& options) { impl_.reset(new StreamDecoderImpl(std::move(receiver), options)); } @@ -1074,13 +1073,9 @@ Status StreamDecoder::Consume(std::shared_ptr buffer) { return impl_->Consume(std::move(buffer)); } -std::shared_ptr StreamDecoder::schema() const { - return impl_->schema(); -} +std::shared_ptr StreamDecoder::schema() const { return impl_->schema(); } -int64_t StreamDecoder::next_required_size() const { - return impl_->next_required_size(); -} +int64_t StreamDecoder::next_required_size() const { return impl_->next_required_size(); } Result> ReadSchema(io::InputStream* stream, DictionaryMemo* dictionary_memo) { From 7fab0e3825bbd2b276cc87ef2cab14747f5a7152 Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Fri, 10 Apr 2020 09:20:07 +0900 Subject: [PATCH 10/12] Rename receiver to listener --- cpp/src/arrow/ipc/message.cc | 8 +-- cpp/src/arrow/ipc/message.h | 22 ++++++--- cpp/src/arrow/ipc/read_write_benchmark.cc | 8 +-- cpp/src/arrow/ipc/read_write_test.cc | 52 ++++++++++---------- cpp/src/arrow/ipc/reader.cc | 38 +++++++------- cpp/src/arrow/ipc/reader.h | 60 +++++++++++------------ 6 files changed, 99 insertions(+), 89 deletions(-) diff --git a/cpp/src/arrow/ipc/message.cc b/cpp/src/arrow/ipc/message.cc index 6d2d32a7e65..e0b47942fe2 100644 --- a/cpp/src/arrow/ipc/message.cc +++ b/cpp/src/arrow/ipc/message.cc @@ -185,7 +185,7 @@ Status CheckMetadataAndGetBodyLength(const Buffer& metadata, int64_t* body_lengt Result> Message::ReadFrom(std::shared_ptr metadata, io::InputStream* stream) { std::unique_ptr result; - auto listener = std::make_shared(&result); + auto listener = std::make_shared(&result); MessageDecoder decoder(listener, MessageDecoder::State::METADATA, metadata->size()); ARROW_RETURN_NOT_OK(decoder.Consume(metadata)); @@ -202,7 +202,7 @@ Result> Message::ReadFrom(const int64_t offset, std::shared_ptr metadata, io::RandomAccessFile* file) { std::unique_ptr result; - auto listener = std::make_shared(&result); + auto listener = std::make_shared(&result); MessageDecoder decoder(listener, MessageDecoder::State::METADATA, metadata->size()); ARROW_RETURN_NOT_OK(decoder.Consume(metadata)); @@ -267,7 +267,7 @@ std::string FormatMessageType(Message::Type type) { Result> ReadMessage(int64_t offset, int32_t metadata_length, io::RandomAccessFile* file) { std::unique_ptr result; - auto listener = std::make_shared(&result); + auto listener = std::make_shared(&result); MessageDecoder decoder(listener); if (metadata_length < decoder.next_required_size()) { @@ -391,7 +391,7 @@ Status DecodeMessage(MessageDecoder* decoder, io::InputStream* file) { Result> ReadMessage(io::InputStream* file, MemoryPool* pool) { std::unique_ptr message; - auto listener = std::make_shared(&message); + auto listener = std::make_shared(&message); MessageDecoder decoder(listener, pool); ARROW_RETURN_NOT_OK(DecodeMessage(&decoder, file)); if (!message) { diff --git a/cpp/src/arrow/ipc/message.h b/cpp/src/arrow/ipc/message.h index 0fd5bf53468..1b3fa8f861a 100644 --- a/cpp/src/arrow/ipc/message.h +++ b/cpp/src/arrow/ipc/message.h @@ -203,50 +203,60 @@ class ARROW_EXPORT MessageDecoderListener { /// \brief Called when the decoder state is changed to /// MessageDecoder::State::INITIAL. /// + /// The default implementation just returns arrow::Status::OK(). + /// /// \return Status virtual Status OnInitial(); /// \brief Called when the decoder state is changed to /// MessageDecoder::State::METADATA_LENGTH. /// + /// The default implementation just returns arrow::Status::OK(). + /// /// \return Status virtual Status OnMetadataLength(); /// \brief Called when the decoder state is changed to /// MessageDecoder::State::METADATA. /// + /// The default implementation just returns arrow::Status::OK(). + /// /// \return Status virtual Status OnMetadata(); /// \brief Called when the decoder state is changed to /// MessageDecoder::State::BODY. /// + /// The default implementation just returns arrow::Status::OK(). + /// /// \return Status virtual Status OnBody(); /// \brief Called when the decoder state is changed to /// MessageDecoder::State::EOS. /// + /// The default implementation just returns arrow::Status::OK(). + /// /// \return Status virtual Status OnEOS(); }; -/// \class MessageDecoderListenerAssign +/// \class AssignMessageDecoderListener /// \brief Assign a message decoded by MessageDecoder. /// /// This API is EXPERIMENTAL. /// /// \since 0.17.0 -class ARROW_EXPORT MessageDecoderListenerAssign : public MessageDecoderListener { +class ARROW_EXPORT AssignMessageDecoderListener : public MessageDecoderListener { public: - /// \brief Construct a listener that assign a decoded message to the + /// \brief Construct a listener that assigns a decoded message to the /// specified location. /// /// \param[in] message a location to store the received message - explicit MessageDecoderListenerAssign(std::unique_ptr* message) + explicit AssignMessageDecoderListener(std::unique_ptr* message) : message_(message) {} - virtual ~MessageDecoderListenerAssign() = default; + virtual ~AssignMessageDecoderListener() = default; Status OnMessageDecoded(std::unique_ptr message) override { *message_ = std::move(message); @@ -256,7 +266,7 @@ class ARROW_EXPORT MessageDecoderListenerAssign : public MessageDecoderListener private: std::unique_ptr* message_; - ARROW_DISALLOW_COPY_AND_ASSIGN(MessageDecoderListenerAssign); + ARROW_DISALLOW_COPY_AND_ASSIGN(AssignMessageDecoderListener); }; /// \class MessageDecoder diff --git a/cpp/src/arrow/ipc/read_write_benchmark.cc b/cpp/src/arrow/ipc/read_write_benchmark.cc index cf7c9664f90..1c4ee406fcf 100644 --- a/cpp/src/arrow/ipc/read_write_benchmark.cc +++ b/cpp/src/arrow/ipc/read_write_benchmark.cc @@ -141,12 +141,12 @@ static void DecodeStream(benchmark::State& state) { // NOLINT non-const referen ipc::DictionaryMemo empty_memo; while (state.KeepRunning()) { - class RecordBatchReceiverNull : public ipc::Receiver { - Status RecordBatchReceived(std::shared_ptr batch) override { + class NullListener : public ipc::Listener { + Status OnRecordBatchDecoded(std::shared_ptr batch) override { return Status::OK(); } - } receiver; - ipc::StreamDecoder decoder(std::shared_ptr(&receiver), + } listener; + ipc::StreamDecoder decoder(std::shared_ptr(&listener, [](void*) {}), ipc::IpcReadOptions::Defaults()); ABORT_NOT_OK(decoder.Consume(buffer)); } diff --git a/cpp/src/arrow/ipc/read_write_test.cc b/cpp/src/arrow/ipc/read_write_test.cc index cbe3ba81c5a..d30eed754ab 100644 --- a/cpp/src/arrow/ipc/read_write_test.cc +++ b/cpp/src/arrow/ipc/read_write_test.cc @@ -886,78 +886,78 @@ struct StreamWriterHelper { struct StreamDecoderDataWriterHelper : public StreamWriterHelper { Status ReadBatches(const IpcReadOptions& options, BatchVector* out_batches) { - auto receiver = std::make_shared(); - StreamDecoder decoder(receiver, options); + auto listener = std::make_shared(); + StreamDecoder decoder(listener, options); ARROW_RETURN_NOT_OK(decoder.Consume(buffer_->data(), buffer_->size())); - *out_batches = receiver->record_batches(); + *out_batches = listener->record_batches(); return Status::OK(); } Status ReadSchema(std::shared_ptr* out) { - auto receiver = std::make_shared(); - StreamDecoder decoder(receiver); + auto listener = std::make_shared(); + StreamDecoder decoder(listener); ARROW_RETURN_NOT_OK(decoder.Consume(buffer_->data(), buffer_->size())); - *out = receiver->schema(); + *out = listener->schema(); return Status::OK(); } }; struct StreamDecoderBufferWriterHelper : public StreamWriterHelper { Status ReadBatches(const IpcReadOptions& options, BatchVector* out_batches) { - auto receiver = std::make_shared(); - StreamDecoder decoder(receiver, options); + auto listener = std::make_shared(); + StreamDecoder decoder(listener, options); ARROW_RETURN_NOT_OK(decoder.Consume(buffer_)); - *out_batches = receiver->record_batches(); + *out_batches = listener->record_batches(); return Status::OK(); } Status ReadSchema(std::shared_ptr* out) { - auto receiver = std::make_shared(); - StreamDecoder decoder(receiver); + auto listener = std::make_shared(); + StreamDecoder decoder(listener); ARROW_RETURN_NOT_OK(decoder.Consume(buffer_)); - *out = receiver->schema(); + *out = listener->schema(); return Status::OK(); } }; struct StreamDecoderSmallChunksWriterHelper : public StreamWriterHelper { Status ReadBatches(const IpcReadOptions& options, BatchVector* out_batches) { - auto receiver = std::make_shared(); - StreamDecoder decoder(receiver, options); + auto listener = std::make_shared(); + StreamDecoder decoder(listener, options); for (int64_t offset = 0; offset < buffer_->size() - 1; ++offset) { ARROW_RETURN_NOT_OK(decoder.Consume(buffer_->data() + offset, 1)); } - *out_batches = receiver->record_batches(); + *out_batches = listener->record_batches(); return Status::OK(); } Status ReadSchema(std::shared_ptr* out) { - auto receiver = std::make_shared(); - StreamDecoder decoder(receiver); + auto listener = std::make_shared(); + StreamDecoder decoder(listener); for (int64_t offset = 0; offset < buffer_->size() - 1; ++offset) { ARROW_RETURN_NOT_OK(decoder.Consume(buffer_->data() + offset, 1)); } - *out = receiver->schema(); + *out = listener->schema(); return Status::OK(); } }; struct StreamDecoderLargeChunksWriterHelper : public StreamWriterHelper { Status ReadBatches(const IpcReadOptions& options, BatchVector* out_batches) { - auto receiver = std::make_shared(); - StreamDecoder decoder(receiver, options); + auto listener = std::make_shared(); + StreamDecoder decoder(listener, options); ARROW_RETURN_NOT_OK(decoder.Consume(SliceBuffer(buffer_, 0, 1))); ARROW_RETURN_NOT_OK(decoder.Consume(SliceBuffer(buffer_, 1))); - *out_batches = receiver->record_batches(); + *out_batches = listener->record_batches(); return Status::OK(); } Status ReadSchema(std::shared_ptr* out) { - auto receiver = std::make_shared(); - StreamDecoder decoder(receiver); + auto listener = std::make_shared(); + StreamDecoder decoder(listener); ARROW_RETURN_NOT_OK(decoder.Consume(SliceBuffer(buffer_, 0, 1))); ARROW_RETURN_NOT_OK(decoder.Consume(SliceBuffer(buffer_, 1))); - *out = receiver->schema(); + *out = listener->schema(); return Status::OK(); } }; @@ -1845,8 +1845,8 @@ TEST(TestRecordBatchStreamReader, MalformedInput) { } TEST(TestStreamDecoder, NextRequiredSize) { - auto receiver = std::make_shared(); - StreamDecoder decoder(receiver); + auto listener = std::make_shared(); + StreamDecoder decoder(listener); auto next_required_size = decoder.next_required_size(); const uint8_t data[1] = {0}; ASSERT_OK(decoder.Consume(data, 1)); diff --git a/cpp/src/arrow/ipc/reader.cc b/cpp/src/arrow/ipc/reader.cc index a2d2f30685a..571aac33fcb 100644 --- a/cpp/src/arrow/ipc/reader.cc +++ b/cpp/src/arrow/ipc/reader.cc @@ -941,12 +941,12 @@ Result> RecordBatchFileReader::Open( return result; } -Status Receiver::EosReceived() { return Status::OK(); } +Status Listener::OnEOS() { return Status::OK(); } -Status Receiver::SchemaReceived(std::shared_ptr schema) { return Status::OK(); } +Status Listener::OnSchemaDecoded(std::shared_ptr schema) { return Status::OK(); } -Status Receiver::RecordBatchReceived(std::shared_ptr record_batch) { - return Status::NotImplemented("RecordBatch receiver isn't implemented"); +Status Listener::OnRecordBatchDecoded(std::shared_ptr record_batch) { + return Status::NotImplemented("OnRecordBatchDecoded() callback isn't implemented"); } class StreamDecoder::StreamDecoderImpl : public MessageDecoderListener { @@ -959,10 +959,10 @@ class StreamDecoder::StreamDecoderImpl : public MessageDecoderListener { }; public: - explicit StreamDecoderImpl(std::shared_ptr receiver, + explicit StreamDecoderImpl(std::shared_ptr listener, const IpcReadOptions& options) : MessageDecoderListener(), - receiver_(std::move(receiver)), + listener_(std::move(listener)), options_(options), state_(State::SCHEMA), message_decoder_(std::shared_ptr(this, [](void*) {}), @@ -975,13 +975,13 @@ class StreamDecoder::StreamDecoderImpl : public MessageDecoderListener { Status OnMessageDecoded(std::unique_ptr message) override { switch (state_) { case State::SCHEMA: - ARROW_RETURN_NOT_OK(SchemaMessageReceived(std::move(message))); + ARROW_RETURN_NOT_OK(OnSchemaMessageDecoded(std::move(message))); break; case State::INITIAL_DICTIONARIES: - ARROW_RETURN_NOT_OK(InitialDictionaryMessageReceived(std::move(message))); + ARROW_RETURN_NOT_OK(OnInitialDictionaryMessageDecoded(std::move(message))); break; case State::RECORD_BATCHES: - ARROW_RETURN_NOT_OK(RecordBatchMessageReceived(std::move(message))); + ARROW_RETURN_NOT_OK(OnRecordBatchMessageDecoded(std::move(message))); break; case State::EOS: break; @@ -991,7 +991,7 @@ class StreamDecoder::StreamDecoderImpl : public MessageDecoderListener { Status OnEOS() override { state_ = State::EOS; - return receiver_->EosReceived(); + return listener_->OnEOS(); } Status Consume(const uint8_t* data, int64_t size) { @@ -1007,20 +1007,20 @@ class StreamDecoder::StreamDecoderImpl : public MessageDecoderListener { int64_t next_required_size() const { return message_decoder_.next_required_size(); } private: - Status SchemaMessageReceived(std::unique_ptr message) { + Status OnSchemaMessageDecoded(std::unique_ptr message) { RETURN_NOT_OK(PrepareSchemaMessage(*message, &dictionary_memo_, &schema_, options_, &field_inclusion_mask_)); n_required_dictionaries_ = dictionary_memo_.num_fields(); if (n_required_dictionaries_ == 0) { state_ = State::RECORD_BATCHES; - ARROW_RETURN_NOT_OK(receiver_->SchemaReceived(schema_)); + ARROW_RETURN_NOT_OK(listener_->OnSchemaDecoded(schema_)); } else { state_ = State::INITIAL_DICTIONARIES; } return Status::OK(); } - Status InitialDictionaryMessageReceived(std::unique_ptr message) { + Status OnInitialDictionaryMessageDecoded(std::unique_ptr message) { if (message->type() != Message::DICTIONARY_BATCH) { return Status::Invalid("IPC stream did not have the expected number (", dictionary_memo_.num_fields(), @@ -1030,12 +1030,12 @@ class StreamDecoder::StreamDecoderImpl : public MessageDecoderListener { n_required_dictionaries_--; if (n_required_dictionaries_ == 0) { state_ = State::RECORD_BATCHES; - ARROW_RETURN_NOT_OK(receiver_->SchemaReceived(schema_)); + ARROW_RETURN_NOT_OK(listener_->OnSchemaDecoded(schema_)); } return Status::OK(); } - Status RecordBatchMessageReceived(std::unique_ptr message) { + Status OnRecordBatchMessageDecoded(std::unique_ptr message) { if (message->type() == Message::DICTIONARY_BATCH) { return UpdateDictionaries(*message, &dictionary_memo_, options_); } else { @@ -1045,11 +1045,11 @@ class StreamDecoder::StreamDecoderImpl : public MessageDecoderListener { auto batch, ReadRecordBatchInternal(*message->metadata(), schema_, field_inclusion_mask_, &dictionary_memo_, options_, reader.get())); - return receiver_->RecordBatchReceived(std::move(batch)); + return listener_->OnRecordBatchDecoded(std::move(batch)); } } - std::shared_ptr receiver_; + std::shared_ptr listener_; IpcReadOptions options_; State state_; MessageDecoder message_decoder_; @@ -1059,9 +1059,9 @@ class StreamDecoder::StreamDecoderImpl : public MessageDecoderListener { std::shared_ptr schema_; }; -StreamDecoder::StreamDecoder(std::shared_ptr receiver, +StreamDecoder::StreamDecoder(std::shared_ptr listener, const IpcReadOptions& options) { - impl_.reset(new StreamDecoderImpl(std::move(receiver), options)); + impl_.reset(new StreamDecoderImpl(std::move(listener), options)); } StreamDecoder::~StreamDecoder() {} diff --git a/cpp/src/arrow/ipc/reader.h b/cpp/src/arrow/ipc/reader.h index f31476da7e2..45d2ee32951 100644 --- a/cpp/src/arrow/ipc/reader.h +++ b/cpp/src/arrow/ipc/reader.h @@ -196,17 +196,17 @@ class ARROW_EXPORT RecordBatchFileReader { } }; -/// \class Receiver -/// \brief A general receiver class to receive objects. +/// \class Listener +/// \brief A general listener class to receive events. /// -/// You must implement receiver methods for objects you want to receive. +/// You must implement callback methods for interested events. /// /// This API is EXPERIMENTAL. /// /// \since 0.17.0 -class ARROW_EXPORT Receiver { +class ARROW_EXPORT Listener { public: - virtual ~Receiver() = default; + virtual ~Listener() = default; /// \brief Called when end-of-stream is received. /// @@ -214,56 +214,56 @@ class ARROW_EXPORT Receiver { /// /// \return Status /// - /// \see RecordBatchStreamEmitter - virtual Status EosReceived(); + /// \see StreamDecoder + virtual Status OnEOS(); - /// \brief Called when a record batch is received. + /// \brief Called when a record batch is decoded. /// /// The default implementation just returns /// arrow::Status::NotImplemented(). /// - /// \param[in] record_batch a record batch received + /// \param[in] record_batch a record batch decoded /// \return Status /// - /// \see RecordBatchStreamEmitter - virtual Status RecordBatchReceived(std::shared_ptr record_batch); + /// \see StreamDecoder + virtual Status OnRecordBatchDecoded(std::shared_ptr record_batch); - /// \brief Called when a schema is received. + /// \brief Called when a schema is decoded. /// /// The default implementation just returns arrow::Status::OK(). /// - /// \param[in] schema a schema received + /// \param[in] schema a schema decoded /// \return Status /// - /// \see RecordBatchStreamEmitter - virtual Status SchemaReceived(std::shared_ptr schema); + /// \see StreamDecoder + virtual Status OnSchemaDecoded(std::shared_ptr schema); }; -/// \class RecordBatchReceiverCollect -/// \brief Collect record batches read by RecordBatchStreamEmitter. +/// \class CollectListener +/// \brief Collect schema and record batches decoded by StreamDecoder. /// /// This API is EXPERIMENTAL. /// /// \since 0.17.0 -class ARROW_EXPORT RecordBatchReceiverCollect : public Receiver { +class ARROW_EXPORT CollectListener : public Listener { public: - RecordBatchReceiverCollect() : schema_(), record_batches_() {} - virtual ~RecordBatchReceiverCollect() = default; + CollectListener() : schema_(), record_batches_() {} + virtual ~CollectListener() = default; - Status SchemaReceived(std::shared_ptr schema) override { + Status OnSchemaDecoded(std::shared_ptr schema) override { schema_ = std::move(schema); return Status::OK(); } - Status RecordBatchReceived(std::shared_ptr record_batch) override { + Status OnRecordBatchDecoded(std::shared_ptr record_batch) override { record_batches_.push_back(std::move(record_batch)); return Status::OK(); } - /// \return the read schema + /// \return the decoded schema std::shared_ptr schema() const { return schema_; } - /// \return the all read record batches + /// \return the all decoded record batches std::vector> record_batches() const { return record_batches_; } @@ -287,10 +287,10 @@ class ARROW_EXPORT StreamDecoder { public: /// \brief Construct a stream decoder. /// - /// \param[in] receiver a Receiver that must implement - /// Receiver::RecordBatchReceived() to receive decoded record batches + /// \param[in] listener a Listener that must implement + /// Listener::OnRecordBatchDecoded() to receive decoded record batches /// \param[in] options any IPC reading options (optional) - StreamDecoder(std::shared_ptr receiver, + StreamDecoder(std::shared_ptr listener, const IpcReadOptions& options = IpcReadOptions::Defaults()); virtual ~StreamDecoder(); @@ -298,8 +298,8 @@ class ARROW_EXPORT StreamDecoder { /// \brief Feed data to the decoder as a raw data. /// /// If the decoder can read one or more record batches by the data, - /// the decoder calls receiver->RecordBatchReceived() with a decoded - /// record batch multiple times. + /// the decoder calls listener->OnRecordBatchDecoded() with a + /// decoded record batch multiple times. /// /// \param[in] data a raw data to be processed. This data isn't /// copied. The passed memory must be kept alive through record @@ -311,7 +311,7 @@ class ARROW_EXPORT StreamDecoder { /// \brief Feed data to the decoder as a Buffer. /// /// If the decoder can read one or more record batches by the - /// Buffer, the decoder calls receiver->RecordBatchReceived() with a + /// Buffer, the decoder calls listener->RecordBatchReceived() with a /// decoded record batch multiple times. /// /// \param[in] buffer a Buffer to be processed. From 259250485b03930fa30f57ea4aa5dedc1042158f Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Fri, 10 Apr 2020 09:47:36 +0900 Subject: [PATCH 11/12] Add missing std::move() --- cpp/src/arrow/ipc/message.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/ipc/message.cc b/cpp/src/arrow/ipc/message.cc index e0b47942fe2..94887d48ded 100644 --- a/cpp/src/arrow/ipc/message.cc +++ b/cpp/src/arrow/ipc/message.cc @@ -284,7 +284,7 @@ Result> ReadMessage(int64_t offset, int32_t metadata_le switch (decoder.state()) { case MessageDecoder::State::INITIAL: - return result; + return std::move(result); case MessageDecoder::State::METADATA_LENGTH: return Status::Invalid("metadata length is missing. File offset: ", offset, ", metadata length: ", metadata_length); @@ -301,7 +301,7 @@ Result> ReadMessage(int64_t offset, int32_t metadata_le " bytes for message body, got ", body->size()); } RETURN_NOT_OK(decoder.Consume(body)); - return result; + return std::move(result); } case MessageDecoder::State::EOS: return Status::Invalid("Unexpected empty message in IPC file format"); From 7bcc20ef6f2a4f92bc4f6985576fb33c18401098 Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Fri, 10 Apr 2020 09:50:40 +0900 Subject: [PATCH 12/12] Suppress warnings --- cpp/src/arrow/ipc/message.cc | 22 ++-------------------- 1 file changed, 2 insertions(+), 20 deletions(-) diff --git a/cpp/src/arrow/ipc/message.cc b/cpp/src/arrow/ipc/message.cc index 94887d48ded..75a8cd7a11b 100644 --- a/cpp/src/arrow/ipc/message.cc +++ b/cpp/src/arrow/ipc/message.cc @@ -456,25 +456,7 @@ class MessageDecoder::MessageDecoderImpl { next_required_size_(initial_next_required_size), chunks_(), buffered_size_(0), - metadata_(nullptr) { - switch (state_) { - case State::INITIAL: - listener_->OnInitial(); - break; - case State::METADATA_LENGTH: - listener_->OnMetadataLength(); - break; - case State::METADATA: - listener_->OnMetadata(); - break; - case State::BODY: - listener_->OnBody(); - break; - case State::EOS: - listener_->OnEOS(); - break; - } - } + metadata_(nullptr) {} Status ConsumeData(const uint8_t* data, int64_t size) { if (buffered_size_ == 0) { @@ -849,7 +831,7 @@ class InputStreamMessageReader : public MessageReader, public MessageDecoderList return Status::OK(); } - Result> ReadNextMessage() { + Result> ReadNextMessage() override { ARROW_RETURN_NOT_OK(DecodeMessage(&decoder_, stream_)); return std::move(message_); }