-
Notifications
You must be signed in to change notification settings - Fork 4k
ARROW-14429: [C++] Speed up IPC file reader on high-latency filesystems #11535
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
86e9883
99ac26f
fd1902c
c3758b1
860a14d
3c2ae3d
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -980,8 +980,9 @@ static Result<std::unique_ptr<Message>> ReadMessageFromBlock( | |
| // TODO(wesm): this breaks integration tests, see ARROW-3256 | ||
| // DCHECK_EQ((*out)->body_length(), block.body_length); | ||
|
|
||
| ARROW_ASSIGN_OR_RAISE(auto message, ReadMessage(block.offset, block.metadata_length, | ||
| file, fields_loader)); | ||
| ARROW_ASSIGN_OR_RAISE( | ||
| auto message, ReadMessage(block.offset, block.metadata_length, block.body_length, | ||
| file, fields_loader)); | ||
| return std::move(message); | ||
| } | ||
|
|
||
|
|
@@ -1257,15 +1258,23 @@ class RecordBatchFileReaderImpl : public RecordBatchFileReader { | |
| } | ||
|
|
||
| Future<> ReadFooterAsync(arrow::internal::Executor* executor) { | ||
| // When reading the footer, read up to this much additional data in | ||
| // an attempt to avoid a second I/O operation (which can be slow | ||
| // on a high-latency filesystem like S3) | ||
| constexpr static int kFooterReadaheadSize = 512 * 1024; | ||
|
|
||
| const int32_t magic_size = static_cast<int>(strlen(kArrowMagicBytes)); | ||
|
|
||
| if (footer_offset_ <= magic_size * 2 + 4) { | ||
| return Status::Invalid("File is too small: ", footer_offset_); | ||
| } | ||
|
|
||
| int file_end_size = static_cast<int>(magic_size + sizeof(int32_t)); | ||
| int readahead = std::min<int>(kFooterReadaheadSize, | ||
| static_cast<int>(footer_offset_ - file_end_size)); | ||
| auto self = std::dynamic_pointer_cast<RecordBatchFileReaderImpl>(shared_from_this()); | ||
| auto read_magic = file_->ReadAsync(footer_offset_ - file_end_size, file_end_size); | ||
| auto read_magic = file_->ReadAsync(footer_offset_ - file_end_size - readahead, | ||
| file_end_size + readahead); | ||
| if (executor) read_magic = executor->Transfer(std::move(read_magic)); | ||
| return read_magic | ||
| .Then([=](const std::shared_ptr<Buffer>& buffer) | ||
|
|
@@ -1276,23 +1285,35 @@ class RecordBatchFileReaderImpl : public RecordBatchFileReader { | |
| "from end of file"); | ||
| } | ||
|
|
||
| if (memcmp(buffer->data() + sizeof(int32_t), kArrowMagicBytes, magic_size)) { | ||
| const uint8_t* magic_start = buffer->data() + readahead; | ||
| if (memcmp(magic_start + sizeof(int32_t), kArrowMagicBytes, magic_size)) { | ||
| return Status::Invalid("Not an Arrow file"); | ||
| } | ||
|
|
||
| int32_t footer_length = BitUtil::FromLittleEndian( | ||
| *reinterpret_cast<const int32_t*>(buffer->data())); | ||
|
|
||
| int32_t footer_length = | ||
| BitUtil::FromLittleEndian(util::SafeLoadAs<int32_t>(magic_start)); | ||
| if (footer_length <= 0 || | ||
| footer_length > self->footer_offset_ - magic_size * 2 - 4) { | ||
| return Status::Invalid("File is smaller than indicated metadata size"); | ||
| } | ||
|
|
||
| // Now read the footer | ||
| auto read_footer = self->file_->ReadAsync( | ||
| self->footer_offset_ - footer_length - file_end_size, footer_length); | ||
| if (executor) read_footer = executor->Transfer(std::move(read_footer)); | ||
| return read_footer; | ||
| if (footer_length <= readahead) { | ||
| return SliceBuffer(buffer, buffer->size() - file_end_size - footer_length, | ||
| footer_length); | ||
| } | ||
|
|
||
|
||
| const int64_t already_read = buffer->size() - file_end_size; | ||
| auto read_remainder = | ||
| self->file_->ReadAsync(self->footer_offset_ - footer_length - file_end_size, | ||
| footer_length - already_read); | ||
| auto* memory_pool = options_.memory_pool; | ||
| if (executor) read_remainder = executor->Transfer(std::move(read_remainder)); | ||
| return read_remainder.Then([memory_pool, buffer, already_read]( | ||
| const std::shared_ptr<Buffer>& remainder) { | ||
| return ConcatenateBuffers({remainder, SliceBuffer(buffer, 0, already_read)}, | ||
| memory_pool); | ||
| }); | ||
| }) | ||
| .Then([=](const std::shared_ptr<Buffer>& buffer) -> Status { | ||
| self->footer_buffer_ = buffer; | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Nit:
`metadata` is a slightly inaccurate name now.