This repository was archived by the owner on May 10, 2024. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 191
PARQUET-687: C++: Switch to PLAIN encoding if dictionary grows too large #157
Closed
Closed
Changes from all commits
Commits
Show all changes
8 commits
Select commit
Hold shift + click to select a range
84f360d
added dictionary fallback support with tests
54af38a
clang format
dd0cc7e
Add all types to the test
312bad8
minor changes
da46033
added comments and fixed review suggestions
eac9114
clang format
c498aeb
modify comment style
6f51df6
minor comment fix
File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -47,7 +47,8 @@ ColumnWriter::ColumnWriter(const ColumnDescriptor* descr, | |
| num_buffered_encoded_values_(0), | ||
| num_rows_(0), | ||
| total_bytes_written_(0), | ||
| closed_(false) { | ||
| closed_(false), | ||
| fallback_(false) { | ||
| InitSinks(); | ||
| } | ||
|
|
||
|
|
@@ -118,7 +119,13 @@ void ColumnWriter::AddDataPage() { | |
| DataPage page( | ||
| uncompressed_data, num_buffered_values_, encoding_, Encoding::RLE, Encoding::RLE); | ||
|
|
||
| data_pages_.push_back(std::move(page)); | ||
| // Write the page to OutputStream eagerly if there is no dictionary or | ||
| // if dictionary encoding has fallen back to PLAIN | ||
| if (has_dictionary_ && !fallback_) { // Save pages until end of dictionary encoding | ||
| data_pages_.push_back(std::move(page)); | ||
| } else { // Eagerly write pages | ||
| WriteDataPage(page); | ||
| } | ||
|
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Comment: "Fast path to serialize data pages." |
||
|
|
||
| // Re-initialize the sinks as GetBuffer made them invalid. | ||
| InitSinks(); | ||
|
|
@@ -134,52 +141,71 @@ void ColumnWriter::WriteDataPage(const DataPage& page) { | |
| int64_t ColumnWriter::Close() { | ||
| if (!closed_) { | ||
| closed_ = true; | ||
| if (has_dictionary_) { WriteDictionaryPage(); } | ||
| // Write all outstanding data to a new page | ||
| if (num_buffered_values_ > 0) { AddDataPage(); } | ||
| if (has_dictionary_ && !fallback_) { WriteDictionaryPage(); } | ||
|
|
||
| FlushBufferedDataPages(); | ||
|
|
||
| for (size_t i = 0; i < data_pages_.size(); i++) { | ||
| WriteDataPage(data_pages_[i]); | ||
| } | ||
| pager_->Close(has_dictionary_, fallback_); | ||
| } | ||
|
|
||
| if (num_rows_ != expected_rows_) { | ||
| throw ParquetException( | ||
| "Less then the number of expected rows written in" | ||
| "Less than the number of expected rows written in" | ||
| " the current column chunk"); | ||
| } | ||
|
|
||
| pager_->Close(); | ||
|
|
||
| return total_bytes_written_; | ||
| } | ||
|
|
||
| void ColumnWriter::FlushBufferedDataPages() { | ||
| // Write all outstanding data to a new page | ||
| if (num_buffered_values_ > 0) { AddDataPage(); } | ||
| for (size_t i = 0; i < data_pages_.size(); i++) { | ||
| WriteDataPage(data_pages_[i]); | ||
| } | ||
| data_pages_.clear(); | ||
| } | ||
|
|
||
| // ---------------------------------------------------------------------- | ||
| // TypedColumnWriter | ||
|
|
||
| template <typename Type> | ||
| TypedColumnWriter<Type>::TypedColumnWriter(const ColumnDescriptor* schema, | ||
| TypedColumnWriter<Type>::TypedColumnWriter(const ColumnDescriptor* descr, | ||
| std::unique_ptr<PageWriter> pager, int64_t expected_rows, Encoding::type encoding, | ||
| const WriterProperties* properties) | ||
| : ColumnWriter(schema, std::move(pager), expected_rows, | ||
| : ColumnWriter(descr, std::move(pager), expected_rows, | ||
| (encoding == Encoding::PLAIN_DICTIONARY || | ||
| encoding == Encoding::RLE_DICTIONARY), | ||
| encoding, properties) { | ||
| switch (encoding) { | ||
| case Encoding::PLAIN: | ||
| current_encoder_ = std::unique_ptr<EncoderType>( | ||
| new PlainEncoder<Type>(schema, properties->allocator())); | ||
| current_encoder_.reset(new PlainEncoder<Type>(descr, properties->allocator())); | ||
| break; | ||
| case Encoding::PLAIN_DICTIONARY: | ||
| case Encoding::RLE_DICTIONARY: | ||
| current_encoder_ = std::unique_ptr<EncoderType>( | ||
| new DictEncoder<Type>(schema, &pool_, properties->allocator())); | ||
| current_encoder_.reset( | ||
| new DictEncoder<Type>(descr, &pool_, properties->allocator())); | ||
| break; | ||
| default: | ||
| ParquetException::NYI("Selected encoding is not supported"); | ||
| } | ||
| } | ||
|
|
||
| // Only one Dictionary Page is written. | ||
| // Fallback to PLAIN if dictionary page limit is reached. | ||
| template <typename Type> | ||
| void TypedColumnWriter<Type>::CheckDictionarySizeLimit() { | ||
| auto dict_encoder = static_cast<DictEncoder<Type>*>(current_encoder_.get()); | ||
| if (dict_encoder->dict_encoded_size() >= properties_->dictionary_pagesize_limit()) { | ||
| WriteDictionaryPage(); | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Comment: "Could we extract the following lines into a function?" |
||
| // Serialize the buffered Dictionary Indicies | ||
| FlushBufferedDataPages(); | ||
| fallback_ = true; | ||
| // Only PLAIN encoding is supported for fallback in V1 | ||
| current_encoder_.reset(new PlainEncoder<Type>(descr_, properties_->allocator())); | ||
| } | ||
| } | ||
|
|
||
| template <typename Type> | ||
| void TypedColumnWriter<Type>::WriteDictionaryPage() { | ||
| auto dict_encoder = static_cast<DictEncoder<Type>*>(current_encoder_.get()); | ||
|
|
||
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This comment, added to explicitly verify the encodings, did uncover a bug in the metadata writer.