-
Notifications
You must be signed in to change notification settings - Fork 4k
GH-43944: [C++][Parquet] Add support for arrow::ArrayStatistics: non zero-copy int based types #43945
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
GH-43944: [C++][Parquet] Add support for arrow::ArrayStatistics: non zero-copy int based types #43945
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -319,26 +319,59 @@ void ReconstructChunksWithoutNulls(::arrow::ArrayVector* chunks) { | |
| } | ||
|
|
||
| template <typename ArrowType, typename ParquetType> | ||
| Status TransferInt(RecordReader* reader, MemoryPool* pool, | ||
| const std::shared_ptr<Field>& field, Datum* out) { | ||
| Status TransferInt(RecordReader* reader, | ||
| std::unique_ptr<::parquet::ColumnChunkMetaData> metadata, | ||
| const ReaderContext* ctx, const std::shared_ptr<Field>& field, | ||
| Datum* out) { | ||
| using ArrowCType = typename ArrowType::c_type; | ||
| using ParquetCType = typename ParquetType::c_type; | ||
| int64_t length = reader->values_written(); | ||
| ARROW_ASSIGN_OR_RAISE(auto data, | ||
| ::arrow::AllocateBuffer(length * sizeof(ArrowCType), pool)); | ||
| ::arrow::AllocateBuffer(length * sizeof(ArrowCType), ctx->pool)); | ||
|
|
||
| auto values = reinterpret_cast<const ParquetCType*>(reader->values()); | ||
| auto out_ptr = reinterpret_cast<ArrowCType*>(data->mutable_data()); | ||
| std::copy(values, values + length, out_ptr); | ||
| int64_t null_count = 0; | ||
| std::vector<std::shared_ptr<Buffer>> buffers = {nullptr, std::move(data)}; | ||
| if (field->nullable()) { | ||
| *out = std::make_shared<ArrayType<ArrowType>>(field->type(), length, std::move(data), | ||
| reader->ReleaseIsValid(), | ||
| reader->null_count()); | ||
| } else { | ||
| *out = | ||
| std::make_shared<ArrayType<ArrowType>>(field->type(), length, std::move(data), | ||
| /*null_bitmap=*/nullptr, /*null_count=*/0); | ||
| null_count = reader->null_count(); | ||
| buffers[0] = reader->ReleaseIsValid(); | ||
| } | ||
| auto array_data = | ||
| ::arrow::ArrayData::Make(field->type(), length, std::move(buffers), null_count); | ||
| auto array_statistics = std::make_shared<::arrow::ArrayStatistics>(); | ||
| array_statistics->null_count = null_count; | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The null_count for some type ( nested ) would be a bit weird, FYI
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Thanks for the information. |
||
| auto statistics = metadata->statistics().get(); | ||
| if (statistics) { | ||
| if (statistics->HasDistinctCount()) { | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Do we need a separate function for the stats conversion?
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'll do it when I add more target types as the next pull request. |
||
| array_statistics->distinct_count = statistics->distinct_count(); | ||
| } | ||
| if (statistics->HasMinMax()) { | ||
| auto typed_statistics = | ||
| static_cast<::parquet::TypedStatistics<ParquetType>*>(statistics); | ||
| const ArrowCType min = typed_statistics->min(); | ||
| const ArrowCType max = typed_statistics->max(); | ||
| if (std::is_signed<ArrowCType>::value) { | ||
| array_statistics->min = static_cast<int64_t>(min); | ||
| array_statistics->max = static_cast<int64_t>(max); | ||
| } else { | ||
| array_statistics->min = static_cast<uint64_t>(min); | ||
| array_statistics->max = static_cast<uint64_t>(max); | ||
| } | ||
| // We can assume that integer based min/max are always exact if | ||
| // they exist. Apache Parquet's "Statistics" has | ||
| // "is_min_value_exact" and "is_max_value_exact" but we can | ||
| // ignore them for integer based min/max. | ||
| // | ||
| // See also the discussion at dev@parquet.apache.org: | ||
| // https://lists.apache.org/thread/zfnmg5p51b7oylft5w5k4670wgkd4zv4 | ||
| array_statistics->is_min_exact = true; | ||
| array_statistics->is_max_exact = true; | ||
|
Comment on lines
+369
to
+370
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can you add correspond comment here? This might be a bit tricky
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Sure. We should document about the discussion at #43595 (comment) , right? BTW, could you share the e-mail URL for #43595 (comment) ?
I couldn't find it at https://lists.apache.org/list.html?dev@parquet.apache.org . Ah, I forgot to add a writer check here. I should have set
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Sorry, let me setup a discussion, generally if it's from Parquet C++, it will works. I'm a bit busy this morning preparing for my tour, I'll try to work it out this noon
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. No problem. Thanks!
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Thanks!
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I've added a comment that we can always use I didn't need
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. FYI, I found the string and FLBA might being truncated, other types in public impl will not being truncated if exists |
||
| } | ||
| } | ||
| array_data->statistics = std::move(array_statistics); | ||
| *out = std::make_shared<ArrayType<ArrowType>>(std::move(array_data)); | ||
| return Status::OK(); | ||
| } | ||
|
|
||
|
|
@@ -728,21 +761,26 @@ Status TransferHalfFloat(RecordReader* reader, MemoryPool* pool, | |
|
|
||
| } // namespace | ||
|
|
||
| #define TRANSFER_INT32(ENUM, ArrowType) \ | ||
| case ::arrow::Type::ENUM: { \ | ||
| Status s = TransferInt<ArrowType, Int32Type>(reader, pool, value_field, &result); \ | ||
| RETURN_NOT_OK(s); \ | ||
| #define TRANSFER_INT32(ENUM, ArrowType) \ | ||
| case ::arrow::Type::ENUM: { \ | ||
| Status s = TransferInt<ArrowType, Int32Type>(reader, std::move(metadata), ctx, \ | ||
| value_field, &result); \ | ||
| RETURN_NOT_OK(s); \ | ||
| } break; | ||
|
|
||
| #define TRANSFER_INT64(ENUM, ArrowType) \ | ||
| case ::arrow::Type::ENUM: { \ | ||
| Status s = TransferInt<ArrowType, Int64Type>(reader, pool, value_field, &result); \ | ||
| RETURN_NOT_OK(s); \ | ||
| #define TRANSFER_INT64(ENUM, ArrowType) \ | ||
| case ::arrow::Type::ENUM: { \ | ||
| Status s = TransferInt<ArrowType, Int64Type>(reader, std::move(metadata), ctx, \ | ||
| value_field, &result); \ | ||
| RETURN_NOT_OK(s); \ | ||
| } break; | ||
|
|
||
| Status TransferColumnData(RecordReader* reader, const std::shared_ptr<Field>& value_field, | ||
| const ColumnDescriptor* descr, MemoryPool* pool, | ||
| Status TransferColumnData(RecordReader* reader, | ||
| std::unique_ptr<::parquet::ColumnChunkMetaData> metadata, | ||
| const std::shared_ptr<Field>& value_field, | ||
| const ColumnDescriptor* descr, const ReaderContext* ctx, | ||
| std::shared_ptr<ChunkedArray>* out) { | ||
| auto pool = ctx->pool; | ||
| Datum result; | ||
| std::shared_ptr<ChunkedArray> chunked_result; | ||
| switch (value_field->type()->id()) { | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The call to
input_->column_chunk_metadata()fails here if the list ofrow_groupsininput_is empty, becauseinput_is not yet initialized properly at this point in that caseVia
row_group_metadata()->RowGroup(-1)There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Good catch!
Could you open a new issue for it?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
#45339