-
Notifications
You must be signed in to change notification settings - Fork 4k
GH-35498: [C++] Relax EnsureAlignment check in Acero from requiring 64-byte aligned buffers to requiring value-aligned buffers #35565
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
87e7dfa
9124352
bd67396
692c32c
b3c54a9
2918936
a429a7c
84d91db
55079a3
feb917e
28a046b
e7545ed
9d1071d
e869650
d6f9b00
c8dfd40
ad6e3df
c434bb2
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -496,6 +496,16 @@ struct ARROW_ACERO_EXPORT Declaration { | |
| std::string label; | ||
| }; | ||
|
|
||
| /// \brief How to handle unaligned buffers | ||
| enum class UnalignedBufferHandling { kWarn, kIgnore, kReallocate, kError }; | ||
|
|
||
| /// \brief get the default behavior of unaligned buffer handling | ||
| /// | ||
| /// This is configurable via the ACERO_ALIGNMENT_HANDLING environment variable which | ||
| /// can be set to "warn", "ignore", "reallocate", or "error". If the environment | ||
| /// variable is not set, or is set to an invalid value, this will return kWarn | ||
| UnalignedBufferHandling GetDefaultUnalignedBufferHandling(); | ||
|
|
||
| /// \brief plan-wide options that can be specified when executing an execution plan | ||
| struct ARROW_ACERO_EXPORT QueryOptions { | ||
| /// \brief Should the plan use a legacy batching strategy | ||
|
|
@@ -562,6 +572,36 @@ struct ARROW_ACERO_EXPORT QueryOptions { | |
| /// | ||
| /// If set then the number of names must equal the number of output columns | ||
| std::vector<std::string> field_names; | ||
|
|
||
| /// \brief Policy for unaligned buffers in source data | ||
| /// | ||
| /// Various compute functions and acero internals will type pun array | ||
| /// buffers from uint8_t* to some kind of value type (e.g. we might | ||
| /// cast to int32_t* to add two int32 arrays) | ||
| /// | ||
| /// If the buffer is poorly aligned (e.g. an int32 array is not aligned | ||
| /// on a 4-byte boundary) then this is technically undefined behavior in C++. | ||
| /// However, most modern compilers and CPUs are fairly tolerant of this | ||
| /// behavior and nothing bad (beyond a small hit to performance) is likely | ||
| /// to happen. | ||
| /// | ||
| /// Note that this only applies to source buffers. All buffers allocated internally | ||
| /// by Acero will be suitably aligned. | ||
| /// | ||
| /// If this field is set to kWarn then Acero will check if any buffers are unaligned | ||
|
||
| /// and, if they are, will emit a warning. | ||
| /// | ||
| /// If this field is set to kReallocate then Acero will allocate a new, suitably aligned | ||
| /// buffer and copy the contents from the old buffer into this new buffer. | ||
| /// | ||
| /// If this field is set to kError then Acero will gracefully abort the plan instead. | ||
| /// | ||
| /// If this field is set to kIgnore then Acero will not even check if the buffers are | ||
| /// unaligned. | ||
| /// | ||
| /// If this field is not set then it will be treated as kWarn unless overridden | ||
| /// by the ACERO_ALIGNMENT_HANDLING environment variable | ||
| std::optional<UnalignedBufferHandling> unaligned_buffer_handling; | ||
| }; | ||
|
|
||
| /// \brief Calculate the output schema of a declaration | ||
|
|
||
| Original file line number | Diff line number | Diff line change | ||||||||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
|
|
@@ -1704,5 +1704,45 @@ TEST(ExecPlanExecution, SegmentedAggregationWithBatchCrossingSegment) { | |||||||||||||||||
| {expected}); | ||||||||||||||||||
| } | ||||||||||||||||||
|
|
||||||||||||||||||
| TEST(ExecPlanExecution, UnalignedInput) { | ||||||||||||||||||
| std::shared_ptr<Array> array = ArrayFromJSON(int32(), "[1, 2, 3]"); | ||||||||||||||||||
| std::shared_ptr<Array> unaligned = UnalignBuffers(*array); | ||||||||||||||||||
| ASSERT_OK_AND_ASSIGN(ExecBatch sample_batch, | ||||||||||||||||||
| ExecBatch::Make({unaligned}, array->length())); | ||||||||||||||||||
|
|
||||||||||||||||||
| BatchesWithSchema data; | ||||||||||||||||||
| data.batches = {std::move(sample_batch)}; | ||||||||||||||||||
| data.schema = schema({field("i32", int32())}); | ||||||||||||||||||
|
|
||||||||||||||||||
| Declaration plan = Declaration::Sequence({ | ||||||||||||||||||
| {"exec_batch_source", ExecBatchSourceNodeOptions(data.schema, data.batches)}, | ||||||||||||||||||
| }); | ||||||||||||||||||
|
|
||||||||||||||||||
| int64_t initial_bytes_allocated = default_memory_pool()->total_bytes_allocated(); | ||||||||||||||||||
|
|
||||||||||||||||||
| // By default we should warn and so the plan should finish ok | ||||||||||||||||||
| ASSERT_OK(DeclarationToStatus(plan)); | ||||||||||||||||||
| ASSERT_EQ(initial_bytes_allocated, default_memory_pool()->total_bytes_allocated()); | ||||||||||||||||||
|
|
||||||||||||||||||
| QueryOptions query_options; | ||||||||||||||||||
|
|
||||||||||||||||||
| #ifndef ARROW_UBSAN | ||||||||||||||||||
| // Nothing should happen if we ignore alignment | ||||||||||||||||||
| query_options.unaligned_buffer_handling = UnalignedBufferHandling::kIgnore; | ||||||||||||||||||
| ASSERT_OK(DeclarationToStatus(plan, query_options)); | ||||||||||||||||||
| ASSERT_EQ(initial_bytes_allocated, default_memory_pool()->total_bytes_allocated()); | ||||||||||||||||||
|
||||||||||||||||||
| query_options.unaligned_buffer_handling = UnalignedBufferHandling::kIgnore; | |
| ASSERT_OK(DeclarationToStatus(plan, query_options)); | |
| ASSERT_EQ(initial_bytes_allocated, default_memory_pool()->total_bytes_allocated()); | |
| #ifndef ARROW_UBSAN | |
| query_options.unaligned_buffer_handling = UnalignedBufferHandling::kIgnore; | |
| ASSERT_OK(DeclarationToStatus(plan, query_options)); | |
| ASSERT_EQ(initial_bytes_allocated, default_memory_pool()->total_bytes_allocated()); | |
| #endif |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I use ARROW_UBSAN regularly and so I was curious why this wasn't a problem. It turns out we never pass -fsanitize=alignment to ubsan. Perhaps we should add this flag?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Perhaps open a separate issue for that, but it may open a can of worms :-)
Uh oh!
There was an error while loading. Please reload this page.