From 6ed85e631b67591ed87ba6421dd8328530190f38 Mon Sep 17 00:00:00 2001 From: Davide Pasetto Date: Tue, 7 Mar 2023 14:57:31 -0500 Subject: [PATCH 01/11] Moved code from compute/exec to compute to reduce entanglement with acero --- cpp/src/arrow/CMakeLists.txt | 4 +- cpp/src/arrow/compute/CMakeLists.txt | 3 +- cpp/src/arrow/compute/exec/CMakeLists.txt | 3 +- cpp/src/arrow/compute/exec/asof_join_node.cc | 2 +- .../arrow/compute/exec/bloom_filter_test.cc | 2 +- cpp/src/arrow/compute/exec/hash_join_node.cc | 2 +- cpp/src/arrow/compute/exec/swiss_join.cc | 2 +- cpp/src/arrow/compute/exec/util.h | 172 +++++------ .../arrow/compute/kernels/hash_aggregate.cc | 2 +- cpp/src/arrow/compute/{exec => }/key_hash.cc | 2 +- cpp/src/arrow/compute/{exec => }/key_hash.h | 0 .../arrow/compute/{exec => }/key_hash_avx2.cc | 2 +- .../arrow/compute/{exec => }/key_hash_test.cc | 2 +- cpp/src/arrow/compute/light_array.h | 3 +- cpp/src/arrow/compute/light_array_test.cc | 284 +++++++++--------- cpp/src/arrow/compute/row/grouper.cc | 2 +- cpp/src/arrow/compute/util.h | 131 ++++++++ 17 files changed, 378 insertions(+), 240 deletions(-) rename cpp/src/arrow/compute/{exec => }/key_hash.cc (99%) rename cpp/src/arrow/compute/{exec => }/key_hash.h (100%) rename cpp/src/arrow/compute/{exec => }/key_hash_avx2.cc (99%) rename cpp/src/arrow/compute/{exec => }/key_hash_test.cc (99%) create mode 100644 cpp/src/arrow/compute/util.h diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt index 2a4748d0a40..bf406ca29fb 100644 --- a/cpp/src/arrow/CMakeLists.txt +++ b/cpp/src/arrow/CMakeLists.txt @@ -401,7 +401,7 @@ list(APPEND compute/exec/hash_join.cc compute/exec/hash_join_dict.cc compute/exec/hash_join_node.cc - compute/exec/key_hash.cc + compute/key_hash.cc compute/exec/key_map.cc compute/exec/map_node.cc compute/exec/options.cc @@ -442,7 +442,7 @@ list(APPEND compute/row/row_internal.cc) append_avx2_src(compute/exec/bloom_filter_avx2.cc) -append_avx2_src(compute/exec/key_hash_avx2.cc) +append_avx2_src(compute/key_hash_avx2.cc) append_avx2_src(compute/exec/key_map_avx2.cc) append_avx2_src(compute/exec/swiss_join_avx2.cc) append_avx2_src(compute/exec/util_avx2.cc) diff --git a/cpp/src/arrow/compute/CMakeLists.txt b/cpp/src/arrow/compute/CMakeLists.txt index cdf019b798b..4accec15d9c 100644 --- a/cpp/src/arrow/compute/CMakeLists.txt +++ b/cpp/src/arrow/compute/CMakeLists.txt @@ -82,7 +82,8 @@ add_arrow_compute_test(internals_test exec_test.cc kernel_test.cc light_array_test.cc - registry_test.cc) + registry_test.cc + key_hash_test.cc) add_arrow_benchmark(function_benchmark PREFIX "arrow-compute") diff --git a/cpp/src/arrow/compute/exec/CMakeLists.txt b/cpp/src/arrow/compute/exec/CMakeLists.txt index 9f3eedb63de..cc8e7175a2a 100644 --- a/cpp/src/arrow/compute/exec/CMakeLists.txt +++ b/cpp/src/arrow/compute/exec/CMakeLists.txt @@ -46,8 +46,7 @@ add_arrow_compute_test(hash_join_node_test "arrow-compute" SOURCES hash_join_node_test.cc - bloom_filter_test.cc - key_hash_test.cc) + bloom_filter_test.cc) add_arrow_compute_test(pivot_longer_node_test PREFIX "arrow-compute" diff --git a/cpp/src/arrow/compute/exec/asof_join_node.cc b/cpp/src/arrow/compute/exec/asof_join_node.cc index 00a733be25e..4aa0fb72f05 100644 --- a/cpp/src/arrow/compute/exec/asof_join_node.cc +++ b/cpp/src/arrow/compute/exec/asof_join_node.cc @@ -30,7 +30,7 @@ #include "arrow/array/builder_binary.h" #include "arrow/array/builder_primitive.h" #include "arrow/compute/exec/exec_plan.h" -#include "arrow/compute/exec/key_hash.h" +#include "arrow/compute/key_hash.h" #include "arrow/compute/exec/options.h" #include "arrow/compute/exec/query_context.h" #include "arrow/compute/exec/schema_util.h" diff --git a/cpp/src/arrow/compute/exec/bloom_filter_test.cc b/cpp/src/arrow/compute/exec/bloom_filter_test.cc index 50993b4cb10..5dc35ed42ab 100644 --- a/cpp/src/arrow/compute/exec/bloom_filter_test.cc +++ b/cpp/src/arrow/compute/exec/bloom_filter_test.cc @@ -23,7 +23,7 @@ #include #include #include "arrow/compute/exec/bloom_filter.h" -#include "arrow/compute/exec/key_hash.h" +#include "arrow/compute/key_hash.h" #include "arrow/compute/exec/task_util.h" #include "arrow/compute/exec/test_util.h" #include "arrow/compute/exec/util.h" diff --git a/cpp/src/arrow/compute/exec/hash_join_node.cc b/cpp/src/arrow/compute/exec/hash_join_node.cc index 6155ebd603f..c270b868ecc 100644 --- a/cpp/src/arrow/compute/exec/hash_join_node.cc +++ b/cpp/src/arrow/compute/exec/hash_join_node.cc @@ -24,7 +24,7 @@ #include "arrow/compute/exec/hash_join.h" #include "arrow/compute/exec/hash_join_dict.h" #include "arrow/compute/exec/hash_join_node.h" -#include "arrow/compute/exec/key_hash.h" +#include "arrow/compute/key_hash.h" #include "arrow/compute/exec/options.h" #include "arrow/compute/exec/schema_util.h" #include "arrow/compute/exec/util.h" diff --git a/cpp/src/arrow/compute/exec/swiss_join.cc b/cpp/src/arrow/compute/exec/swiss_join.cc index de9b720c480..69479325bbb 100644 --- a/cpp/src/arrow/compute/exec/swiss_join.cc +++ b/cpp/src/arrow/compute/exec/swiss_join.cc @@ -22,7 +22,7 @@ #include #include "arrow/array/util.h" // MakeArrayFromScalar #include "arrow/compute/exec/hash_join.h" -#include "arrow/compute/exec/key_hash.h" +#include "arrow/compute/key_hash.h" #include "arrow/compute/exec/swiss_join_internal.h" #include "arrow/compute/exec/util.h" #include "arrow/compute/kernels/row_encoder_internal.h" diff --git a/cpp/src/arrow/compute/exec/util.h b/cpp/src/arrow/compute/exec/util.h index a2018277cdc..def9e0c714b 100644 --- a/cpp/src/arrow/compute/exec/util.h +++ b/cpp/src/arrow/compute/exec/util.h @@ -28,6 +28,7 @@ #include "arrow/compute/exec/expression.h" #include "arrow/compute/exec/options.h" #include "arrow/compute/type_fwd.h" +#include "arrow/compute/util.h" #include "arrow/memory_pool.h" #include "arrow/result.h" #include "arrow/status.h" @@ -38,31 +39,33 @@ #include "arrow/util/thread_pool.h" #include "arrow/util/type_fwd.h" -#if defined(__clang__) || defined(__GNUC__) -#define BYTESWAP(x) __builtin_bswap64(x) -#define ROTL(x, n) (((x) << (n)) | ((x) >> ((-n) & 31))) -#define ROTL64(x, n) (((x) << (n)) | ((x) >> ((-n) & 63))) -#define PREFETCH(ptr) __builtin_prefetch((ptr), 0 /* rw==read */, 3 /* locality */) -#elif defined(_MSC_VER) -#include -#define BYTESWAP(x) _byteswap_uint64(x) -#define ROTL(x, n) _rotl((x), (n)) -#define ROTL64(x, n) _rotl64((x), (n)) -#if defined(_M_X64) || defined(_M_I86) -#include // https://msdn.microsoft.com/fr-fr/library/84szxsww(v=vs.90).aspx -#define PREFETCH(ptr) _mm_prefetch((const char*)(ptr), _MM_HINT_T0) -#else -#define PREFETCH(ptr) (void)(ptr) /* disabled */ -#endif -#endif +// DIPO +//#if defined(__clang__) || defined(__GNUC__) +//#define BYTESWAP(x) __builtin_bswap64(x) +//#define ROTL(x, n) (((x) << (n)) | ((x) >> ((-n) & 31))) +//#define ROTL64(x, n) (((x) << (n)) | ((x) >> ((-n) & 63))) +//#define PREFETCH(ptr) __builtin_prefetch((ptr), 0 /* rw==read */, 3 /* locality */) +//#elif defined(_MSC_VER) +//#include +//#define BYTESWAP(x) _byteswap_uint64(x) +//#define ROTL(x, n) _rotl((x), (n)) +//#define ROTL64(x, n) _rotl64((x), (n)) +//#if defined(_M_X64) || defined(_M_I86) +//#include // https://msdn.microsoft.com/fr-fr/library/84szxsww(v=vs.90).aspx +//#define PREFETCH(ptr) _mm_prefetch((const char*)(ptr), _MM_HINT_T0) +//#else +//#define PREFETCH(ptr) (void)(ptr) /* disabled */ +//#endif +//#endif namespace arrow { namespace util { -template -inline void CheckAlignment(const void* ptr) { - ARROW_DCHECK(reinterpret_cast(ptr) % sizeof(T) == 0); -} +// DIPO +//template +//inline void CheckAlignment(const void* ptr) { +// ARROW_DCHECK(reinterpret_cast(ptr) % sizeof(T) == 0); +//} // Some platforms typedef int64_t as long int instead of long long int, // which breaks the _mm256_i64gather_epi64 and _mm256_i32gather_epi64 intrinsics @@ -84,69 +87,70 @@ class MiniBatch { static constexpr int kMiniBatchLength = 1 << kLogMiniBatchLength; }; -/// Storage used to allocate temporary vectors of a batch size. -/// Temporary vectors should resemble allocating temporary variables on the stack -/// but in the context of vectorized processing where we need to store a vector of -/// temporaries instead of a single value. -class TempVectorStack { - template - friend class TempVectorHolder; - - public: - Status Init(MemoryPool* pool, int64_t size) { - num_vectors_ = 0; - top_ = 0; - buffer_size_ = PaddedAllocationSize(size) + kPadding + 2 * sizeof(uint64_t); - ARROW_ASSIGN_OR_RAISE(auto buffer, AllocateResizableBuffer(size, pool)); - // Ensure later operations don't accidentally read uninitialized memory. - std::memset(buffer->mutable_data(), 0xFF, size); - buffer_ = std::move(buffer); - return Status::OK(); - } - - private: - int64_t PaddedAllocationSize(int64_t num_bytes) { - // Round up allocation size to multiple of 8 bytes - // to avoid returning temp vectors with unaligned address. - // - // Also add padding at the end to facilitate loads and stores - // using SIMD when number of vector elements is not divisible - // by the number of SIMD lanes. - // - return ::arrow::bit_util::RoundUp(num_bytes, sizeof(int64_t)) + kPadding; - } - void alloc(uint32_t num_bytes, uint8_t** data, int* id) { - int64_t old_top = top_; - top_ += PaddedAllocationSize(num_bytes) + 2 * sizeof(uint64_t); - // Stack overflow check - ARROW_DCHECK(top_ <= buffer_size_); - *data = buffer_->mutable_data() + old_top + sizeof(uint64_t); - // We set 8 bytes before the beginning of the allocated range and - // 8 bytes after the end to check for stack overflow (which would - // result in those known bytes being corrupted). - reinterpret_cast(buffer_->mutable_data() + old_top)[0] = kGuard1; - reinterpret_cast(buffer_->mutable_data() + top_)[-1] = kGuard2; - *id = num_vectors_++; - } - void release(int id, uint32_t num_bytes) { - ARROW_DCHECK(num_vectors_ == id + 1); - int64_t size = PaddedAllocationSize(num_bytes) + 2 * sizeof(uint64_t); - ARROW_DCHECK(reinterpret_cast(buffer_->mutable_data() + top_)[-1] == - kGuard2); - ARROW_DCHECK(top_ >= size); - top_ -= size; - ARROW_DCHECK(reinterpret_cast(buffer_->mutable_data() + top_)[0] == - kGuard1); - --num_vectors_; - } - static constexpr uint64_t kGuard1 = 0x3141592653589793ULL; - static constexpr uint64_t kGuard2 = 0x0577215664901532ULL; - static constexpr int64_t kPadding = 64; - int num_vectors_; - int64_t top_; - std::unique_ptr buffer_; - int64_t buffer_size_; -}; +// DIPO +///// Storage used to allocate temporary vectors of a batch size. +///// Temporary vectors should resemble allocating temporary variables on the stack +///// but in the context of vectorized processing where we need to store a vector of +///// temporaries instead of a single value. +//class TempVectorStack { +// template +// friend class TempVectorHolder; +// +// public: +// Status Init(MemoryPool* pool, int64_t size) { +// num_vectors_ = 0; +// top_ = 0; +// buffer_size_ = PaddedAllocationSize(size) + kPadding + 2 * sizeof(uint64_t); +// ARROW_ASSIGN_OR_RAISE(auto buffer, AllocateResizableBuffer(size, pool)); +// // Ensure later operations don't accidentally read uninitialized memory. +// std::memset(buffer->mutable_data(), 0xFF, size); +// buffer_ = std::move(buffer); +// return Status::OK(); +// } +// +// private: +// int64_t PaddedAllocationSize(int64_t num_bytes) { +// // Round up allocation size to multiple of 8 bytes +// // to avoid returning temp vectors with unaligned address. +// // +// // Also add padding at the end to facilitate loads and stores +// // using SIMD when number of vector elements is not divisible +// // by the number of SIMD lanes. +// // +// return ::arrow::bit_util::RoundUp(num_bytes, sizeof(int64_t)) + kPadding; +// } +// void alloc(uint32_t num_bytes, uint8_t** data, int* id) { +// int64_t old_top = top_; +// top_ += PaddedAllocationSize(num_bytes) + 2 * sizeof(uint64_t); +// // Stack overflow check +// ARROW_DCHECK(top_ <= buffer_size_); +// *data = buffer_->mutable_data() + old_top + sizeof(uint64_t); +// // We set 8 bytes before the beginning of the allocated range and +// // 8 bytes after the end to check for stack overflow (which would +// // result in those known bytes being corrupted). +// reinterpret_cast(buffer_->mutable_data() + old_top)[0] = kGuard1; +// reinterpret_cast(buffer_->mutable_data() + top_)[-1] = kGuard2; +// *id = num_vectors_++; +// } +// void release(int id, uint32_t num_bytes) { +// ARROW_DCHECK(num_vectors_ == id + 1); +// int64_t size = PaddedAllocationSize(num_bytes) + 2 * sizeof(uint64_t); +// ARROW_DCHECK(reinterpret_cast(buffer_->mutable_data() + top_)[-1] == +// kGuard2); +// ARROW_DCHECK(top_ >= size); +// top_ -= size; +// ARROW_DCHECK(reinterpret_cast(buffer_->mutable_data() + top_)[0] == +// kGuard1); +// --num_vectors_; +// } +// static constexpr uint64_t kGuard1 = 0x3141592653589793ULL; +// static constexpr uint64_t kGuard2 = 0x0577215664901532ULL; +// static constexpr int64_t kPadding = 64; +// int num_vectors_; +// int64_t top_; +// std::unique_ptr buffer_; +// int64_t buffer_size_; +//}; template class TempVectorHolder { diff --git a/cpp/src/arrow/compute/kernels/hash_aggregate.cc b/cpp/src/arrow/compute/kernels/hash_aggregate.cc index c0459a14859..eecfb054321 100644 --- a/cpp/src/arrow/compute/kernels/hash_aggregate.cc +++ b/cpp/src/arrow/compute/kernels/hash_aggregate.cc @@ -29,7 +29,7 @@ #include "arrow/buffer_builder.h" #include "arrow/compute/api_aggregate.h" #include "arrow/compute/api_vector.h" -#include "arrow/compute/exec/key_hash.h" +#include "arrow/compute/key_hash.h" #include "arrow/compute/exec/key_map.h" #include "arrow/compute/exec/util.h" #include "arrow/compute/exec_internal.h" diff --git a/cpp/src/arrow/compute/exec/key_hash.cc b/cpp/src/arrow/compute/key_hash.cc similarity index 99% rename from cpp/src/arrow/compute/exec/key_hash.cc rename to cpp/src/arrow/compute/key_hash.cc index 5ff0d4cf1e5..993d68b1fa3 100644 --- a/cpp/src/arrow/compute/exec/key_hash.cc +++ b/cpp/src/arrow/compute/key_hash.cc @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. -#include "arrow/compute/exec/key_hash.h" +#include "key_hash.h" #include diff --git a/cpp/src/arrow/compute/exec/key_hash.h b/cpp/src/arrow/compute/key_hash.h similarity index 100% rename from cpp/src/arrow/compute/exec/key_hash.h rename to cpp/src/arrow/compute/key_hash.h diff --git a/cpp/src/arrow/compute/exec/key_hash_avx2.cc b/cpp/src/arrow/compute/key_hash_avx2.cc similarity index 99% rename from cpp/src/arrow/compute/exec/key_hash_avx2.cc rename to cpp/src/arrow/compute/key_hash_avx2.cc index d36df9fc9f3..f30c3460bda 100644 --- a/cpp/src/arrow/compute/exec/key_hash_avx2.cc +++ b/cpp/src/arrow/compute/key_hash_avx2.cc @@ -17,7 +17,7 @@ #include -#include "arrow/compute/exec/key_hash.h" +#include "arrow/compute/key_hash.h" #include "arrow/util/bit_util.h" namespace arrow { diff --git a/cpp/src/arrow/compute/exec/key_hash_test.cc b/cpp/src/arrow/compute/key_hash_test.cc similarity index 99% rename from cpp/src/arrow/compute/exec/key_hash_test.cc rename to cpp/src/arrow/compute/key_hash_test.cc index 47f0f34560e..1ee9eb25312 100644 --- a/cpp/src/arrow/compute/exec/key_hash_test.cc +++ b/cpp/src/arrow/compute/key_hash_test.cc @@ -21,7 +21,7 @@ #include #include #include "arrow/array/builder_binary.h" -#include "arrow/compute/exec/key_hash.h" +#include "arrow/compute/key_hash.h" #include "arrow/compute/exec/test_util.h" #include "arrow/compute/exec/util.h" #include "arrow/util/cpu_info.h" diff --git a/cpp/src/arrow/compute/light_array.h b/cpp/src/arrow/compute/light_array.h index 389b63cca41..33b48161733 100644 --- a/cpp/src/arrow/compute/light_array.h +++ b/cpp/src/arrow/compute/light_array.h @@ -21,7 +21,8 @@ #include "arrow/array.h" #include "arrow/compute/exec.h" -#include "arrow/compute/exec/util.h" +// DIPO #include "arrow/compute/exec/util.h" +#include "arrow/compute/util.h" #include "arrow/type.h" #include "arrow/util/cpu_info.h" #include "arrow/util/logging.h" diff --git a/cpp/src/arrow/compute/light_array_test.cc b/cpp/src/arrow/compute/light_array_test.cc index dcc7841a091..015d407e810 100644 --- a/cpp/src/arrow/compute/light_array_test.cc +++ b/cpp/src/arrow/compute/light_array_test.cc @@ -20,7 +20,7 @@ #include #include -#include "arrow/compute/exec/test_util.h" +// DIPO #include "arrow/compute/exec/test_util.h" #include "arrow/testing/generator.h" #include "arrow/testing/gtest_util.h" #include "arrow/type.h" @@ -216,26 +216,27 @@ TEST(KeyColumnArray, SliceBool) { } } -TEST(KeyColumnArray, FromExecBatch) { - ExecBatch batch = - ExecBatchFromJSON({int64(), boolean()}, "[[1, true], [2, false], [null, null]]"); - std::vector arrays; - ASSERT_OK(ColumnArraysFromExecBatch(batch, &arrays)); - - ASSERT_EQ(2, arrays.size()); - ASSERT_EQ(8, arrays[0].metadata().fixed_length); - ASSERT_EQ(0, arrays[1].metadata().fixed_length); - ASSERT_EQ(3, arrays[0].length()); - ASSERT_EQ(3, arrays[1].length()); - - ASSERT_OK(ColumnArraysFromExecBatch(batch, 1, 1, &arrays)); - - ASSERT_EQ(2, arrays.size()); - ASSERT_EQ(8, arrays[0].metadata().fixed_length); - ASSERT_EQ(0, arrays[1].metadata().fixed_length); - ASSERT_EQ(1, arrays[0].length()); - ASSERT_EQ(1, arrays[1].length()); -} +// DIPO +//TEST(KeyColumnArray, FromExecBatch) { +// ExecBatch batch = +// ExecBatchFromJSON({int64(), boolean()}, "[[1, true], [2, false], [null, null]]"); +// std::vector arrays; +// ASSERT_OK(ColumnArraysFromExecBatch(batch, &arrays)); +// +// ASSERT_EQ(2, arrays.size()); +// ASSERT_EQ(8, arrays[0].metadata().fixed_length); +// ASSERT_EQ(0, arrays[1].metadata().fixed_length); +// ASSERT_EQ(3, arrays[0].length()); +// ASSERT_EQ(3, arrays[1].length()); +// +// ASSERT_OK(ColumnArraysFromExecBatch(batch, 1, 1, &arrays)); +// +// ASSERT_EQ(2, arrays.size()); +// ASSERT_EQ(8, arrays[0].metadata().fixed_length); +// ASSERT_EQ(0, arrays[1].metadata().fixed_length); +// ASSERT_EQ(1, arrays[0].length()); +// ASSERT_EQ(1, arrays[1].length()); +//} TEST(ResizableArrayData, Basic) { std::unique_ptr pool = MemoryPool::CreateDefault(); @@ -314,126 +315,127 @@ TEST(ResizableArrayData, Binary) { ASSERT_EQ(0, pool->bytes_allocated()); } } - -TEST(ExecBatchBuilder, AppendBatches) { - std::unique_ptr owned_pool = MemoryPool::CreateDefault(); - MemoryPool* pool = owned_pool.get(); - ExecBatch batch_one = - ExecBatchFromJSON({int64(), boolean()}, "[[1, true], [2, false], [null, null]]"); - ExecBatch batch_two = - ExecBatchFromJSON({int64(), boolean()}, "[[null, true], [5, true], [6, false]]"); - ExecBatch combined = ExecBatchFromJSON( - {int64(), boolean()}, - "[[1, true], [2, false], [null, null], [null, true], [5, true], [6, false]]"); - { - ExecBatchBuilder builder; - uint16_t row_ids[3] = {0, 1, 2}; - ASSERT_OK(builder.AppendSelected(pool, batch_one, 3, row_ids, /*num_cols=*/2)); - ASSERT_OK(builder.AppendSelected(pool, batch_two, 3, row_ids, /*num_cols=*/2)); - ExecBatch built = builder.Flush(); - ASSERT_EQ(combined, built); - ASSERT_NE(0, pool->bytes_allocated()); - } - ASSERT_EQ(0, pool->bytes_allocated()); -} - -TEST(ExecBatchBuilder, AppendBatchesSomeRows) { - std::unique_ptr owned_pool = MemoryPool::CreateDefault(); - MemoryPool* pool = owned_pool.get(); - ExecBatch batch_one = - ExecBatchFromJSON({int64(), boolean()}, "[[1, true], [2, false], [null, null]]"); - ExecBatch batch_two = - ExecBatchFromJSON({int64(), boolean()}, "[[null, true], [5, true], [6, false]]"); - ExecBatch combined = ExecBatchFromJSON( - {int64(), boolean()}, "[[1, true], [2, false], [null, true], [5, true]]"); - { - ExecBatchBuilder builder; - uint16_t row_ids[2] = {0, 1}; - ASSERT_OK(builder.AppendSelected(pool, batch_one, 2, row_ids, /*num_cols=*/2)); - ASSERT_OK(builder.AppendSelected(pool, batch_two, 2, row_ids, /*num_cols=*/2)); - ExecBatch built = builder.Flush(); - ASSERT_EQ(combined, built); - ASSERT_NE(0, pool->bytes_allocated()); - } - ASSERT_EQ(0, pool->bytes_allocated()); -} - -TEST(ExecBatchBuilder, AppendBatchesSomeCols) { - std::unique_ptr owned_pool = MemoryPool::CreateDefault(); - MemoryPool* pool = owned_pool.get(); - ExecBatch batch_one = - ExecBatchFromJSON({int64(), boolean()}, "[[1, true], [2, false], [null, null]]"); - ExecBatch batch_two = - ExecBatchFromJSON({int64(), boolean()}, "[[null, true], [5, true], [6, false]]"); - ExecBatch first_col_only = - ExecBatchFromJSON({int64()}, "[[1], [2], [null], [null], [5], [6]]"); - ExecBatch last_col_only = ExecBatchFromJSON( - {boolean()}, "[[true], [false], [null], [true], [true], [false]]"); - { - ExecBatchBuilder builder; - uint16_t row_ids[3] = {0, 1, 2}; - int first_col_ids[1] = {0}; - ASSERT_OK(builder.AppendSelected(pool, batch_one, 3, row_ids, /*num_cols=*/1, - first_col_ids)); - ASSERT_OK(builder.AppendSelected(pool, batch_two, 3, row_ids, /*num_cols=*/1, - first_col_ids)); - ExecBatch built = builder.Flush(); - ASSERT_EQ(first_col_only, built); - ASSERT_NE(0, pool->bytes_allocated()); - } - { - ExecBatchBuilder builder; - uint16_t row_ids[3] = {0, 1, 2}; - // If we don't specify col_ids and num_cols is 1 it is implicitly the first col - ASSERT_OK(builder.AppendSelected(pool, batch_one, 3, row_ids, /*num_cols=*/1)); - ASSERT_OK(builder.AppendSelected(pool, batch_two, 3, row_ids, /*num_cols=*/1)); - ExecBatch built = builder.Flush(); - ASSERT_EQ(first_col_only, built); - ASSERT_NE(0, pool->bytes_allocated()); - } - { - ExecBatchBuilder builder; - uint16_t row_ids[3] = {0, 1, 2}; - int last_col_ids[1] = {1}; - ASSERT_OK(builder.AppendSelected(pool, batch_one, 3, row_ids, /*num_cols=*/1, - last_col_ids)); - ASSERT_OK(builder.AppendSelected(pool, batch_two, 3, row_ids, /*num_cols=*/1, - last_col_ids)); - ExecBatch built = builder.Flush(); - ASSERT_EQ(last_col_only, built); - ASSERT_NE(0, pool->bytes_allocated()); - } - ASSERT_EQ(0, pool->bytes_allocated()); -} - -TEST(ExecBatchBuilder, AppendNulls) { - std::unique_ptr owned_pool = MemoryPool::CreateDefault(); - MemoryPool* pool = owned_pool.get(); - ExecBatch batch_one = - ExecBatchFromJSON({int64(), boolean()}, "[[1, true], [2, false], [null, null]]"); - ExecBatch combined = ExecBatchFromJSON( - {int64(), boolean()}, - "[[1, true], [2, false], [null, null], [null, null], [null, null]]"); - ExecBatch just_nulls = - ExecBatchFromJSON({int64(), boolean()}, "[[null, null], [null, null]]"); - { - ExecBatchBuilder builder; - uint16_t row_ids[3] = {0, 1, 2}; - ASSERT_OK(builder.AppendSelected(pool, batch_one, 3, row_ids, /*num_cols=*/2)); - ASSERT_OK(builder.AppendNulls(pool, {int64(), boolean()}, 2)); - ExecBatch built = builder.Flush(); - ASSERT_EQ(combined, built); - ASSERT_NE(0, pool->bytes_allocated()); - } - { - ExecBatchBuilder builder; - ASSERT_OK(builder.AppendNulls(pool, {int64(), boolean()}, 2)); - ExecBatch built = builder.Flush(); - ASSERT_EQ(just_nulls, built); - ASSERT_NE(0, pool->bytes_allocated()); - } - ASSERT_EQ(0, pool->bytes_allocated()); -} +// DIPO +//TEST(ExecBatchBuilder, AppendBatches) { +// std::unique_ptr owned_pool = MemoryPool::CreateDefault(); +// MemoryPool* pool = owned_pool.get(); +// ExecBatch batch_one = +// ExecBatchFromJSON({int64(), boolean()}, "[[1, true], [2, false], [null, null]]"); +// ExecBatch batch_two = +// ExecBatchFromJSON({int64(), boolean()}, "[[null, true], [5, true], [6, false]]"); +// ExecBatch combined = ExecBatchFromJSON( +// {int64(), boolean()}, +// "[[1, true], [2, false], [null, null], [null, true], [5, true], [6, false]]"); +// { +// ExecBatchBuilder builder; +// uint16_t row_ids[3] = {0, 1, 2}; +// ASSERT_OK(builder.AppendSelected(pool, batch_one, 3, row_ids, /*num_cols=*/2)); +// ASSERT_OK(builder.AppendSelected(pool, batch_two, 3, row_ids, /*num_cols=*/2)); +// ExecBatch built = builder.Flush(); +// ASSERT_EQ(combined, built); +// ASSERT_NE(0, pool->bytes_allocated()); +// } +// ASSERT_EQ(0, pool->bytes_allocated()); +//} + +// DIPO +//TEST(ExecBatchBuilder, AppendBatchesSomeRows) { +// std::unique_ptr owned_pool = MemoryPool::CreateDefault(); +// MemoryPool* pool = owned_pool.get(); +// ExecBatch batch_one = +// ExecBatchFromJSON({int64(), boolean()}, "[[1, true], [2, false], [null, null]]"); +// ExecBatch batch_two = +// ExecBatchFromJSON({int64(), boolean()}, "[[null, true], [5, true], [6, false]]"); +// ExecBatch combined = ExecBatchFromJSON( +// {int64(), boolean()}, "[[1, true], [2, false], [null, true], [5, true]]"); +// { +// ExecBatchBuilder builder; +// uint16_t row_ids[2] = {0, 1}; +// ASSERT_OK(builder.AppendSelected(pool, batch_one, 2, row_ids, /*num_cols=*/2)); +// ASSERT_OK(builder.AppendSelected(pool, batch_two, 2, row_ids, /*num_cols=*/2)); +// ExecBatch built = builder.Flush(); +// ASSERT_EQ(combined, built); +// ASSERT_NE(0, pool->bytes_allocated()); +// } +// ASSERT_EQ(0, pool->bytes_allocated()); +//} +// DIPO +//TEST(ExecBatchBuilder, AppendBatchesSomeCols) { +// std::unique_ptr owned_pool = MemoryPool::CreateDefault(); +// MemoryPool* pool = owned_pool.get(); +// ExecBatch batch_one = +// ExecBatchFromJSON({int64(), boolean()}, "[[1, true], [2, false], [null, null]]"); +// ExecBatch batch_two = +// ExecBatchFromJSON({int64(), boolean()}, "[[null, true], [5, true], [6, false]]"); +// ExecBatch first_col_only = +// ExecBatchFromJSON({int64()}, "[[1], [2], [null], [null], [5], [6]]"); +// ExecBatch last_col_only = ExecBatchFromJSON( +// {boolean()}, "[[true], [false], [null], [true], [true], [false]]"); +// { +// ExecBatchBuilder builder; +// uint16_t row_ids[3] = {0, 1, 2}; +// int first_col_ids[1] = {0}; +// ASSERT_OK(builder.AppendSelected(pool, batch_one, 3, row_ids, /*num_cols=*/1, +// first_col_ids)); +// ASSERT_OK(builder.AppendSelected(pool, batch_two, 3, row_ids, /*num_cols=*/1, +// first_col_ids)); +// ExecBatch built = builder.Flush(); +// ASSERT_EQ(first_col_only, built); +// ASSERT_NE(0, pool->bytes_allocated()); +// } +// { +// ExecBatchBuilder builder; +// uint16_t row_ids[3] = {0, 1, 2}; +// // If we don't specify col_ids and num_cols is 1 it is implicitly the first col +// ASSERT_OK(builder.AppendSelected(pool, batch_one, 3, row_ids, /*num_cols=*/1)); +// ASSERT_OK(builder.AppendSelected(pool, batch_two, 3, row_ids, /*num_cols=*/1)); +// ExecBatch built = builder.Flush(); +// ASSERT_EQ(first_col_only, built); +// ASSERT_NE(0, pool->bytes_allocated()); +// } +// { +// ExecBatchBuilder builder; +// uint16_t row_ids[3] = {0, 1, 2}; +// int last_col_ids[1] = {1}; +// ASSERT_OK(builder.AppendSelected(pool, batch_one, 3, row_ids, /*num_cols=*/1, +// last_col_ids)); +// ASSERT_OK(builder.AppendSelected(pool, batch_two, 3, row_ids, /*num_cols=*/1, +// last_col_ids)); +// ExecBatch built = builder.Flush(); +// ASSERT_EQ(last_col_only, built); +// ASSERT_NE(0, pool->bytes_allocated()); +// } +// ASSERT_EQ(0, pool->bytes_allocated()); +//} +// +//TEST(ExecBatchBuilder, AppendNulls) { +// std::unique_ptr owned_pool = MemoryPool::CreateDefault(); +// MemoryPool* pool = owned_pool.get(); +// ExecBatch batch_one = +// ExecBatchFromJSON({int64(), boolean()}, "[[1, true], [2, false], [null, null]]"); +// ExecBatch combined = ExecBatchFromJSON( +// {int64(), boolean()}, +// "[[1, true], [2, false], [null, null], [null, null], [null, null]]"); +// ExecBatch just_nulls = +// ExecBatchFromJSON({int64(), boolean()}, "[[null, null], [null, null]]"); +// { +// ExecBatchBuilder builder; +// uint16_t row_ids[3] = {0, 1, 2}; +// ASSERT_OK(builder.AppendSelected(pool, batch_one, 3, row_ids, /*num_cols=*/2)); +// ASSERT_OK(builder.AppendNulls(pool, {int64(), boolean()}, 2)); +// ExecBatch built = builder.Flush(); +// ASSERT_EQ(combined, built); +// ASSERT_NE(0, pool->bytes_allocated()); +// } +// { +// ExecBatchBuilder builder; +// ASSERT_OK(builder.AppendNulls(pool, {int64(), boolean()}, 2)); +// ExecBatch built = builder.Flush(); +// ASSERT_EQ(just_nulls, built); +// ASSERT_NE(0, pool->bytes_allocated()); +// } +// ASSERT_EQ(0, pool->bytes_allocated()); +//} TEST(ExecBatchBuilder, AppendNullsBeyondLimit) { std::unique_ptr owned_pool = MemoryPool::CreateDefault(); diff --git a/cpp/src/arrow/compute/row/grouper.cc b/cpp/src/arrow/compute/row/grouper.cc index d003137d3e5..579c448319a 100644 --- a/cpp/src/arrow/compute/row/grouper.cc +++ b/cpp/src/arrow/compute/row/grouper.cc @@ -20,7 +20,7 @@ #include #include -#include "arrow/compute/exec/key_hash.h" +#include "arrow/compute/key_hash.h" #include "arrow/compute/exec/key_map.h" #include "arrow/compute/exec/options.h" #include "arrow/compute/exec_internal.h" diff --git a/cpp/src/arrow/compute/util.h b/cpp/src/arrow/compute/util.h new file mode 100644 index 00000000000..42477857a18 --- /dev/null +++ b/cpp/src/arrow/compute/util.h @@ -0,0 +1,131 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include +#include +#include +#include +#include + +#include "arrow/buffer.h" +#include "arrow/compute/type_fwd.h" +#include "arrow/memory_pool.h" +#include "arrow/result.h" +#include "arrow/status.h" +#include "arrow/util/bit_util.h" +#include "arrow/util/cpu_info.h" +#include "arrow/util/logging.h" +#include "arrow/util/mutex.h" +#include "arrow/util/thread_pool.h" +#include "arrow/util/type_fwd.h" + +#if defined(__clang__) || defined(__GNUC__) +#define BYTESWAP(x) __builtin_bswap64(x) +#define ROTL(x, n) (((x) << (n)) | ((x) >> ((-n) & 31))) +#define ROTL64(x, n) (((x) << (n)) | ((x) >> ((-n) & 63))) +#define PREFETCH(ptr) __builtin_prefetch((ptr), 0 /* rw==read */, 3 /* locality */) +#elif defined(_MSC_VER) +#include +#define BYTESWAP(x) _byteswap_uint64(x) +#define ROTL(x, n) _rotl((x), (n)) +#define ROTL64(x, n) _rotl64((x), (n)) +#if defined(_M_X64) || defined(_M_I86) +#include // https://msdn.microsoft.com/fr-fr/library/84szxsww(v=vs.90).aspx +#define PREFETCH(ptr) _mm_prefetch((const char*)(ptr), _MM_HINT_T0) +#else +#define PREFETCH(ptr) (void)(ptr) /* disabled */ +#endif +#endif + +namespace arrow { +namespace util { + +template +inline void CheckAlignment(const void *ptr) { + ARROW_DCHECK(reinterpret_cast(ptr) % sizeof(T) == 0); +} + +/// Storage used to allocate temporary vectors of a batch size. +/// Temporary vectors should resemble allocating temporary variables on the stack +/// but in the context of vectorized processing where we need to store a vector of +/// temporaries instead of a single value. +class TempVectorStack { + template + friend + class TempVectorHolder; + + public: + Status Init(MemoryPool *pool, int64_t size) { + num_vectors_ = 0; + top_ = 0; + buffer_size_ = PaddedAllocationSize(size) + kPadding + 2 * sizeof(uint64_t); + ARROW_ASSIGN_OR_RAISE(auto buffer, AllocateResizableBuffer(size, pool)); + // Ensure later operations don't accidentally read uninitialized memory. + std::memset(buffer->mutable_data(), 0xFF, size); + buffer_ = std::move(buffer); + return Status::OK(); + } + + private: + int64_t PaddedAllocationSize(int64_t num_bytes) { + // Round up allocation size to multiple of 8 bytes + // to avoid returning temp vectors with unaligned address. + // + // Also add padding at the end to facilitate loads and stores + // using SIMD when number of vector elements is not divisible + // by the number of SIMD lanes. + // + return ::arrow::bit_util::RoundUp(num_bytes, sizeof(int64_t)) + kPadding; + } + void alloc(uint32_t num_bytes, uint8_t **data, int *id) { + int64_t old_top = top_; + top_ += PaddedAllocationSize(num_bytes) + 2 * sizeof(uint64_t); + // Stack overflow check + ARROW_DCHECK(top_ <= buffer_size_); + *data = buffer_->mutable_data() + old_top + sizeof(uint64_t); + // We set 8 bytes before the beginning of the allocated range and + // 8 bytes after the end to check for stack overflow (which would + // result in those known bytes being corrupted). + reinterpret_cast(buffer_->mutable_data() + old_top)[0] = kGuard1; + reinterpret_cast(buffer_->mutable_data() + top_)[-1] = kGuard2; + *id = num_vectors_++; + } + void release(int id, uint32_t num_bytes) { + ARROW_DCHECK(num_vectors_ == id + 1); + int64_t size = PaddedAllocationSize(num_bytes) + 2 * sizeof(uint64_t); + ARROW_DCHECK(reinterpret_cast(buffer_->mutable_data() + top_)[-1] == + kGuard2); + ARROW_DCHECK(top_ >= size); + top_ -= size; + ARROW_DCHECK(reinterpret_cast(buffer_->mutable_data() + top_)[0] == + kGuard1); + --num_vectors_; + } + static constexpr uint64_t kGuard1 = 0x3141592653589793ULL; + static constexpr uint64_t kGuard2 = 0x0577215664901532ULL; + static constexpr int64_t kPadding = 64; + int num_vectors_; + int64_t top_; + std::unique_ptr buffer_; + int64_t buffer_size_; +}; + +} +} \ No newline at end of file From c985f97ebfc82f5d92131a1acc1d9e87efe7f4c9 Mon Sep 17 00:00:00 2001 From: Davide Pasetto Date: Wed, 8 Mar 2023 07:53:16 -0500 Subject: [PATCH 02/11] Moved code from compute/exec to compute to reduce entanglement with acero --- cpp/src/arrow/CMakeLists.txt | 9 +- .../arrow/compute/exec/swiss_join_internal.h | 2 +- cpp/src/arrow/compute/exec/util.cc | 606 +++++++++--------- cpp/src/arrow/compute/exec/util.h | 202 +++--- .../arrow/compute/kernels/hash_aggregate.cc | 2 +- cpp/src/arrow/compute/{exec => }/key_map.cc | 50 +- cpp/src/arrow/compute/{exec => }/key_map.h | 50 +- .../arrow/compute/{exec => }/key_map_avx2.cc | 2 +- cpp/src/arrow/compute/row/compare_internal.cc | 4 +- cpp/src/arrow/compute/row/compare_internal.h | 3 +- cpp/src/arrow/compute/row/encode_internal.cc | 3 +- cpp/src/arrow/compute/row/encode_internal.h | 6 +- cpp/src/arrow/compute/row/grouper.cc | 7 +- cpp/src/arrow/compute/row/grouper.h | 5 +- cpp/src/arrow/compute/row/row_internal.cc | 3 +- cpp/src/arrow/compute/util.cc | 363 +++++++++++ cpp/src/arrow/compute/util.h | 133 +++- cpp/src/arrow/compute/util_internal.h | 30 + 18 files changed, 981 insertions(+), 499 deletions(-) rename cpp/src/arrow/compute/{exec => }/key_map.cc (94%) rename cpp/src/arrow/compute/{exec => }/key_map.h (80%) rename cpp/src/arrow/compute/{exec => }/key_map_avx2.cc (99%) create mode 100644 cpp/src/arrow/compute/util.cc create mode 100644 cpp/src/arrow/compute/util_internal.h diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt index bf406ca29fb..dc41f238d2c 100644 --- a/cpp/src/arrow/CMakeLists.txt +++ b/cpp/src/arrow/CMakeLists.txt @@ -401,8 +401,6 @@ list(APPEND compute/exec/hash_join.cc compute/exec/hash_join_dict.cc compute/exec/hash_join_node.cc - compute/key_hash.cc - compute/exec/key_map.cc compute/exec/map_node.cc compute/exec/options.cc compute/exec/order_by_impl.cc @@ -420,6 +418,8 @@ list(APPEND compute/function.cc compute/function_internal.cc compute/kernel.cc + compute/key_hash.cc + compute/key_map.cc compute/light_array.cc compute/ordering.cc compute/registry.cc @@ -439,11 +439,12 @@ list(APPEND compute/row/encode_internal.cc compute/row/compare_internal.cc compute/row/grouper.cc - compute/row/row_internal.cc) + compute/row/row_internal.cc + compute/util.cc) append_avx2_src(compute/exec/bloom_filter_avx2.cc) append_avx2_src(compute/key_hash_avx2.cc) -append_avx2_src(compute/exec/key_map_avx2.cc) +append_avx2_src(compute/key_map_avx2.cc) append_avx2_src(compute/exec/swiss_join_avx2.cc) append_avx2_src(compute/exec/util_avx2.cc) append_avx2_src(compute/row/compare_internal_avx2.cc) diff --git a/cpp/src/arrow/compute/exec/swiss_join_internal.h b/cpp/src/arrow/compute/exec/swiss_join_internal.h index 355aff70944..4c765874bea 100644 --- a/cpp/src/arrow/compute/exec/swiss_join_internal.h +++ b/cpp/src/arrow/compute/exec/swiss_join_internal.h @@ -18,7 +18,7 @@ #pragma once #include -#include "arrow/compute/exec/key_map.h" +#include "arrow/compute/key_map.h" #include "arrow/compute/exec/options.h" #include "arrow/compute/exec/partition_util.h" #include "arrow/compute/exec/schema_util.h" diff --git a/cpp/src/arrow/compute/exec/util.cc b/cpp/src/arrow/compute/exec/util.cc index 752f8cac764..98c193a8920 100644 --- a/cpp/src/arrow/compute/exec/util.cc +++ b/cpp/src/arrow/compute/exec/util.cc @@ -26,311 +26,311 @@ namespace arrow { -using bit_util::CountTrailingZeros; +//using bit_util::CountTrailingZeros; namespace util { - -inline uint64_t bit_util::SafeLoadUpTo8Bytes(const uint8_t* bytes, int num_bytes) { - // This will not be correct on big-endian architectures. -#if !ARROW_LITTLE_ENDIAN - ARROW_DCHECK(false); -#endif - ARROW_DCHECK(num_bytes >= 0 && num_bytes <= 8); - if (num_bytes == 8) { - return util::SafeLoad(reinterpret_cast(bytes)); - } else { - uint64_t word = 0; - for (int i = 0; i < num_bytes; ++i) { - word |= static_cast(bytes[i]) << (8 * i); - } - return word; - } -} - -inline void bit_util::SafeStoreUpTo8Bytes(uint8_t* bytes, int num_bytes, uint64_t value) { - // This will not be correct on big-endian architectures. -#if !ARROW_LITTLE_ENDIAN - ARROW_DCHECK(false); -#endif - ARROW_DCHECK(num_bytes >= 0 && num_bytes <= 8); - if (num_bytes == 8) { - util::SafeStore(reinterpret_cast(bytes), value); - } else { - for (int i = 0; i < num_bytes; ++i) { - bytes[i] = static_cast(value >> (8 * i)); - } - } -} - -inline void bit_util::bits_to_indexes_helper(uint64_t word, uint16_t base_index, - int* num_indexes, uint16_t* indexes) { - int n = *num_indexes; - while (word) { - indexes[n++] = base_index + static_cast(CountTrailingZeros(word)); - word &= word - 1; - } - *num_indexes = n; -} - -inline void bit_util::bits_filter_indexes_helper(uint64_t word, - const uint16_t* input_indexes, - int* num_indexes, uint16_t* indexes) { - int n = *num_indexes; - while (word) { - indexes[n++] = input_indexes[CountTrailingZeros(word)]; - word &= word - 1; - } - *num_indexes = n; -} - -template -void bit_util::bits_to_indexes_internal(int64_t hardware_flags, const int num_bits, - const uint8_t* bits, - const uint16_t* input_indexes, int* num_indexes, - uint16_t* indexes, uint16_t base_index) { - // 64 bits at a time - constexpr int unroll = 64; - int tail = num_bits % unroll; -#if defined(ARROW_HAVE_AVX2) - if (hardware_flags & arrow::internal::CpuInfo::AVX2) { - if (filter_input_indexes) { - bits_filter_indexes_avx2(bit_to_search, num_bits - tail, bits, input_indexes, - num_indexes, indexes); - } else { - bits_to_indexes_avx2(bit_to_search, num_bits - tail, bits, num_indexes, indexes, - base_index); - } - } else { -#endif - *num_indexes = 0; - for (int i = 0; i < num_bits / unroll; ++i) { - uint64_t word = util::SafeLoad(&reinterpret_cast(bits)[i]); - if (bit_to_search == 0) { - word = ~word; - } - if (filter_input_indexes) { - bits_filter_indexes_helper(word, input_indexes + i * 64, num_indexes, indexes); - } else { - bits_to_indexes_helper(word, i * 64 + base_index, num_indexes, indexes); - } - } -#if defined(ARROW_HAVE_AVX2) - } -#endif - // Optionally process the last partial word with masking out bits outside range - if (tail) { - const uint8_t* bits_tail = bits + (num_bits - tail) / 8; - uint64_t word = SafeLoadUpTo8Bytes(bits_tail, (tail + 7) / 8); - if (bit_to_search == 0) { - word = ~word; - } - word &= ~0ULL >> (64 - tail); - if (filter_input_indexes) { - bits_filter_indexes_helper(word, input_indexes + num_bits - tail, num_indexes, - indexes); - } else { - bits_to_indexes_helper(word, num_bits - tail + base_index, num_indexes, indexes); - } - } -} - -void bit_util::bits_to_indexes(int bit_to_search, int64_t hardware_flags, int num_bits, - const uint8_t* bits, int* num_indexes, uint16_t* indexes, - int bit_offset) { - bits += bit_offset / 8; - bit_offset %= 8; - *num_indexes = 0; - uint16_t base_index = 0; - if (bit_offset != 0) { - uint64_t bits_head = bits[0] >> bit_offset; - int bits_in_first_byte = std::min(num_bits, 8 - bit_offset); - bits_to_indexes(bit_to_search, hardware_flags, bits_in_first_byte, - reinterpret_cast(&bits_head), num_indexes, indexes); - if (num_bits <= bits_in_first_byte) { - return; - } - num_bits -= bits_in_first_byte; - indexes += *num_indexes; - bits += 1; - base_index = bits_in_first_byte; - } - - int num_indexes_new = 0; - if (bit_to_search == 0) { - bits_to_indexes_internal<0, false>(hardware_flags, num_bits, bits, nullptr, - &num_indexes_new, indexes, base_index); - } else { - ARROW_DCHECK(bit_to_search == 1); - bits_to_indexes_internal<1, false>(hardware_flags, num_bits, bits, nullptr, - &num_indexes_new, indexes, base_index); - } - *num_indexes += num_indexes_new; -} - -void bit_util::bits_filter_indexes(int bit_to_search, int64_t hardware_flags, - const int num_bits, const uint8_t* bits, - const uint16_t* input_indexes, int* num_indexes, - uint16_t* indexes, int bit_offset) { - bits += bit_offset / 8; - bit_offset %= 8; - if (bit_offset != 0) { - int num_indexes_head = 0; - uint64_t bits_head = bits[0] >> bit_offset; - int bits_in_first_byte = std::min(num_bits, 8 - bit_offset); - bits_filter_indexes(bit_to_search, hardware_flags, bits_in_first_byte, - reinterpret_cast(&bits_head), input_indexes, - &num_indexes_head, indexes); - int num_indexes_tail = 0; - if (num_bits > bits_in_first_byte) { - bits_filter_indexes(bit_to_search, hardware_flags, num_bits - bits_in_first_byte, - bits + 1, input_indexes + bits_in_first_byte, &num_indexes_tail, - indexes + num_indexes_head); - } - *num_indexes = num_indexes_head + num_indexes_tail; - return; - } - - if (bit_to_search == 0) { - bits_to_indexes_internal<0, true>(hardware_flags, num_bits, bits, input_indexes, - num_indexes, indexes); - } else { - ARROW_DCHECK(bit_to_search == 1); - bits_to_indexes_internal<1, true>(hardware_flags, num_bits, bits, input_indexes, - num_indexes, indexes); - } -} - -void bit_util::bits_split_indexes(int64_t hardware_flags, const int num_bits, - const uint8_t* bits, int* num_indexes_bit0, - uint16_t* indexes_bit0, uint16_t* indexes_bit1, - int bit_offset) { - bits_to_indexes(0, hardware_flags, num_bits, bits, num_indexes_bit0, indexes_bit0, - bit_offset); - int num_indexes_bit1; - bits_to_indexes(1, hardware_flags, num_bits, bits, &num_indexes_bit1, indexes_bit1, - bit_offset); -} - -void bit_util::bits_to_bytes(int64_t hardware_flags, const int num_bits, - const uint8_t* bits, uint8_t* bytes, int bit_offset) { - bits += bit_offset / 8; - bit_offset %= 8; - if (bit_offset != 0) { - uint64_t bits_head = bits[0] >> bit_offset; - int bits_in_first_byte = std::min(num_bits, 8 - bit_offset); - bits_to_bytes(hardware_flags, bits_in_first_byte, - reinterpret_cast(&bits_head), bytes); - if (num_bits > bits_in_first_byte) { - bits_to_bytes(hardware_flags, num_bits - bits_in_first_byte, bits + 1, - bytes + bits_in_first_byte); - } - return; - } - - int num_processed = 0; -#if defined(ARROW_HAVE_AVX2) - if (hardware_flags & arrow::internal::CpuInfo::AVX2) { - // The function call below processes whole 32 bit chunks together. - num_processed = num_bits - (num_bits % 32); - bits_to_bytes_avx2(num_processed, bits, bytes); - } -#endif - // Processing 8 bits at a time - constexpr int unroll = 8; - for (int i = num_processed / unroll; i < num_bits / unroll; ++i) { - uint8_t bits_next = bits[i]; - // Clear the lowest bit and then make 8 copies of remaining 7 bits, each 7 bits apart - // from the previous. - uint64_t unpacked = static_cast(bits_next & 0xfe) * - ((1ULL << 7) | (1ULL << 14) | (1ULL << 21) | (1ULL << 28) | - (1ULL << 35) | (1ULL << 42) | (1ULL << 49)); - unpacked |= (bits_next & 1); - unpacked &= 0x0101010101010101ULL; - unpacked *= 255; - util::SafeStore(&reinterpret_cast(bytes)[i], unpacked); - } - int tail = num_bits % unroll; - if (tail) { - uint8_t bits_next = bits[(num_bits - tail) / unroll]; - // Clear the lowest bit and then make 8 copies of remaining 7 bits, each 7 bits apart - // from the previous. - uint64_t unpacked = static_cast(bits_next & 0xfe) * - ((1ULL << 7) | (1ULL << 14) | (1ULL << 21) | (1ULL << 28) | - (1ULL << 35) | (1ULL << 42) | (1ULL << 49)); - unpacked |= (bits_next & 1); - unpacked &= 0x0101010101010101ULL; - unpacked *= 255; - SafeStoreUpTo8Bytes(bytes + num_bits - tail, tail, unpacked); - } -} - -void bit_util::bytes_to_bits(int64_t hardware_flags, const int num_bits, - const uint8_t* bytes, uint8_t* bits, int bit_offset) { - bits += bit_offset / 8; - bit_offset %= 8; - if (bit_offset != 0) { - uint64_t bits_head; - int bits_in_first_byte = std::min(num_bits, 8 - bit_offset); - bytes_to_bits(hardware_flags, bits_in_first_byte, bytes, - reinterpret_cast(&bits_head)); - uint8_t mask = (1 << bit_offset) - 1; - *bits = static_cast((*bits & mask) | (bits_head << bit_offset)); - - if (num_bits > bits_in_first_byte) { - bytes_to_bits(hardware_flags, num_bits - bits_in_first_byte, - bytes + bits_in_first_byte, bits + 1); - } - return; - } - - int num_processed = 0; -#if defined(ARROW_HAVE_AVX2) - if (hardware_flags & arrow::internal::CpuInfo::AVX2) { - // The function call below processes whole 32 bit chunks together. - num_processed = num_bits - (num_bits % 32); - bytes_to_bits_avx2(num_processed, bytes, bits); - } -#endif - // Process 8 bits at a time - constexpr int unroll = 8; - for (int i = num_processed / unroll; i < num_bits / unroll; ++i) { - uint64_t bytes_next = util::SafeLoad(&reinterpret_cast(bytes)[i]); - bytes_next &= 0x0101010101010101ULL; - bytes_next |= (bytes_next >> 7); // Pairs of adjacent output bits in individual bytes - bytes_next |= (bytes_next >> 14); // 4 adjacent output bits in individual bytes - bytes_next |= (bytes_next >> 28); // All 8 output bits in the lowest byte - bits[i] = static_cast(bytes_next & 0xff); - } - int tail = num_bits % unroll; - if (tail) { - uint64_t bytes_next = SafeLoadUpTo8Bytes(bytes + num_bits - tail, tail); - bytes_next &= 0x0101010101010101ULL; - bytes_next |= (bytes_next >> 7); // Pairs of adjacent output bits in individual bytes - bytes_next |= (bytes_next >> 14); // 4 adjacent output bits in individual bytes - bytes_next |= (bytes_next >> 28); // All 8 output bits in the lowest byte - bits[num_bits / 8] = static_cast(bytes_next & 0xff); - } -} - -bool bit_util::are_all_bytes_zero(int64_t hardware_flags, const uint8_t* bytes, - uint32_t num_bytes) { -#if defined(ARROW_HAVE_AVX2) - if (hardware_flags & arrow::internal::CpuInfo::AVX2) { - return are_all_bytes_zero_avx2(bytes, num_bytes); - } -#endif - uint64_t result_or = 0; - uint32_t i; - for (i = 0; i < num_bytes / 8; ++i) { - uint64_t x = util::SafeLoad(&reinterpret_cast(bytes)[i]); - result_or |= x; - } - if (num_bytes % 8 > 0) { - uint64_t tail = 0; - result_or |= memcmp(bytes + i * 8, &tail, num_bytes % 8); - } - return result_or == 0; -} +// +//inline uint64_t bit_util::SafeLoadUpTo8Bytes(const uint8_t* bytes, int num_bytes) { +// // This will not be correct on big-endian architectures. +//#if !ARROW_LITTLE_ENDIAN +// ARROW_DCHECK(false); +//#endif +// ARROW_DCHECK(num_bytes >= 0 && num_bytes <= 8); +// if (num_bytes == 8) { +// return util::SafeLoad(reinterpret_cast(bytes)); +// } else { +// uint64_t word = 0; +// for (int i = 0; i < num_bytes; ++i) { +// word |= static_cast(bytes[i]) << (8 * i); +// } +// return word; +// } +//} +// +//inline void bit_util::SafeStoreUpTo8Bytes(uint8_t* bytes, int num_bytes, uint64_t value) { +// // This will not be correct on big-endian architectures. +//#if !ARROW_LITTLE_ENDIAN +// ARROW_DCHECK(false); +//#endif +// ARROW_DCHECK(num_bytes >= 0 && num_bytes <= 8); +// if (num_bytes == 8) { +// util::SafeStore(reinterpret_cast(bytes), value); +// } else { +// for (int i = 0; i < num_bytes; ++i) { +// bytes[i] = static_cast(value >> (8 * i)); +// } +// } +//} +// +//inline void bit_util::bits_to_indexes_helper(uint64_t word, uint16_t base_index, +// int* num_indexes, uint16_t* indexes) { +// int n = *num_indexes; +// while (word) { +// indexes[n++] = base_index + static_cast(CountTrailingZeros(word)); +// word &= word - 1; +// } +// *num_indexes = n; +//} +// +//inline void bit_util::bits_filter_indexes_helper(uint64_t word, +// const uint16_t* input_indexes, +// int* num_indexes, uint16_t* indexes) { +// int n = *num_indexes; +// while (word) { +// indexes[n++] = input_indexes[CountTrailingZeros(word)]; +// word &= word - 1; +// } +// *num_indexes = n; +//} +// +//template +//void bit_util::bits_to_indexes_internal(int64_t hardware_flags, const int num_bits, +// const uint8_t* bits, +// const uint16_t* input_indexes, int* num_indexes, +// uint16_t* indexes, uint16_t base_index) { +// // 64 bits at a time +// constexpr int unroll = 64; +// int tail = num_bits % unroll; +//#if defined(ARROW_HAVE_AVX2) +// if (hardware_flags & arrow::internal::CpuInfo::AVX2) { +// if (filter_input_indexes) { +// bits_filter_indexes_avx2(bit_to_search, num_bits - tail, bits, input_indexes, +// num_indexes, indexes); +// } else { +// bits_to_indexes_avx2(bit_to_search, num_bits - tail, bits, num_indexes, indexes, +// base_index); +// } +// } else { +//#endif +// *num_indexes = 0; +// for (int i = 0; i < num_bits / unroll; ++i) { +// uint64_t word = util::SafeLoad(&reinterpret_cast(bits)[i]); +// if (bit_to_search == 0) { +// word = ~word; +// } +// if (filter_input_indexes) { +// bits_filter_indexes_helper(word, input_indexes + i * 64, num_indexes, indexes); +// } else { +// bits_to_indexes_helper(word, i * 64 + base_index, num_indexes, indexes); +// } +// } +//#if defined(ARROW_HAVE_AVX2) +// } +//#endif +// // Optionally process the last partial word with masking out bits outside range +// if (tail) { +// const uint8_t* bits_tail = bits + (num_bits - tail) / 8; +// uint64_t word = SafeLoadUpTo8Bytes(bits_tail, (tail + 7) / 8); +// if (bit_to_search == 0) { +// word = ~word; +// } +// word &= ~0ULL >> (64 - tail); +// if (filter_input_indexes) { +// bits_filter_indexes_helper(word, input_indexes + num_bits - tail, num_indexes, +// indexes); +// } else { +// bits_to_indexes_helper(word, num_bits - tail + base_index, num_indexes, indexes); +// } +// } +//} +// +//void bit_util::bits_to_indexes(int bit_to_search, int64_t hardware_flags, int num_bits, +// const uint8_t* bits, int* num_indexes, uint16_t* indexes, +// int bit_offset) { +// bits += bit_offset / 8; +// bit_offset %= 8; +// *num_indexes = 0; +// uint16_t base_index = 0; +// if (bit_offset != 0) { +// uint64_t bits_head = bits[0] >> bit_offset; +// int bits_in_first_byte = std::min(num_bits, 8 - bit_offset); +// bits_to_indexes(bit_to_search, hardware_flags, bits_in_first_byte, +// reinterpret_cast(&bits_head), num_indexes, indexes); +// if (num_bits <= bits_in_first_byte) { +// return; +// } +// num_bits -= bits_in_first_byte; +// indexes += *num_indexes; +// bits += 1; +// base_index = bits_in_first_byte; +// } +// +// int num_indexes_new = 0; +// if (bit_to_search == 0) { +// bits_to_indexes_internal<0, false>(hardware_flags, num_bits, bits, nullptr, +// &num_indexes_new, indexes, base_index); +// } else { +// ARROW_DCHECK(bit_to_search == 1); +// bits_to_indexes_internal<1, false>(hardware_flags, num_bits, bits, nullptr, +// &num_indexes_new, indexes, base_index); +// } +// *num_indexes += num_indexes_new; +//} +// +//void bit_util::bits_filter_indexes(int bit_to_search, int64_t hardware_flags, +// const int num_bits, const uint8_t* bits, +// const uint16_t* input_indexes, int* num_indexes, +// uint16_t* indexes, int bit_offset) { +// bits += bit_offset / 8; +// bit_offset %= 8; +// if (bit_offset != 0) { +// int num_indexes_head = 0; +// uint64_t bits_head = bits[0] >> bit_offset; +// int bits_in_first_byte = std::min(num_bits, 8 - bit_offset); +// bits_filter_indexes(bit_to_search, hardware_flags, bits_in_first_byte, +// reinterpret_cast(&bits_head), input_indexes, +// &num_indexes_head, indexes); +// int num_indexes_tail = 0; +// if (num_bits > bits_in_first_byte) { +// bits_filter_indexes(bit_to_search, hardware_flags, num_bits - bits_in_first_byte, +// bits + 1, input_indexes + bits_in_first_byte, &num_indexes_tail, +// indexes + num_indexes_head); +// } +// *num_indexes = num_indexes_head + num_indexes_tail; +// return; +// } +// +// if (bit_to_search == 0) { +// bits_to_indexes_internal<0, true>(hardware_flags, num_bits, bits, input_indexes, +// num_indexes, indexes); +// } else { +// ARROW_DCHECK(bit_to_search == 1); +// bits_to_indexes_internal<1, true>(hardware_flags, num_bits, bits, input_indexes, +// num_indexes, indexes); +// } +//} +// +//void bit_util::bits_split_indexes(int64_t hardware_flags, const int num_bits, +// const uint8_t* bits, int* num_indexes_bit0, +// uint16_t* indexes_bit0, uint16_t* indexes_bit1, +// int bit_offset) { +// bits_to_indexes(0, hardware_flags, num_bits, bits, num_indexes_bit0, indexes_bit0, +// bit_offset); +// int num_indexes_bit1; +// bits_to_indexes(1, hardware_flags, num_bits, bits, &num_indexes_bit1, indexes_bit1, +// bit_offset); +//} +// +//void bit_util::bits_to_bytes(int64_t hardware_flags, const int num_bits, +// const uint8_t* bits, uint8_t* bytes, int bit_offset) { +// bits += bit_offset / 8; +// bit_offset %= 8; +// if (bit_offset != 0) { +// uint64_t bits_head = bits[0] >> bit_offset; +// int bits_in_first_byte = std::min(num_bits, 8 - bit_offset); +// bits_to_bytes(hardware_flags, bits_in_first_byte, +// reinterpret_cast(&bits_head), bytes); +// if (num_bits > bits_in_first_byte) { +// bits_to_bytes(hardware_flags, num_bits - bits_in_first_byte, bits + 1, +// bytes + bits_in_first_byte); +// } +// return; +// } +// +// int num_processed = 0; +//#if defined(ARROW_HAVE_AVX2) +// if (hardware_flags & arrow::internal::CpuInfo::AVX2) { +// // The function call below processes whole 32 bit chunks together. +// num_processed = num_bits - (num_bits % 32); +// bits_to_bytes_avx2(num_processed, bits, bytes); +// } +//#endif +// // Processing 8 bits at a time +// constexpr int unroll = 8; +// for (int i = num_processed / unroll; i < num_bits / unroll; ++i) { +// uint8_t bits_next = bits[i]; +// // Clear the lowest bit and then make 8 copies of remaining 7 bits, each 7 bits apart +// // from the previous. +// uint64_t unpacked = static_cast(bits_next & 0xfe) * +// ((1ULL << 7) | (1ULL << 14) | (1ULL << 21) | (1ULL << 28) | +// (1ULL << 35) | (1ULL << 42) | (1ULL << 49)); +// unpacked |= (bits_next & 1); +// unpacked &= 0x0101010101010101ULL; +// unpacked *= 255; +// util::SafeStore(&reinterpret_cast(bytes)[i], unpacked); +// } +// int tail = num_bits % unroll; +// if (tail) { +// uint8_t bits_next = bits[(num_bits - tail) / unroll]; +// // Clear the lowest bit and then make 8 copies of remaining 7 bits, each 7 bits apart +// // from the previous. +// uint64_t unpacked = static_cast(bits_next & 0xfe) * +// ((1ULL << 7) | (1ULL << 14) | (1ULL << 21) | (1ULL << 28) | +// (1ULL << 35) | (1ULL << 42) | (1ULL << 49)); +// unpacked |= (bits_next & 1); +// unpacked &= 0x0101010101010101ULL; +// unpacked *= 255; +// SafeStoreUpTo8Bytes(bytes + num_bits - tail, tail, unpacked); +// } +//} +// +//void bit_util::bytes_to_bits(int64_t hardware_flags, const int num_bits, +// const uint8_t* bytes, uint8_t* bits, int bit_offset) { +// bits += bit_offset / 8; +// bit_offset %= 8; +// if (bit_offset != 0) { +// uint64_t bits_head; +// int bits_in_first_byte = std::min(num_bits, 8 - bit_offset); +// bytes_to_bits(hardware_flags, bits_in_first_byte, bytes, +// reinterpret_cast(&bits_head)); +// uint8_t mask = (1 << bit_offset) - 1; +// *bits = static_cast((*bits & mask) | (bits_head << bit_offset)); +// +// if (num_bits > bits_in_first_byte) { +// bytes_to_bits(hardware_flags, num_bits - bits_in_first_byte, +// bytes + bits_in_first_byte, bits + 1); +// } +// return; +// } +// +// int num_processed = 0; +//#if defined(ARROW_HAVE_AVX2) +// if (hardware_flags & arrow::internal::CpuInfo::AVX2) { +// // The function call below processes whole 32 bit chunks together. +// num_processed = num_bits - (num_bits % 32); +// bytes_to_bits_avx2(num_processed, bytes, bits); +// } +//#endif +// // Process 8 bits at a time +// constexpr int unroll = 8; +// for (int i = num_processed / unroll; i < num_bits / unroll; ++i) { +// uint64_t bytes_next = util::SafeLoad(&reinterpret_cast(bytes)[i]); +// bytes_next &= 0x0101010101010101ULL; +// bytes_next |= (bytes_next >> 7); // Pairs of adjacent output bits in individual bytes +// bytes_next |= (bytes_next >> 14); // 4 adjacent output bits in individual bytes +// bytes_next |= (bytes_next >> 28); // All 8 output bits in the lowest byte +// bits[i] = static_cast(bytes_next & 0xff); +// } +// int tail = num_bits % unroll; +// if (tail) { +// uint64_t bytes_next = SafeLoadUpTo8Bytes(bytes + num_bits - tail, tail); +// bytes_next &= 0x0101010101010101ULL; +// bytes_next |= (bytes_next >> 7); // Pairs of adjacent output bits in individual bytes +// bytes_next |= (bytes_next >> 14); // 4 adjacent output bits in individual bytes +// bytes_next |= (bytes_next >> 28); // All 8 output bits in the lowest byte +// bits[num_bits / 8] = static_cast(bytes_next & 0xff); +// } +//} +// +//bool bit_util::are_all_bytes_zero(int64_t hardware_flags, const uint8_t* bytes, +// uint32_t num_bytes) { +//#if defined(ARROW_HAVE_AVX2) +// if (hardware_flags & arrow::internal::CpuInfo::AVX2) { +// return are_all_bytes_zero_avx2(bytes, num_bytes); +// } +//#endif +// uint64_t result_or = 0; +// uint32_t i; +// for (i = 0; i < num_bytes / 8; ++i) { +// uint64_t x = util::SafeLoad(&reinterpret_cast(bytes)[i]); +// result_or |= x; +// } +// if (num_bytes % 8 > 0) { +// uint64_t tail = 0; +// result_or |= memcmp(bytes + i * 8, &tail, num_bytes % 8); +// } +// return result_or == 0; +//} } // namespace util diff --git a/cpp/src/arrow/compute/exec/util.h b/cpp/src/arrow/compute/exec/util.h index def9e0c714b..e0ffca29070 100644 --- a/cpp/src/arrow/compute/exec/util.h +++ b/cpp/src/arrow/compute/exec/util.h @@ -67,25 +67,25 @@ namespace util { // ARROW_DCHECK(reinterpret_cast(ptr) % sizeof(T) == 0); //} -// Some platforms typedef int64_t as long int instead of long long int, -// which breaks the _mm256_i64gather_epi64 and _mm256_i32gather_epi64 intrinsics -// which need long long. -// We use the cast to the type below in these intrinsics to make the code -// compile in all cases. +//// Some platforms typedef int64_t as long int instead of long long int, +//// which breaks the _mm256_i64gather_epi64 and _mm256_i32gather_epi64 intrinsics +//// which need long long. +//// We use the cast to the type below in these intrinsics to make the code +//// compile in all cases. +//// +//using int64_for_gather_t = const long long int; // NOLINT runtime-int // -using int64_for_gather_t = const long long int; // NOLINT runtime-int - -// All MiniBatch... classes use TempVectorStack for vector allocations and can -// only work with vectors up to 1024 elements. -// -// They should only be allocated on the stack to guarantee the right sequence -// of allocation and deallocation of vectors from TempVectorStack. -// -class MiniBatch { - public: - static constexpr int kLogMiniBatchLength = 10; - static constexpr int kMiniBatchLength = 1 << kLogMiniBatchLength; -}; +//// All MiniBatch... classes use TempVectorStack for vector allocations and can +//// only work with vectors up to 1024 elements. +//// +//// They should only be allocated on the stack to guarantee the right sequence +//// of allocation and deallocation of vectors from TempVectorStack. +//// +//class MiniBatch { +// public: +// static constexpr int kLogMiniBatchLength = 10; +// static constexpr int kMiniBatchLength = 1 << kLogMiniBatchLength; +//}; // DIPO ///// Storage used to allocate temporary vectors of a batch size. @@ -151,89 +151,89 @@ class MiniBatch { // std::unique_ptr buffer_; // int64_t buffer_size_; //}; - -template -class TempVectorHolder { - friend class TempVectorStack; - - public: - ~TempVectorHolder() { stack_->release(id_, num_elements_ * sizeof(T)); } - T* mutable_data() { return reinterpret_cast(data_); } - TempVectorHolder(TempVectorStack* stack, uint32_t num_elements) { - stack_ = stack; - num_elements_ = num_elements; - stack_->alloc(num_elements * sizeof(T), &data_, &id_); - } - - private: - TempVectorStack* stack_; - uint8_t* data_; - int id_; - uint32_t num_elements_; -}; - -class bit_util { - public: - static void bits_to_indexes(int bit_to_search, int64_t hardware_flags, - const int num_bits, const uint8_t* bits, int* num_indexes, - uint16_t* indexes, int bit_offset = 0); - - static void bits_filter_indexes(int bit_to_search, int64_t hardware_flags, - const int num_bits, const uint8_t* bits, - const uint16_t* input_indexes, int* num_indexes, - uint16_t* indexes, int bit_offset = 0); - - // Input and output indexes may be pointing to the same data (in-place filtering). - static void bits_split_indexes(int64_t hardware_flags, const int num_bits, - const uint8_t* bits, int* num_indexes_bit0, - uint16_t* indexes_bit0, uint16_t* indexes_bit1, - int bit_offset = 0); - - // Bit 1 is replaced with byte 0xFF. - static void bits_to_bytes(int64_t hardware_flags, const int num_bits, - const uint8_t* bits, uint8_t* bytes, int bit_offset = 0); - - // Return highest bit of each byte. - static void bytes_to_bits(int64_t hardware_flags, const int num_bits, - const uint8_t* bytes, uint8_t* bits, int bit_offset = 0); - - static bool are_all_bytes_zero(int64_t hardware_flags, const uint8_t* bytes, - uint32_t num_bytes); - - private: - inline static uint64_t SafeLoadUpTo8Bytes(const uint8_t* bytes, int num_bytes); - inline static void SafeStoreUpTo8Bytes(uint8_t* bytes, int num_bytes, uint64_t value); - inline static void bits_to_indexes_helper(uint64_t word, uint16_t base_index, - int* num_indexes, uint16_t* indexes); - inline static void bits_filter_indexes_helper(uint64_t word, - const uint16_t* input_indexes, - int* num_indexes, uint16_t* indexes); - template - static void bits_to_indexes_internal(int64_t hardware_flags, const int num_bits, - const uint8_t* bits, const uint16_t* input_indexes, - int* num_indexes, uint16_t* indexes, - uint16_t base_index = 0); - -#if defined(ARROW_HAVE_AVX2) - static void bits_to_indexes_avx2(int bit_to_search, const int num_bits, - const uint8_t* bits, int* num_indexes, - uint16_t* indexes, uint16_t base_index = 0); - static void bits_filter_indexes_avx2(int bit_to_search, const int num_bits, - const uint8_t* bits, const uint16_t* input_indexes, - int* num_indexes, uint16_t* indexes); - template - static void bits_to_indexes_imp_avx2(const int num_bits, const uint8_t* bits, - int* num_indexes, uint16_t* indexes, - uint16_t base_index = 0); - template - static void bits_filter_indexes_imp_avx2(const int num_bits, const uint8_t* bits, - const uint16_t* input_indexes, - int* num_indexes, uint16_t* indexes); - static void bits_to_bytes_avx2(const int num_bits, const uint8_t* bits, uint8_t* bytes); - static void bytes_to_bits_avx2(const int num_bits, const uint8_t* bytes, uint8_t* bits); - static bool are_all_bytes_zero_avx2(const uint8_t* bytes, uint32_t num_bytes); -#endif -}; +// +//template +//class TempVectorHolder { +// friend class TempVectorStack; +// +// public: +// ~TempVectorHolder() { stack_->release(id_, num_elements_ * sizeof(T)); } +// T* mutable_data() { return reinterpret_cast(data_); } +// TempVectorHolder(TempVectorStack* stack, uint32_t num_elements) { +// stack_ = stack; +// num_elements_ = num_elements; +// stack_->alloc(num_elements * sizeof(T), &data_, &id_); +// } +// +// private: +// TempVectorStack* stack_; +// uint8_t* data_; +// int id_; +// uint32_t num_elements_; +//}; +// +//class bit_util { +// public: +// static void bits_to_indexes(int bit_to_search, int64_t hardware_flags, +// const int num_bits, const uint8_t* bits, int* num_indexes, +// uint16_t* indexes, int bit_offset = 0); +// +// static void bits_filter_indexes(int bit_to_search, int64_t hardware_flags, +// const int num_bits, const uint8_t* bits, +// const uint16_t* input_indexes, int* num_indexes, +// uint16_t* indexes, int bit_offset = 0); +// +// // Input and output indexes may be pointing to the same data (in-place filtering). +// static void bits_split_indexes(int64_t hardware_flags, const int num_bits, +// const uint8_t* bits, int* num_indexes_bit0, +// uint16_t* indexes_bit0, uint16_t* indexes_bit1, +// int bit_offset = 0); +// +// // Bit 1 is replaced with byte 0xFF. +// static void bits_to_bytes(int64_t hardware_flags, const int num_bits, +// const uint8_t* bits, uint8_t* bytes, int bit_offset = 0); +// +// // Return highest bit of each byte. +// static void bytes_to_bits(int64_t hardware_flags, const int num_bits, +// const uint8_t* bytes, uint8_t* bits, int bit_offset = 0); +// +// static bool are_all_bytes_zero(int64_t hardware_flags, const uint8_t* bytes, +// uint32_t num_bytes); +// +// private: +// inline static uint64_t SafeLoadUpTo8Bytes(const uint8_t* bytes, int num_bytes); +// inline static void SafeStoreUpTo8Bytes(uint8_t* bytes, int num_bytes, uint64_t value); +// inline static void bits_to_indexes_helper(uint64_t word, uint16_t base_index, +// int* num_indexes, uint16_t* indexes); +// inline static void bits_filter_indexes_helper(uint64_t word, +// const uint16_t* input_indexes, +// int* num_indexes, uint16_t* indexes); +// template +// static void bits_to_indexes_internal(int64_t hardware_flags, const int num_bits, +// const uint8_t* bits, const uint16_t* input_indexes, +// int* num_indexes, uint16_t* indexes, +// uint16_t base_index = 0); +// +//#if defined(ARROW_HAVE_AVX2) +// static void bits_to_indexes_avx2(int bit_to_search, const int num_bits, +// const uint8_t* bits, int* num_indexes, +// uint16_t* indexes, uint16_t base_index = 0); +// static void bits_filter_indexes_avx2(int bit_to_search, const int num_bits, +// const uint8_t* bits, const uint16_t* input_indexes, +// int* num_indexes, uint16_t* indexes); +// template +// static void bits_to_indexes_imp_avx2(const int num_bits, const uint8_t* bits, +// int* num_indexes, uint16_t* indexes, +// uint16_t base_index = 0); +// template +// static void bits_filter_indexes_imp_avx2(const int num_bits, const uint8_t* bits, +// const uint16_t* input_indexes, +// int* num_indexes, uint16_t* indexes); +// static void bits_to_bytes_avx2(const int num_bits, const uint8_t* bits, uint8_t* bytes); +// static void bytes_to_bits_avx2(const int num_bits, const uint8_t* bytes, uint8_t* bits); +// static bool are_all_bytes_zero_avx2(const uint8_t* bytes, uint32_t num_bytes); +//#endif +//}; } // namespace util namespace compute { diff --git a/cpp/src/arrow/compute/kernels/hash_aggregate.cc b/cpp/src/arrow/compute/kernels/hash_aggregate.cc index eecfb054321..81fb2e871b5 100644 --- a/cpp/src/arrow/compute/kernels/hash_aggregate.cc +++ b/cpp/src/arrow/compute/kernels/hash_aggregate.cc @@ -30,7 +30,7 @@ #include "arrow/compute/api_aggregate.h" #include "arrow/compute/api_vector.h" #include "arrow/compute/key_hash.h" -#include "arrow/compute/exec/key_map.h" +#include "arrow/compute/key_map.h" #include "arrow/compute/exec/util.h" #include "arrow/compute/exec_internal.h" #include "arrow/compute/kernel.h" diff --git a/cpp/src/arrow/compute/exec/key_map.cc b/cpp/src/arrow/compute/key_map.cc similarity index 94% rename from cpp/src/arrow/compute/exec/key_map.cc rename to cpp/src/arrow/compute/key_map.cc index a61184e4ca9..46741602cd6 100644 --- a/cpp/src/arrow/compute/exec/key_map.cc +++ b/cpp/src/arrow/compute/key_map.cc @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. -#include "arrow/compute/exec/key_map.h" +#include "key_map.h" #include @@ -25,6 +25,8 @@ #include "arrow/util/bit_util.h" #include "arrow/util/bitmap_ops.h" #include "arrow/util/ubsan.h" +#include "arrow/util/logging.h" + namespace arrow { @@ -830,5 +832,51 @@ void SwissTable::cleanup() { num_inserted_ = 0; } + +uint64_t SwissTable::extract_group_id(const uint8_t* block_ptr, int slot, + uint64_t group_id_mask) const { + // Group id values for all 8 slots in the block are bit-packed and follow the status + // bytes. We assume here that the number of bits is rounded up to 8, 16, 32 or 64. In + // that case we can extract group id using aligned 64-bit word access. + int num_group_id_bits = static_cast(ARROW_POPCOUNT64(group_id_mask)); + ARROW_DCHECK(num_group_id_bits == 8 || num_group_id_bits == 16 || + num_group_id_bits == 32 || num_group_id_bits == 64); + + int bit_offset = slot * num_group_id_bits; + const uint64_t* group_id_bytes = + reinterpret_cast(block_ptr) + 1 + (bit_offset >> 6); + uint64_t group_id = (*group_id_bytes >> (bit_offset & 63)) & group_id_mask; + + return group_id; +} + +void SwissTable::insert_into_empty_slot(uint32_t slot_id, uint32_t hash, + uint32_t group_id) { + const uint64_t num_groupid_bits = num_groupid_bits_from_log_blocks(log_blocks_); + + // We assume here that the number of bits is rounded up to 8, 16, 32 or 64. + // In that case we can insert group id value using aligned 64-bit word access. + ARROW_DCHECK(num_groupid_bits == 8 || num_groupid_bits == 16 || + num_groupid_bits == 32 || num_groupid_bits == 64); + + const uint64_t num_block_bytes = (8 + num_groupid_bits); + constexpr uint64_t stamp_mask = 0x7f; + + int start_slot = (slot_id & 7); + int stamp = + static_cast((hash >> (bits_hash_ - log_blocks_ - bits_stamp_)) & stamp_mask); + uint64_t block_id = slot_id >> 3; + uint8_t* blockbase = blocks_ + num_block_bytes * block_id; + + blockbase[7 - start_slot] = static_cast(stamp); + int groupid_bit_offset = static_cast(start_slot * num_groupid_bits); + + // Block status bytes should start at an address aligned to 8 bytes + ARROW_DCHECK((reinterpret_cast(blockbase) & 7) == 0); + uint64_t* ptr = reinterpret_cast(blockbase) + 1 + (groupid_bit_offset >> 6); + *ptr |= (static_cast(group_id) << (groupid_bit_offset & 63)); +} + + } // namespace compute } // namespace arrow diff --git a/cpp/src/arrow/compute/exec/key_map.h b/cpp/src/arrow/compute/key_map.h similarity index 80% rename from cpp/src/arrow/compute/exec/key_map.h rename to cpp/src/arrow/compute/key_map.h index cc630e0b1c3..c9a3a9dc100 100644 --- a/cpp/src/arrow/compute/exec/key_map.h +++ b/cpp/src/arrow/compute/key_map.h @@ -19,7 +19,7 @@ #include -#include "arrow/compute/exec/util.h" +#include "arrow/compute/util.h" #include "arrow/memory_pool.h" #include "arrow/result.h" #include "arrow/status.h" @@ -107,7 +107,7 @@ class SwissTable { /// \brief Extract group id for a given slot in a given block. /// - inline uint64_t extract_group_id(const uint8_t* block_ptr, int slot, + uint64_t extract_group_id(const uint8_t* block_ptr, int slot, uint64_t group_id_mask) const; void extract_group_ids(const int num_keys, const uint16_t* optional_selection, const uint32_t* hashes, const uint8_t* local_slots, @@ -159,7 +159,7 @@ class SwissTable { inline bool find_next_stamp_match(const uint32_t hash, const uint32_t in_slot_id, uint32_t* out_slot_id, uint32_t* out_group_id) const; - inline void insert_into_empty_slot(uint32_t slot_id, uint32_t hash, uint32_t group_id); + void insert_into_empty_slot(uint32_t slot_id, uint32_t hash, uint32_t group_id); // Slow processing of input keys in the most generic case. // Handles inserting new keys. @@ -227,49 +227,5 @@ class SwissTable { MemoryPool* pool_; }; -uint64_t SwissTable::extract_group_id(const uint8_t* block_ptr, int slot, - uint64_t group_id_mask) const { - // Group id values for all 8 slots in the block are bit-packed and follow the status - // bytes. We assume here that the number of bits is rounded up to 8, 16, 32 or 64. In - // that case we can extract group id using aligned 64-bit word access. - int num_group_id_bits = static_cast(ARROW_POPCOUNT64(group_id_mask)); - ARROW_DCHECK(num_group_id_bits == 8 || num_group_id_bits == 16 || - num_group_id_bits == 32 || num_group_id_bits == 64); - - int bit_offset = slot * num_group_id_bits; - const uint64_t* group_id_bytes = - reinterpret_cast(block_ptr) + 1 + (bit_offset >> 6); - uint64_t group_id = (*group_id_bytes >> (bit_offset & 63)) & group_id_mask; - - return group_id; -} - -void SwissTable::insert_into_empty_slot(uint32_t slot_id, uint32_t hash, - uint32_t group_id) { - const uint64_t num_groupid_bits = num_groupid_bits_from_log_blocks(log_blocks_); - - // We assume here that the number of bits is rounded up to 8, 16, 32 or 64. - // In that case we can insert group id value using aligned 64-bit word access. - ARROW_DCHECK(num_groupid_bits == 8 || num_groupid_bits == 16 || - num_groupid_bits == 32 || num_groupid_bits == 64); - - const uint64_t num_block_bytes = (8 + num_groupid_bits); - constexpr uint64_t stamp_mask = 0x7f; - - int start_slot = (slot_id & 7); - int stamp = - static_cast((hash >> (bits_hash_ - log_blocks_ - bits_stamp_)) & stamp_mask); - uint64_t block_id = slot_id >> 3; - uint8_t* blockbase = blocks_ + num_block_bytes * block_id; - - blockbase[7 - start_slot] = static_cast(stamp); - int groupid_bit_offset = static_cast(start_slot * num_groupid_bits); - - // Block status bytes should start at an address aligned to 8 bytes - ARROW_DCHECK((reinterpret_cast(blockbase) & 7) == 0); - uint64_t* ptr = reinterpret_cast(blockbase) + 1 + (groupid_bit_offset >> 6); - *ptr |= (static_cast(group_id) << (groupid_bit_offset & 63)); -} - } // namespace compute } // namespace arrow diff --git a/cpp/src/arrow/compute/exec/key_map_avx2.cc b/cpp/src/arrow/compute/key_map_avx2.cc similarity index 99% rename from cpp/src/arrow/compute/exec/key_map_avx2.cc rename to cpp/src/arrow/compute/key_map_avx2.cc index 4c77f3af237..102e28eca41 100644 --- a/cpp/src/arrow/compute/exec/key_map_avx2.cc +++ b/cpp/src/arrow/compute/key_map_avx2.cc @@ -17,7 +17,7 @@ #include -#include "arrow/compute/exec/key_map.h" +#include "arrow/compute/key_map.h" namespace arrow { namespace compute { diff --git a/cpp/src/arrow/compute/row/compare_internal.cc b/cpp/src/arrow/compute/row/compare_internal.cc index 750012e60e2..2216add6849 100644 --- a/cpp/src/arrow/compute/row/compare_internal.cc +++ b/cpp/src/arrow/compute/row/compare_internal.cc @@ -22,7 +22,9 @@ #include #include -#include "arrow/compute/exec/util.h" +// DIPO +#include "arrow/compute/util.h" +#include "arrow/compute/util_internal.h" #include "arrow/util/bit_util.h" #include "arrow/util/ubsan.h" diff --git a/cpp/src/arrow/compute/row/compare_internal.h b/cpp/src/arrow/compute/row/compare_internal.h index f9ec1e7f535..162eae19790 100644 --- a/cpp/src/arrow/compute/row/compare_internal.h +++ b/cpp/src/arrow/compute/row/compare_internal.h @@ -19,7 +19,8 @@ #include -#include "arrow/compute/exec/util.h" +// DIPO +#include "arrow/compute/util.h" #include "arrow/compute/light_array.h" #include "arrow/compute/row/encode_internal.h" #include "arrow/compute/row/row_internal.h" diff --git a/cpp/src/arrow/compute/row/encode_internal.cc b/cpp/src/arrow/compute/row/encode_internal.cc index 9d138258d66..8c0c0aac6a7 100644 --- a/cpp/src/arrow/compute/row/encode_internal.cc +++ b/cpp/src/arrow/compute/row/encode_internal.cc @@ -16,7 +16,8 @@ // under the License. #include "arrow/compute/row/encode_internal.h" -#include "arrow/compute/exec.h" +//#include "arrow/compute/exec.h" +//DIPO #include "arrow/util/checked_cast.h" namespace arrow { diff --git a/cpp/src/arrow/compute/row/encode_internal.h b/cpp/src/arrow/compute/row/encode_internal.h index 970537a3067..58d4abd3e83 100644 --- a/cpp/src/arrow/compute/row/encode_internal.h +++ b/cpp/src/arrow/compute/row/encode_internal.h @@ -22,8 +22,10 @@ #include #include "arrow/array/data.h" -#include "arrow/compute/exec.h" -#include "arrow/compute/exec/util.h" +//#include "arrow/compute/exec.h" +// DIPO +#include "arrow/compute/key_map.h" +#include "arrow/compute/util.h" #include "arrow/compute/light_array.h" #include "arrow/compute/row/row_internal.h" #include "arrow/memory_pool.h" diff --git a/cpp/src/arrow/compute/row/grouper.cc b/cpp/src/arrow/compute/row/grouper.cc index 579c448319a..c879e7c7403 100644 --- a/cpp/src/arrow/compute/row/grouper.cc +++ b/cpp/src/arrow/compute/row/grouper.cc @@ -21,9 +21,10 @@ #include #include "arrow/compute/key_hash.h" -#include "arrow/compute/exec/key_map.h" -#include "arrow/compute/exec/options.h" -#include "arrow/compute/exec_internal.h" +// DIPO +//#include "arrow/compute/exec/key_map.h" +//#include "arrow/compute/exec/options.h" +//#include "arrow/compute/exec_internal.h" #include "arrow/compute/function.h" #include "arrow/compute/kernels/row_encoder_internal.h" #include "arrow/compute/light_array.h" diff --git a/cpp/src/arrow/compute/row/grouper.h b/cpp/src/arrow/compute/row/grouper.h index ce09adf09b3..0ab5bba5f3e 100644 --- a/cpp/src/arrow/compute/row/grouper.h +++ b/cpp/src/arrow/compute/row/grouper.h @@ -20,8 +20,9 @@ #include #include -#include "arrow/compute/exec.h" -#include "arrow/compute/exec/options.h" +//#include "arrow/compute/exec.h" +//#include "arrow/compute/exec/options.h" +#include "arrow/compute/key_map.h" #include "arrow/compute/kernel.h" #include "arrow/datum.h" #include "arrow/result.h" diff --git a/cpp/src/arrow/compute/row/row_internal.cc b/cpp/src/arrow/compute/row/row_internal.cc index 11a8a0bc436..52e0c9cb6c7 100644 --- a/cpp/src/arrow/compute/row/row_internal.cc +++ b/cpp/src/arrow/compute/row/row_internal.cc @@ -17,7 +17,8 @@ #include "arrow/compute/row/row_internal.h" -#include "arrow/compute/exec/util.h" +// DIPO +#include "arrow/compute/util.h" namespace arrow { namespace compute { diff --git a/cpp/src/arrow/compute/util.cc b/cpp/src/arrow/compute/util.cc new file mode 100644 index 00000000000..572a52a5f19 --- /dev/null +++ b/cpp/src/arrow/compute/util.cc @@ -0,0 +1,363 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "arrow/compute/util.h" + +#include "arrow/table.h" +#include "arrow/util/bit_util.h" +#include "arrow/util/bitmap_ops.h" +#include "arrow/util/tracing_internal.h" +#include "arrow/util/ubsan.h" +#include "arrow/util/logging.h" + +namespace arrow { + +using bit_util::CountTrailingZeros; + +namespace util { + +void TempVectorStack::alloc(uint32_t num_bytes, uint8_t **data, int *id) { + int64_t old_top = top_; + top_ += PaddedAllocationSize(num_bytes) + 2 * sizeof(uint64_t); + // Stack overflow check + ARROW_DCHECK(top_ <= buffer_size_); + *data = buffer_->mutable_data() + old_top + sizeof(uint64_t); + // We set 8 bytes before the beginning of the allocated range and + // 8 bytes after the end to check for stack overflow (which would + // result in those known bytes being corrupted). + reinterpret_cast(buffer_->mutable_data() + old_top)[0] = kGuard1; + reinterpret_cast(buffer_->mutable_data() + top_)[-1] = kGuard2; + *id = num_vectors_++; +} + +void TempVectorStack::release(int id, uint32_t num_bytes) { + ARROW_DCHECK(num_vectors_ == id + 1); + int64_t size = PaddedAllocationSize(num_bytes) + 2 * sizeof(uint64_t); + ARROW_DCHECK(reinterpret_cast(buffer_->mutable_data() + top_)[-1] == + kGuard2); + ARROW_DCHECK(top_ >= size); + top_ -= size; + ARROW_DCHECK(reinterpret_cast(buffer_->mutable_data() + top_)[0] == + kGuard1); + --num_vectors_; +} + +inline uint64_t bit_util::SafeLoadUpTo8Bytes(const uint8_t* bytes, int num_bytes) { + // This will not be correct on big-endian architectures. +#if !ARROW_LITTLE_ENDIAN + ARROW_DCHECK(false); +#endif + ARROW_DCHECK(num_bytes >= 0 && num_bytes <= 8); + if (num_bytes == 8) { + return util::SafeLoad(reinterpret_cast(bytes)); + } else { + uint64_t word = 0; + for (int i = 0; i < num_bytes; ++i) { + word |= static_cast(bytes[i]) << (8 * i); + } + return word; + } +} + +inline void bit_util::SafeStoreUpTo8Bytes(uint8_t* bytes, int num_bytes, uint64_t value) { + // This will not be correct on big-endian architectures. +#if !ARROW_LITTLE_ENDIAN + ARROW_DCHECK(false); +#endif + ARROW_DCHECK(num_bytes >= 0 && num_bytes <= 8); + if (num_bytes == 8) { + util::SafeStore(reinterpret_cast(bytes), value); + } else { + for (int i = 0; i < num_bytes; ++i) { + bytes[i] = static_cast(value >> (8 * i)); + } + } +} + +inline void bit_util::bits_to_indexes_helper(uint64_t word, uint16_t base_index, + int* num_indexes, uint16_t* indexes) { + int n = *num_indexes; + while (word) { + indexes[n++] = base_index + static_cast(CountTrailingZeros(word)); + word &= word - 1; + } + *num_indexes = n; +} + +inline void bit_util::bits_filter_indexes_helper(uint64_t word, + const uint16_t* input_indexes, + int* num_indexes, uint16_t* indexes) { + int n = *num_indexes; + while (word) { + indexes[n++] = input_indexes[CountTrailingZeros(word)]; + word &= word - 1; + } + *num_indexes = n; +} + +template +void bit_util::bits_to_indexes_internal(int64_t hardware_flags, const int num_bits, + const uint8_t* bits, + const uint16_t* input_indexes, int* num_indexes, + uint16_t* indexes, uint16_t base_index) { + // 64 bits at a time + constexpr int unroll = 64; + int tail = num_bits % unroll; +#if defined(ARROW_HAVE_AVX2) + if (hardware_flags & arrow::internal::CpuInfo::AVX2) { + if (filter_input_indexes) { + bits_filter_indexes_avx2(bit_to_search, num_bits - tail, bits, input_indexes, + num_indexes, indexes); + } else { + bits_to_indexes_avx2(bit_to_search, num_bits - tail, bits, num_indexes, indexes, + base_index); + } + } else { +#endif + *num_indexes = 0; + for (int i = 0; i < num_bits / unroll; ++i) { + uint64_t word = util::SafeLoad(&reinterpret_cast(bits)[i]); + if (bit_to_search == 0) { + word = ~word; + } + if (filter_input_indexes) { + bits_filter_indexes_helper(word, input_indexes + i * 64, num_indexes, indexes); + } else { + bits_to_indexes_helper(word, i * 64 + base_index, num_indexes, indexes); + } + } +#if defined(ARROW_HAVE_AVX2) + } +#endif + // Optionally process the last partial word with masking out bits outside range + if (tail) { + const uint8_t* bits_tail = bits + (num_bits - tail) / 8; + uint64_t word = SafeLoadUpTo8Bytes(bits_tail, (tail + 7) / 8); + if (bit_to_search == 0) { + word = ~word; + } + word &= ~0ULL >> (64 - tail); + if (filter_input_indexes) { + bits_filter_indexes_helper(word, input_indexes + num_bits - tail, num_indexes, + indexes); + } else { + bits_to_indexes_helper(word, num_bits - tail + base_index, num_indexes, indexes); + } + } +} + +void bit_util::bits_to_indexes(int bit_to_search, int64_t hardware_flags, int num_bits, + const uint8_t* bits, int* num_indexes, uint16_t* indexes, + int bit_offset) { + bits += bit_offset / 8; + bit_offset %= 8; + *num_indexes = 0; + uint16_t base_index = 0; + if (bit_offset != 0) { + uint64_t bits_head = bits[0] >> bit_offset; + int bits_in_first_byte = std::min(num_bits, 8 - bit_offset); + bits_to_indexes(bit_to_search, hardware_flags, bits_in_first_byte, + reinterpret_cast(&bits_head), num_indexes, indexes); + if (num_bits <= bits_in_first_byte) { + return; + } + num_bits -= bits_in_first_byte; + indexes += *num_indexes; + bits += 1; + base_index = bits_in_first_byte; + } + + int num_indexes_new = 0; + if (bit_to_search == 0) { + bits_to_indexes_internal<0, false>(hardware_flags, num_bits, bits, nullptr, + &num_indexes_new, indexes, base_index); + } else { + ARROW_DCHECK(bit_to_search == 1); + bits_to_indexes_internal<1, false>(hardware_flags, num_bits, bits, nullptr, + &num_indexes_new, indexes, base_index); + } + *num_indexes += num_indexes_new; +} + +void bit_util::bits_filter_indexes(int bit_to_search, int64_t hardware_flags, + const int num_bits, const uint8_t* bits, + const uint16_t* input_indexes, int* num_indexes, + uint16_t* indexes, int bit_offset) { + bits += bit_offset / 8; + bit_offset %= 8; + if (bit_offset != 0) { + int num_indexes_head = 0; + uint64_t bits_head = bits[0] >> bit_offset; + int bits_in_first_byte = std::min(num_bits, 8 - bit_offset); + bits_filter_indexes(bit_to_search, hardware_flags, bits_in_first_byte, + reinterpret_cast(&bits_head), input_indexes, + &num_indexes_head, indexes); + int num_indexes_tail = 0; + if (num_bits > bits_in_first_byte) { + bits_filter_indexes(bit_to_search, hardware_flags, num_bits - bits_in_first_byte, + bits + 1, input_indexes + bits_in_first_byte, &num_indexes_tail, + indexes + num_indexes_head); + } + *num_indexes = num_indexes_head + num_indexes_tail; + return; + } + + if (bit_to_search == 0) { + bits_to_indexes_internal<0, true>(hardware_flags, num_bits, bits, input_indexes, + num_indexes, indexes); + } else { + ARROW_DCHECK(bit_to_search == 1); + bits_to_indexes_internal<1, true>(hardware_flags, num_bits, bits, input_indexes, + num_indexes, indexes); + } +} + +void bit_util::bits_split_indexes(int64_t hardware_flags, const int num_bits, + const uint8_t* bits, int* num_indexes_bit0, + uint16_t* indexes_bit0, uint16_t* indexes_bit1, + int bit_offset) { + bits_to_indexes(0, hardware_flags, num_bits, bits, num_indexes_bit0, indexes_bit0, + bit_offset); + int num_indexes_bit1; + bits_to_indexes(1, hardware_flags, num_bits, bits, &num_indexes_bit1, indexes_bit1, + bit_offset); +} + +void bit_util::bits_to_bytes(int64_t hardware_flags, const int num_bits, + const uint8_t* bits, uint8_t* bytes, int bit_offset) { + bits += bit_offset / 8; + bit_offset %= 8; + if (bit_offset != 0) { + uint64_t bits_head = bits[0] >> bit_offset; + int bits_in_first_byte = std::min(num_bits, 8 - bit_offset); + bits_to_bytes(hardware_flags, bits_in_first_byte, + reinterpret_cast(&bits_head), bytes); + if (num_bits > bits_in_first_byte) { + bits_to_bytes(hardware_flags, num_bits - bits_in_first_byte, bits + 1, + bytes + bits_in_first_byte); + } + return; + } + + int num_processed = 0; +#if defined(ARROW_HAVE_AVX2) + if (hardware_flags & arrow::internal::CpuInfo::AVX2) { + // The function call below processes whole 32 bit chunks together. + num_processed = num_bits - (num_bits % 32); + bits_to_bytes_avx2(num_processed, bits, bytes); + } +#endif + // Processing 8 bits at a time + constexpr int unroll = 8; + for (int i = num_processed / unroll; i < num_bits / unroll; ++i) { + uint8_t bits_next = bits[i]; + // Clear the lowest bit and then make 8 copies of remaining 7 bits, each 7 bits apart + // from the previous. + uint64_t unpacked = static_cast(bits_next & 0xfe) * + ((1ULL << 7) | (1ULL << 14) | (1ULL << 21) | (1ULL << 28) | + (1ULL << 35) | (1ULL << 42) | (1ULL << 49)); + unpacked |= (bits_next & 1); + unpacked &= 0x0101010101010101ULL; + unpacked *= 255; + util::SafeStore(&reinterpret_cast(bytes)[i], unpacked); + } + int tail = num_bits % unroll; + if (tail) { + uint8_t bits_next = bits[(num_bits - tail) / unroll]; + // Clear the lowest bit and then make 8 copies of remaining 7 bits, each 7 bits apart + // from the previous. + uint64_t unpacked = static_cast(bits_next & 0xfe) * + ((1ULL << 7) | (1ULL << 14) | (1ULL << 21) | (1ULL << 28) | + (1ULL << 35) | (1ULL << 42) | (1ULL << 49)); + unpacked |= (bits_next & 1); + unpacked &= 0x0101010101010101ULL; + unpacked *= 255; + SafeStoreUpTo8Bytes(bytes + num_bits - tail, tail, unpacked); + } +} + +void bit_util::bytes_to_bits(int64_t hardware_flags, const int num_bits, + const uint8_t* bytes, uint8_t* bits, int bit_offset) { + bits += bit_offset / 8; + bit_offset %= 8; + if (bit_offset != 0) { + uint64_t bits_head; + int bits_in_first_byte = std::min(num_bits, 8 - bit_offset); + bytes_to_bits(hardware_flags, bits_in_first_byte, bytes, + reinterpret_cast(&bits_head)); + uint8_t mask = (1 << bit_offset) - 1; + *bits = static_cast((*bits & mask) | (bits_head << bit_offset)); + + if (num_bits > bits_in_first_byte) { + bytes_to_bits(hardware_flags, num_bits - bits_in_first_byte, + bytes + bits_in_first_byte, bits + 1); + } + return; + } + + int num_processed = 0; +#if defined(ARROW_HAVE_AVX2) + if (hardware_flags & arrow::internal::CpuInfo::AVX2) { + // The function call below processes whole 32 bit chunks together. + num_processed = num_bits - (num_bits % 32); + bytes_to_bits_avx2(num_processed, bytes, bits); + } +#endif + // Process 8 bits at a time + constexpr int unroll = 8; + for (int i = num_processed / unroll; i < num_bits / unroll; ++i) { + uint64_t bytes_next = util::SafeLoad(&reinterpret_cast(bytes)[i]); + bytes_next &= 0x0101010101010101ULL; + bytes_next |= (bytes_next >> 7); // Pairs of adjacent output bits in individual bytes + bytes_next |= (bytes_next >> 14); // 4 adjacent output bits in individual bytes + bytes_next |= (bytes_next >> 28); // All 8 output bits in the lowest byte + bits[i] = static_cast(bytes_next & 0xff); + } + int tail = num_bits % unroll; + if (tail) { + uint64_t bytes_next = SafeLoadUpTo8Bytes(bytes + num_bits - tail, tail); + bytes_next &= 0x0101010101010101ULL; + bytes_next |= (bytes_next >> 7); // Pairs of adjacent output bits in individual bytes + bytes_next |= (bytes_next >> 14); // 4 adjacent output bits in individual bytes + bytes_next |= (bytes_next >> 28); // All 8 output bits in the lowest byte + bits[num_bits / 8] = static_cast(bytes_next & 0xff); + } +} + +bool bit_util::are_all_bytes_zero(int64_t hardware_flags, const uint8_t* bytes, + uint32_t num_bytes) { +#if defined(ARROW_HAVE_AVX2) + if (hardware_flags & arrow::internal::CpuInfo::AVX2) { + return are_all_bytes_zero_avx2(bytes, num_bytes); + } +#endif + uint64_t result_or = 0; + uint32_t i; + for (i = 0; i < num_bytes / 8; ++i) { + uint64_t x = util::SafeLoad(&reinterpret_cast(bytes)[i]); + result_or |= x; + } + if (num_bytes % 8 > 0) { + uint64_t tail = 0; + result_or |= memcmp(bytes + i * 8, &tail, num_bytes % 8); + } + return result_or == 0; +} + +} // namespace util + +} // namespace arrow diff --git a/cpp/src/arrow/compute/util.h b/cpp/src/arrow/compute/util.h index 42477857a18..8447d28261c 100644 --- a/cpp/src/arrow/compute/util.h +++ b/cpp/src/arrow/compute/util.h @@ -31,7 +31,6 @@ #include "arrow/status.h" #include "arrow/util/bit_util.h" #include "arrow/util/cpu_info.h" -#include "arrow/util/logging.h" #include "arrow/util/mutex.h" #include "arrow/util/thread_pool.h" #include "arrow/util/type_fwd.h" @@ -57,10 +56,25 @@ namespace arrow { namespace util { -template -inline void CheckAlignment(const void *ptr) { - ARROW_DCHECK(reinterpret_cast(ptr) % sizeof(T) == 0); -} +// Some platforms typedef int64_t as long int instead of long long int, +// which breaks the _mm256_i64gather_epi64 and _mm256_i32gather_epi64 intrinsics +// which need long long. +// We use the cast to the type below in these intrinsics to make the code +// compile in all cases. +// +using int64_for_gather_t = const long long int; // NOLINT runtime-int + +// All MiniBatch... classes use TempVectorStack for vector allocations and can +// only work with vectors up to 1024 elements. +// +// They should only be allocated on the stack to guarantee the right sequence +// of allocation and deallocation of vectors from TempVectorStack. +// +class MiniBatch { + public: + static constexpr int kLogMiniBatchLength = 10; + static constexpr int kMiniBatchLength = 1 << kLogMiniBatchLength; +}; /// Storage used to allocate temporary vectors of a batch size. /// Temporary vectors should resemble allocating temporary variables on the stack @@ -94,30 +108,8 @@ class TempVectorStack { // return ::arrow::bit_util::RoundUp(num_bytes, sizeof(int64_t)) + kPadding; } - void alloc(uint32_t num_bytes, uint8_t **data, int *id) { - int64_t old_top = top_; - top_ += PaddedAllocationSize(num_bytes) + 2 * sizeof(uint64_t); - // Stack overflow check - ARROW_DCHECK(top_ <= buffer_size_); - *data = buffer_->mutable_data() + old_top + sizeof(uint64_t); - // We set 8 bytes before the beginning of the allocated range and - // 8 bytes after the end to check for stack overflow (which would - // result in those known bytes being corrupted). - reinterpret_cast(buffer_->mutable_data() + old_top)[0] = kGuard1; - reinterpret_cast(buffer_->mutable_data() + top_)[-1] = kGuard2; - *id = num_vectors_++; - } - void release(int id, uint32_t num_bytes) { - ARROW_DCHECK(num_vectors_ == id + 1); - int64_t size = PaddedAllocationSize(num_bytes) + 2 * sizeof(uint64_t); - ARROW_DCHECK(reinterpret_cast(buffer_->mutable_data() + top_)[-1] == - kGuard2); - ARROW_DCHECK(top_ >= size); - top_ -= size; - ARROW_DCHECK(reinterpret_cast(buffer_->mutable_data() + top_)[0] == - kGuard1); - --num_vectors_; - } + void alloc(uint32_t num_bytes, uint8_t **data, int *id); + void release(int id, uint32_t num_bytes); static constexpr uint64_t kGuard1 = 0x3141592653589793ULL; static constexpr uint64_t kGuard2 = 0x0577215664901532ULL; static constexpr int64_t kPadding = 64; @@ -127,5 +119,88 @@ class TempVectorStack { int64_t buffer_size_; }; +template +class TempVectorHolder { + friend class TempVectorStack; + + public: + ~TempVectorHolder() { stack_->release(id_, num_elements_ * sizeof(T)); } + T* mutable_data() { return reinterpret_cast(data_); } + TempVectorHolder(TempVectorStack* stack, uint32_t num_elements) { + stack_ = stack; + num_elements_ = num_elements; + stack_->alloc(num_elements * sizeof(T), &data_, &id_); + } + + private: + TempVectorStack* stack_; + uint8_t* data_; + int id_; + uint32_t num_elements_; +}; + +class bit_util { + public: + static void bits_to_indexes(int bit_to_search, int64_t hardware_flags, + const int num_bits, const uint8_t* bits, int* num_indexes, + uint16_t* indexes, int bit_offset = 0); + + static void bits_filter_indexes(int bit_to_search, int64_t hardware_flags, + const int num_bits, const uint8_t* bits, + const uint16_t* input_indexes, int* num_indexes, + uint16_t* indexes, int bit_offset = 0); + + // Input and output indexes may be pointing to the same data (in-place filtering). + static void bits_split_indexes(int64_t hardware_flags, const int num_bits, + const uint8_t* bits, int* num_indexes_bit0, + uint16_t* indexes_bit0, uint16_t* indexes_bit1, + int bit_offset = 0); + + // Bit 1 is replaced with byte 0xFF. + static void bits_to_bytes(int64_t hardware_flags, const int num_bits, + const uint8_t* bits, uint8_t* bytes, int bit_offset = 0); + + // Return highest bit of each byte. + static void bytes_to_bits(int64_t hardware_flags, const int num_bits, + const uint8_t* bytes, uint8_t* bits, int bit_offset = 0); + + static bool are_all_bytes_zero(int64_t hardware_flags, const uint8_t* bytes, + uint32_t num_bytes); + + private: + inline static uint64_t SafeLoadUpTo8Bytes(const uint8_t* bytes, int num_bytes); + inline static void SafeStoreUpTo8Bytes(uint8_t* bytes, int num_bytes, uint64_t value); + inline static void bits_to_indexes_helper(uint64_t word, uint16_t base_index, + int* num_indexes, uint16_t* indexes); + inline static void bits_filter_indexes_helper(uint64_t word, + const uint16_t* input_indexes, + int* num_indexes, uint16_t* indexes); + template + static void bits_to_indexes_internal(int64_t hardware_flags, const int num_bits, + const uint8_t* bits, const uint16_t* input_indexes, + int* num_indexes, uint16_t* indexes, + uint16_t base_index = 0); + +#if defined(ARROW_HAVE_AVX2) + static void bits_to_indexes_avx2(int bit_to_search, const int num_bits, + const uint8_t* bits, int* num_indexes, + uint16_t* indexes, uint16_t base_index = 0); + static void bits_filter_indexes_avx2(int bit_to_search, const int num_bits, + const uint8_t* bits, const uint16_t* input_indexes, + int* num_indexes, uint16_t* indexes); + template + static void bits_to_indexes_imp_avx2(const int num_bits, const uint8_t* bits, + int* num_indexes, uint16_t* indexes, + uint16_t base_index = 0); + template + static void bits_filter_indexes_imp_avx2(const int num_bits, const uint8_t* bits, + const uint16_t* input_indexes, + int* num_indexes, uint16_t* indexes); + static void bits_to_bytes_avx2(const int num_bits, const uint8_t* bits, uint8_t* bytes); + static void bytes_to_bits_avx2(const int num_bits, const uint8_t* bytes, uint8_t* bits); + static bool are_all_bytes_zero_avx2(const uint8_t* bytes, uint32_t num_bytes); +#endif +}; + } } \ No newline at end of file diff --git a/cpp/src/arrow/compute/util_internal.h b/cpp/src/arrow/compute/util_internal.h new file mode 100644 index 00000000000..9d45ede940f --- /dev/null +++ b/cpp/src/arrow/compute/util_internal.h @@ -0,0 +1,30 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include "arrow/util/logging.h" + +namespace arrow { +namespace util { + +template void CheckAlignment(const void *ptr) { + ARROW_DCHECK(reinterpret_cast(ptr) % sizeof(T) == 0); +} + +} +} From f251a4545239411ecbe0edddf1e550364c93073b Mon Sep 17 00:00:00 2001 From: Davide Pasetto Date: Wed, 8 Mar 2023 09:22:31 -0500 Subject: [PATCH 03/11] some cleanup --- cpp/src/arrow/compute/exec/util.cc | 309 ------------------ cpp/src/arrow/compute/exec/util.h | 196 ----------- cpp/src/arrow/compute/key_map.cc | 49 --- cpp/src/arrow/compute/key_map.h | 49 ++- cpp/src/arrow/compute/light_array.h | 1 - cpp/src/arrow/compute/row/compare_internal.cc | 1 - cpp/src/arrow/compute/row/compare_internal.h | 1 - cpp/src/arrow/compute/row/encode_internal.cc | 2 - cpp/src/arrow/compute/row/encode_internal.h | 2 - cpp/src/arrow/compute/row/grouper.cc | 4 - cpp/src/arrow/compute/row/grouper.h | 2 - cpp/src/arrow/compute/row/row_internal.cc | 1 - 12 files changed, 47 insertions(+), 570 deletions(-) diff --git a/cpp/src/arrow/compute/exec/util.cc b/cpp/src/arrow/compute/exec/util.cc index 98c193a8920..6a1fd37aa19 100644 --- a/cpp/src/arrow/compute/exec/util.cc +++ b/cpp/src/arrow/compute/exec/util.cc @@ -25,315 +25,6 @@ #include "arrow/util/ubsan.h" namespace arrow { - -//using bit_util::CountTrailingZeros; - -namespace util { -// -//inline uint64_t bit_util::SafeLoadUpTo8Bytes(const uint8_t* bytes, int num_bytes) { -// // This will not be correct on big-endian architectures. -//#if !ARROW_LITTLE_ENDIAN -// ARROW_DCHECK(false); -//#endif -// ARROW_DCHECK(num_bytes >= 0 && num_bytes <= 8); -// if (num_bytes == 8) { -// return util::SafeLoad(reinterpret_cast(bytes)); -// } else { -// uint64_t word = 0; -// for (int i = 0; i < num_bytes; ++i) { -// word |= static_cast(bytes[i]) << (8 * i); -// } -// return word; -// } -//} -// -//inline void bit_util::SafeStoreUpTo8Bytes(uint8_t* bytes, int num_bytes, uint64_t value) { -// // This will not be correct on big-endian architectures. -//#if !ARROW_LITTLE_ENDIAN -// ARROW_DCHECK(false); -//#endif -// ARROW_DCHECK(num_bytes >= 0 && num_bytes <= 8); -// if (num_bytes == 8) { -// util::SafeStore(reinterpret_cast(bytes), value); -// } else { -// for (int i = 0; i < num_bytes; ++i) { -// bytes[i] = static_cast(value >> (8 * i)); -// } -// } -//} -// -//inline void bit_util::bits_to_indexes_helper(uint64_t word, uint16_t base_index, -// int* num_indexes, uint16_t* indexes) { -// int n = *num_indexes; -// while (word) { -// indexes[n++] = base_index + static_cast(CountTrailingZeros(word)); -// word &= word - 1; -// } -// *num_indexes = n; -//} -// -//inline void bit_util::bits_filter_indexes_helper(uint64_t word, -// const uint16_t* input_indexes, -// int* num_indexes, uint16_t* indexes) { -// int n = *num_indexes; -// while (word) { -// indexes[n++] = input_indexes[CountTrailingZeros(word)]; -// word &= word - 1; -// } -// *num_indexes = n; -//} -// -//template -//void bit_util::bits_to_indexes_internal(int64_t hardware_flags, const int num_bits, -// const uint8_t* bits, -// const uint16_t* input_indexes, int* num_indexes, -// uint16_t* indexes, uint16_t base_index) { -// // 64 bits at a time -// constexpr int unroll = 64; -// int tail = num_bits % unroll; -//#if defined(ARROW_HAVE_AVX2) -// if (hardware_flags & arrow::internal::CpuInfo::AVX2) { -// if (filter_input_indexes) { -// bits_filter_indexes_avx2(bit_to_search, num_bits - tail, bits, input_indexes, -// num_indexes, indexes); -// } else { -// bits_to_indexes_avx2(bit_to_search, num_bits - tail, bits, num_indexes, indexes, -// base_index); -// } -// } else { -//#endif -// *num_indexes = 0; -// for (int i = 0; i < num_bits / unroll; ++i) { -// uint64_t word = util::SafeLoad(&reinterpret_cast(bits)[i]); -// if (bit_to_search == 0) { -// word = ~word; -// } -// if (filter_input_indexes) { -// bits_filter_indexes_helper(word, input_indexes + i * 64, num_indexes, indexes); -// } else { -// bits_to_indexes_helper(word, i * 64 + base_index, num_indexes, indexes); -// } -// } -//#if defined(ARROW_HAVE_AVX2) -// } -//#endif -// // Optionally process the last partial word with masking out bits outside range -// if (tail) { -// const uint8_t* bits_tail = bits + (num_bits - tail) / 8; -// uint64_t word = SafeLoadUpTo8Bytes(bits_tail, (tail + 7) / 8); -// if (bit_to_search == 0) { -// word = ~word; -// } -// word &= ~0ULL >> (64 - tail); -// if (filter_input_indexes) { -// bits_filter_indexes_helper(word, input_indexes + num_bits - tail, num_indexes, -// indexes); -// } else { -// bits_to_indexes_helper(word, num_bits - tail + base_index, num_indexes, indexes); -// } -// } -//} -// -//void bit_util::bits_to_indexes(int bit_to_search, int64_t hardware_flags, int num_bits, -// const uint8_t* bits, int* num_indexes, uint16_t* indexes, -// int bit_offset) { -// bits += bit_offset / 8; -// bit_offset %= 8; -// *num_indexes = 0; -// uint16_t base_index = 0; -// if (bit_offset != 0) { -// uint64_t bits_head = bits[0] >> bit_offset; -// int bits_in_first_byte = std::min(num_bits, 8 - bit_offset); -// bits_to_indexes(bit_to_search, hardware_flags, bits_in_first_byte, -// reinterpret_cast(&bits_head), num_indexes, indexes); -// if (num_bits <= bits_in_first_byte) { -// return; -// } -// num_bits -= bits_in_first_byte; -// indexes += *num_indexes; -// bits += 1; -// base_index = bits_in_first_byte; -// } -// -// int num_indexes_new = 0; -// if (bit_to_search == 0) { -// bits_to_indexes_internal<0, false>(hardware_flags, num_bits, bits, nullptr, -// &num_indexes_new, indexes, base_index); -// } else { -// ARROW_DCHECK(bit_to_search == 1); -// bits_to_indexes_internal<1, false>(hardware_flags, num_bits, bits, nullptr, -// &num_indexes_new, indexes, base_index); -// } -// *num_indexes += num_indexes_new; -//} -// -//void bit_util::bits_filter_indexes(int bit_to_search, int64_t hardware_flags, -// const int num_bits, const uint8_t* bits, -// const uint16_t* input_indexes, int* num_indexes, -// uint16_t* indexes, int bit_offset) { -// bits += bit_offset / 8; -// bit_offset %= 8; -// if (bit_offset != 0) { -// int num_indexes_head = 0; -// uint64_t bits_head = bits[0] >> bit_offset; -// int bits_in_first_byte = std::min(num_bits, 8 - bit_offset); -// bits_filter_indexes(bit_to_search, hardware_flags, bits_in_first_byte, -// reinterpret_cast(&bits_head), input_indexes, -// &num_indexes_head, indexes); -// int num_indexes_tail = 0; -// if (num_bits > bits_in_first_byte) { -// bits_filter_indexes(bit_to_search, hardware_flags, num_bits - bits_in_first_byte, -// bits + 1, input_indexes + bits_in_first_byte, &num_indexes_tail, -// indexes + num_indexes_head); -// } -// *num_indexes = num_indexes_head + num_indexes_tail; -// return; -// } -// -// if (bit_to_search == 0) { -// bits_to_indexes_internal<0, true>(hardware_flags, num_bits, bits, input_indexes, -// num_indexes, indexes); -// } else { -// ARROW_DCHECK(bit_to_search == 1); -// bits_to_indexes_internal<1, true>(hardware_flags, num_bits, bits, input_indexes, -// num_indexes, indexes); -// } -//} -// -//void bit_util::bits_split_indexes(int64_t hardware_flags, const int num_bits, -// const uint8_t* bits, int* num_indexes_bit0, -// uint16_t* indexes_bit0, uint16_t* indexes_bit1, -// int bit_offset) { -// bits_to_indexes(0, hardware_flags, num_bits, bits, num_indexes_bit0, indexes_bit0, -// bit_offset); -// int num_indexes_bit1; -// bits_to_indexes(1, hardware_flags, num_bits, bits, &num_indexes_bit1, indexes_bit1, -// bit_offset); -//} -// -//void bit_util::bits_to_bytes(int64_t hardware_flags, const int num_bits, -// const uint8_t* bits, uint8_t* bytes, int bit_offset) { -// bits += bit_offset / 8; -// bit_offset %= 8; -// if (bit_offset != 0) { -// uint64_t bits_head = bits[0] >> bit_offset; -// int bits_in_first_byte = std::min(num_bits, 8 - bit_offset); -// bits_to_bytes(hardware_flags, bits_in_first_byte, -// reinterpret_cast(&bits_head), bytes); -// if (num_bits > bits_in_first_byte) { -// bits_to_bytes(hardware_flags, num_bits - bits_in_first_byte, bits + 1, -// bytes + bits_in_first_byte); -// } -// return; -// } -// -// int num_processed = 0; -//#if defined(ARROW_HAVE_AVX2) -// if (hardware_flags & arrow::internal::CpuInfo::AVX2) { -// // The function call below processes whole 32 bit chunks together. -// num_processed = num_bits - (num_bits % 32); -// bits_to_bytes_avx2(num_processed, bits, bytes); -// } -//#endif -// // Processing 8 bits at a time -// constexpr int unroll = 8; -// for (int i = num_processed / unroll; i < num_bits / unroll; ++i) { -// uint8_t bits_next = bits[i]; -// // Clear the lowest bit and then make 8 copies of remaining 7 bits, each 7 bits apart -// // from the previous. -// uint64_t unpacked = static_cast(bits_next & 0xfe) * -// ((1ULL << 7) | (1ULL << 14) | (1ULL << 21) | (1ULL << 28) | -// (1ULL << 35) | (1ULL << 42) | (1ULL << 49)); -// unpacked |= (bits_next & 1); -// unpacked &= 0x0101010101010101ULL; -// unpacked *= 255; -// util::SafeStore(&reinterpret_cast(bytes)[i], unpacked); -// } -// int tail = num_bits % unroll; -// if (tail) { -// uint8_t bits_next = bits[(num_bits - tail) / unroll]; -// // Clear the lowest bit and then make 8 copies of remaining 7 bits, each 7 bits apart -// // from the previous. -// uint64_t unpacked = static_cast(bits_next & 0xfe) * -// ((1ULL << 7) | (1ULL << 14) | (1ULL << 21) | (1ULL << 28) | -// (1ULL << 35) | (1ULL << 42) | (1ULL << 49)); -// unpacked |= (bits_next & 1); -// unpacked &= 0x0101010101010101ULL; -// unpacked *= 255; -// SafeStoreUpTo8Bytes(bytes + num_bits - tail, tail, unpacked); -// } -//} -// -//void bit_util::bytes_to_bits(int64_t hardware_flags, const int num_bits, -// const uint8_t* bytes, uint8_t* bits, int bit_offset) { -// bits += bit_offset / 8; -// bit_offset %= 8; -// if (bit_offset != 0) { -// uint64_t bits_head; -// int bits_in_first_byte = std::min(num_bits, 8 - bit_offset); -// bytes_to_bits(hardware_flags, bits_in_first_byte, bytes, -// reinterpret_cast(&bits_head)); -// uint8_t mask = (1 << bit_offset) - 1; -// *bits = static_cast((*bits & mask) | (bits_head << bit_offset)); -// -// if (num_bits > bits_in_first_byte) { -// bytes_to_bits(hardware_flags, num_bits - bits_in_first_byte, -// bytes + bits_in_first_byte, bits + 1); -// } -// return; -// } -// -// int num_processed = 0; -//#if defined(ARROW_HAVE_AVX2) -// if (hardware_flags & arrow::internal::CpuInfo::AVX2) { -// // The function call below processes whole 32 bit chunks together. -// num_processed = num_bits - (num_bits % 32); -// bytes_to_bits_avx2(num_processed, bytes, bits); -// } -//#endif -// // Process 8 bits at a time -// constexpr int unroll = 8; -// for (int i = num_processed / unroll; i < num_bits / unroll; ++i) { -// uint64_t bytes_next = util::SafeLoad(&reinterpret_cast(bytes)[i]); -// bytes_next &= 0x0101010101010101ULL; -// bytes_next |= (bytes_next >> 7); // Pairs of adjacent output bits in individual bytes -// bytes_next |= (bytes_next >> 14); // 4 adjacent output bits in individual bytes -// bytes_next |= (bytes_next >> 28); // All 8 output bits in the lowest byte -// bits[i] = static_cast(bytes_next & 0xff); -// } -// int tail = num_bits % unroll; -// if (tail) { -// uint64_t bytes_next = SafeLoadUpTo8Bytes(bytes + num_bits - tail, tail); -// bytes_next &= 0x0101010101010101ULL; -// bytes_next |= (bytes_next >> 7); // Pairs of adjacent output bits in individual bytes -// bytes_next |= (bytes_next >> 14); // 4 adjacent output bits in individual bytes -// bytes_next |= (bytes_next >> 28); // All 8 output bits in the lowest byte -// bits[num_bits / 8] = static_cast(bytes_next & 0xff); -// } -//} -// -//bool bit_util::are_all_bytes_zero(int64_t hardware_flags, const uint8_t* bytes, -// uint32_t num_bytes) { -//#if defined(ARROW_HAVE_AVX2) -// if (hardware_flags & arrow::internal::CpuInfo::AVX2) { -// return are_all_bytes_zero_avx2(bytes, num_bytes); -// } -//#endif -// uint64_t result_or = 0; -// uint32_t i; -// for (i = 0; i < num_bytes / 8; ++i) { -// uint64_t x = util::SafeLoad(&reinterpret_cast(bytes)[i]); -// result_or |= x; -// } -// if (num_bytes % 8 > 0) { -// uint64_t tail = 0; -// result_or |= memcmp(bytes + i * 8, &tail, num_bytes % 8); -// } -// return result_or == 0; -//} - -} // namespace util - namespace compute { Status ValidateExecNodeInputs(ExecPlan* plan, const std::vector& inputs, diff --git a/cpp/src/arrow/compute/exec/util.h b/cpp/src/arrow/compute/exec/util.h index e0ffca29070..8e5001d99c1 100644 --- a/cpp/src/arrow/compute/exec/util.h +++ b/cpp/src/arrow/compute/exec/util.h @@ -39,203 +39,7 @@ #include "arrow/util/thread_pool.h" #include "arrow/util/type_fwd.h" -// DIPO -//#if defined(__clang__) || defined(__GNUC__) -//#define BYTESWAP(x) __builtin_bswap64(x) -//#define ROTL(x, n) (((x) << (n)) | ((x) >> ((-n) & 31))) -//#define ROTL64(x, n) (((x) << (n)) | ((x) >> ((-n) & 63))) -//#define PREFETCH(ptr) __builtin_prefetch((ptr), 0 /* rw==read */, 3 /* locality */) -//#elif defined(_MSC_VER) -//#include -//#define BYTESWAP(x) _byteswap_uint64(x) -//#define ROTL(x, n) _rotl((x), (n)) -//#define ROTL64(x, n) _rotl64((x), (n)) -//#if defined(_M_X64) || defined(_M_I86) -//#include // https://msdn.microsoft.com/fr-fr/library/84szxsww(v=vs.90).aspx -//#define PREFETCH(ptr) _mm_prefetch((const char*)(ptr), _MM_HINT_T0) -//#else -//#define PREFETCH(ptr) (void)(ptr) /* disabled */ -//#endif -//#endif - namespace arrow { -namespace util { - -// DIPO -//template -//inline void CheckAlignment(const void* ptr) { -// ARROW_DCHECK(reinterpret_cast(ptr) % sizeof(T) == 0); -//} - -//// Some platforms typedef int64_t as long int instead of long long int, -//// which breaks the _mm256_i64gather_epi64 and _mm256_i32gather_epi64 intrinsics -//// which need long long. -//// We use the cast to the type below in these intrinsics to make the code -//// compile in all cases. -//// -//using int64_for_gather_t = const long long int; // NOLINT runtime-int -// -//// All MiniBatch... classes use TempVectorStack for vector allocations and can -//// only work with vectors up to 1024 elements. -//// -//// They should only be allocated on the stack to guarantee the right sequence -//// of allocation and deallocation of vectors from TempVectorStack. -//// -//class MiniBatch { -// public: -// static constexpr int kLogMiniBatchLength = 10; -// static constexpr int kMiniBatchLength = 1 << kLogMiniBatchLength; -//}; - -// DIPO -///// Storage used to allocate temporary vectors of a batch size. -///// Temporary vectors should resemble allocating temporary variables on the stack -///// but in the context of vectorized processing where we need to store a vector of -///// temporaries instead of a single value. -//class TempVectorStack { -// template -// friend class TempVectorHolder; -// -// public: -// Status Init(MemoryPool* pool, int64_t size) { -// num_vectors_ = 0; -// top_ = 0; -// buffer_size_ = PaddedAllocationSize(size) + kPadding + 2 * sizeof(uint64_t); -// ARROW_ASSIGN_OR_RAISE(auto buffer, AllocateResizableBuffer(size, pool)); -// // Ensure later operations don't accidentally read uninitialized memory. -// std::memset(buffer->mutable_data(), 0xFF, size); -// buffer_ = std::move(buffer); -// return Status::OK(); -// } -// -// private: -// int64_t PaddedAllocationSize(int64_t num_bytes) { -// // Round up allocation size to multiple of 8 bytes -// // to avoid returning temp vectors with unaligned address. -// // -// // Also add padding at the end to facilitate loads and stores -// // using SIMD when number of vector elements is not divisible -// // by the number of SIMD lanes. -// // -// return ::arrow::bit_util::RoundUp(num_bytes, sizeof(int64_t)) + kPadding; -// } -// void alloc(uint32_t num_bytes, uint8_t** data, int* id) { -// int64_t old_top = top_; -// top_ += PaddedAllocationSize(num_bytes) + 2 * sizeof(uint64_t); -// // Stack overflow check -// ARROW_DCHECK(top_ <= buffer_size_); -// *data = buffer_->mutable_data() + old_top + sizeof(uint64_t); -// // We set 8 bytes before the beginning of the allocated range and -// // 8 bytes after the end to check for stack overflow (which would -// // result in those known bytes being corrupted). -// reinterpret_cast(buffer_->mutable_data() + old_top)[0] = kGuard1; -// reinterpret_cast(buffer_->mutable_data() + top_)[-1] = kGuard2; -// *id = num_vectors_++; -// } -// void release(int id, uint32_t num_bytes) { -// ARROW_DCHECK(num_vectors_ == id + 1); -// int64_t size = PaddedAllocationSize(num_bytes) + 2 * sizeof(uint64_t); -// ARROW_DCHECK(reinterpret_cast(buffer_->mutable_data() + top_)[-1] == -// kGuard2); -// ARROW_DCHECK(top_ >= size); -// top_ -= size; -// ARROW_DCHECK(reinterpret_cast(buffer_->mutable_data() + top_)[0] == -// kGuard1); -// --num_vectors_; -// } -// static constexpr uint64_t kGuard1 = 0x3141592653589793ULL; -// static constexpr uint64_t kGuard2 = 0x0577215664901532ULL; -// static constexpr int64_t kPadding = 64; -// int num_vectors_; -// int64_t top_; -// std::unique_ptr buffer_; -// int64_t buffer_size_; -//}; -// -//template -//class TempVectorHolder { -// friend class TempVectorStack; -// -// public: -// ~TempVectorHolder() { stack_->release(id_, num_elements_ * sizeof(T)); } -// T* mutable_data() { return reinterpret_cast(data_); } -// TempVectorHolder(TempVectorStack* stack, uint32_t num_elements) { -// stack_ = stack; -// num_elements_ = num_elements; -// stack_->alloc(num_elements * sizeof(T), &data_, &id_); -// } -// -// private: -// TempVectorStack* stack_; -// uint8_t* data_; -// int id_; -// uint32_t num_elements_; -//}; -// -//class bit_util { -// public: -// static void bits_to_indexes(int bit_to_search, int64_t hardware_flags, -// const int num_bits, const uint8_t* bits, int* num_indexes, -// uint16_t* indexes, int bit_offset = 0); -// -// static void bits_filter_indexes(int bit_to_search, int64_t hardware_flags, -// const int num_bits, const uint8_t* bits, -// const uint16_t* input_indexes, int* num_indexes, -// uint16_t* indexes, int bit_offset = 0); -// -// // Input and output indexes may be pointing to the same data (in-place filtering). -// static void bits_split_indexes(int64_t hardware_flags, const int num_bits, -// const uint8_t* bits, int* num_indexes_bit0, -// uint16_t* indexes_bit0, uint16_t* indexes_bit1, -// int bit_offset = 0); -// -// // Bit 1 is replaced with byte 0xFF. -// static void bits_to_bytes(int64_t hardware_flags, const int num_bits, -// const uint8_t* bits, uint8_t* bytes, int bit_offset = 0); -// -// // Return highest bit of each byte. -// static void bytes_to_bits(int64_t hardware_flags, const int num_bits, -// const uint8_t* bytes, uint8_t* bits, int bit_offset = 0); -// -// static bool are_all_bytes_zero(int64_t hardware_flags, const uint8_t* bytes, -// uint32_t num_bytes); -// -// private: -// inline static uint64_t SafeLoadUpTo8Bytes(const uint8_t* bytes, int num_bytes); -// inline static void SafeStoreUpTo8Bytes(uint8_t* bytes, int num_bytes, uint64_t value); -// inline static void bits_to_indexes_helper(uint64_t word, uint16_t base_index, -// int* num_indexes, uint16_t* indexes); -// inline static void bits_filter_indexes_helper(uint64_t word, -// const uint16_t* input_indexes, -// int* num_indexes, uint16_t* indexes); -// template -// static void bits_to_indexes_internal(int64_t hardware_flags, const int num_bits, -// const uint8_t* bits, const uint16_t* input_indexes, -// int* num_indexes, uint16_t* indexes, -// uint16_t base_index = 0); -// -//#if defined(ARROW_HAVE_AVX2) -// static void bits_to_indexes_avx2(int bit_to_search, const int num_bits, -// const uint8_t* bits, int* num_indexes, -// uint16_t* indexes, uint16_t base_index = 0); -// static void bits_filter_indexes_avx2(int bit_to_search, const int num_bits, -// const uint8_t* bits, const uint16_t* input_indexes, -// int* num_indexes, uint16_t* indexes); -// template -// static void bits_to_indexes_imp_avx2(const int num_bits, const uint8_t* bits, -// int* num_indexes, uint16_t* indexes, -// uint16_t base_index = 0); -// template -// static void bits_filter_indexes_imp_avx2(const int num_bits, const uint8_t* bits, -// const uint16_t* input_indexes, -// int* num_indexes, uint16_t* indexes); -// static void bits_to_bytes_avx2(const int num_bits, const uint8_t* bits, uint8_t* bytes); -// static void bytes_to_bits_avx2(const int num_bits, const uint8_t* bytes, uint8_t* bits); -// static bool are_all_bytes_zero_avx2(const uint8_t* bytes, uint32_t num_bytes); -//#endif -//}; - -} // namespace util namespace compute { ARROW_EXPORT diff --git a/cpp/src/arrow/compute/key_map.cc b/cpp/src/arrow/compute/key_map.cc index 46741602cd6..dac1dfa2483 100644 --- a/cpp/src/arrow/compute/key_map.cc +++ b/cpp/src/arrow/compute/key_map.cc @@ -17,15 +17,12 @@ #include "key_map.h" -#include - #include #include #include "arrow/util/bit_util.h" #include "arrow/util/bitmap_ops.h" #include "arrow/util/ubsan.h" -#include "arrow/util/logging.h" namespace arrow { @@ -832,51 +829,5 @@ void SwissTable::cleanup() { num_inserted_ = 0; } - -uint64_t SwissTable::extract_group_id(const uint8_t* block_ptr, int slot, - uint64_t group_id_mask) const { - // Group id values for all 8 slots in the block are bit-packed and follow the status - // bytes. We assume here that the number of bits is rounded up to 8, 16, 32 or 64. In - // that case we can extract group id using aligned 64-bit word access. - int num_group_id_bits = static_cast(ARROW_POPCOUNT64(group_id_mask)); - ARROW_DCHECK(num_group_id_bits == 8 || num_group_id_bits == 16 || - num_group_id_bits == 32 || num_group_id_bits == 64); - - int bit_offset = slot * num_group_id_bits; - const uint64_t* group_id_bytes = - reinterpret_cast(block_ptr) + 1 + (bit_offset >> 6); - uint64_t group_id = (*group_id_bytes >> (bit_offset & 63)) & group_id_mask; - - return group_id; -} - -void SwissTable::insert_into_empty_slot(uint32_t slot_id, uint32_t hash, - uint32_t group_id) { - const uint64_t num_groupid_bits = num_groupid_bits_from_log_blocks(log_blocks_); - - // We assume here that the number of bits is rounded up to 8, 16, 32 or 64. - // In that case we can insert group id value using aligned 64-bit word access. - ARROW_DCHECK(num_groupid_bits == 8 || num_groupid_bits == 16 || - num_groupid_bits == 32 || num_groupid_bits == 64); - - const uint64_t num_block_bytes = (8 + num_groupid_bits); - constexpr uint64_t stamp_mask = 0x7f; - - int start_slot = (slot_id & 7); - int stamp = - static_cast((hash >> (bits_hash_ - log_blocks_ - bits_stamp_)) & stamp_mask); - uint64_t block_id = slot_id >> 3; - uint8_t* blockbase = blocks_ + num_block_bytes * block_id; - - blockbase[7 - start_slot] = static_cast(stamp); - int groupid_bit_offset = static_cast(start_slot * num_groupid_bits); - - // Block status bytes should start at an address aligned to 8 bytes - ARROW_DCHECK((reinterpret_cast(blockbase) & 7) == 0); - uint64_t* ptr = reinterpret_cast(blockbase) + 1 + (groupid_bit_offset >> 6); - *ptr |= (static_cast(group_id) << (groupid_bit_offset & 63)); -} - - } // namespace compute } // namespace arrow diff --git a/cpp/src/arrow/compute/key_map.h b/cpp/src/arrow/compute/key_map.h index c9a3a9dc100..790c90e6411 100644 --- a/cpp/src/arrow/compute/key_map.h +++ b/cpp/src/arrow/compute/key_map.h @@ -20,6 +20,7 @@ #include #include "arrow/compute/util.h" +#include "arrow/compute/util_internal.h" #include "arrow/memory_pool.h" #include "arrow/result.h" #include "arrow/status.h" @@ -107,7 +108,7 @@ class SwissTable { /// \brief Extract group id for a given slot in a given block. /// - uint64_t extract_group_id(const uint8_t* block_ptr, int slot, + inline uint64_t extract_group_id(const uint8_t* block_ptr, int slot, uint64_t group_id_mask) const; void extract_group_ids(const int num_keys, const uint16_t* optional_selection, const uint32_t* hashes, const uint8_t* local_slots, @@ -159,7 +160,7 @@ class SwissTable { inline bool find_next_stamp_match(const uint32_t hash, const uint32_t in_slot_id, uint32_t* out_slot_id, uint32_t* out_group_id) const; - void insert_into_empty_slot(uint32_t slot_id, uint32_t hash, uint32_t group_id); + inline void insert_into_empty_slot(uint32_t slot_id, uint32_t hash, uint32_t group_id); // Slow processing of input keys in the most generic case. // Handles inserting new keys. @@ -227,5 +228,49 @@ class SwissTable { MemoryPool* pool_; }; +uint64_t SwissTable::extract_group_id(const uint8_t* block_ptr, int slot, + uint64_t group_id_mask) const { + // Group id values for all 8 slots in the block are bit-packed and follow the status + // bytes. We assume here that the number of bits is rounded up to 8, 16, 32 or 64. In + // that case we can extract group id using aligned 64-bit word access. + int num_group_id_bits = static_cast(ARROW_POPCOUNT64(group_id_mask)); + ARROW_DCHECK(num_group_id_bits == 8 || num_group_id_bits == 16 || + num_group_id_bits == 32 || num_group_id_bits == 64); + + int bit_offset = slot * num_group_id_bits; + const uint64_t* group_id_bytes = + reinterpret_cast(block_ptr) + 1 + (bit_offset >> 6); + uint64_t group_id = (*group_id_bytes >> (bit_offset & 63)) & group_id_mask; + + return group_id; +} + +void SwissTable::insert_into_empty_slot(uint32_t slot_id, uint32_t hash, + uint32_t group_id) { + const uint64_t num_groupid_bits = num_groupid_bits_from_log_blocks(log_blocks_); + + // We assume here that the number of bits is rounded up to 8, 16, 32 or 64. + // In that case we can insert group id value using aligned 64-bit word access. + ARROW_DCHECK(num_groupid_bits == 8 || num_groupid_bits == 16 || + num_groupid_bits == 32 || num_groupid_bits == 64); + + const uint64_t num_block_bytes = (8 + num_groupid_bits); + constexpr uint64_t stamp_mask = 0x7f; + + int start_slot = (slot_id & 7); + int stamp = + static_cast((hash >> (bits_hash_ - log_blocks_ - bits_stamp_)) & stamp_mask); + uint64_t block_id = slot_id >> 3; + uint8_t* blockbase = blocks_ + num_block_bytes * block_id; + + blockbase[7 - start_slot] = static_cast(stamp); + int groupid_bit_offset = static_cast(start_slot * num_groupid_bits); + + // Block status bytes should start at an address aligned to 8 bytes + ARROW_DCHECK((reinterpret_cast(blockbase) & 7) == 0); + uint64_t* ptr = reinterpret_cast(blockbase) + 1 + (groupid_bit_offset >> 6); + *ptr |= (static_cast(group_id) << (groupid_bit_offset & 63)); +} + } // namespace compute } // namespace arrow diff --git a/cpp/src/arrow/compute/light_array.h b/cpp/src/arrow/compute/light_array.h index 33b48161733..d617b0aa064 100644 --- a/cpp/src/arrow/compute/light_array.h +++ b/cpp/src/arrow/compute/light_array.h @@ -21,7 +21,6 @@ #include "arrow/array.h" #include "arrow/compute/exec.h" -// DIPO #include "arrow/compute/exec/util.h" #include "arrow/compute/util.h" #include "arrow/type.h" #include "arrow/util/cpu_info.h" diff --git a/cpp/src/arrow/compute/row/compare_internal.cc b/cpp/src/arrow/compute/row/compare_internal.cc index 2216add6849..39ac33932b5 100644 --- a/cpp/src/arrow/compute/row/compare_internal.cc +++ b/cpp/src/arrow/compute/row/compare_internal.cc @@ -22,7 +22,6 @@ #include #include -// DIPO #include "arrow/compute/util.h" #include "arrow/compute/util_internal.h" #include "arrow/util/bit_util.h" diff --git a/cpp/src/arrow/compute/row/compare_internal.h b/cpp/src/arrow/compute/row/compare_internal.h index 162eae19790..85a4a4f68af 100644 --- a/cpp/src/arrow/compute/row/compare_internal.h +++ b/cpp/src/arrow/compute/row/compare_internal.h @@ -19,7 +19,6 @@ #include -// DIPO #include "arrow/compute/util.h" #include "arrow/compute/light_array.h" #include "arrow/compute/row/encode_internal.h" diff --git a/cpp/src/arrow/compute/row/encode_internal.cc b/cpp/src/arrow/compute/row/encode_internal.cc index 8c0c0aac6a7..3a6a85b0272 100644 --- a/cpp/src/arrow/compute/row/encode_internal.cc +++ b/cpp/src/arrow/compute/row/encode_internal.cc @@ -16,8 +16,6 @@ // under the License. #include "arrow/compute/row/encode_internal.h" -//#include "arrow/compute/exec.h" -//DIPO #include "arrow/util/checked_cast.h" namespace arrow { diff --git a/cpp/src/arrow/compute/row/encode_internal.h b/cpp/src/arrow/compute/row/encode_internal.h index 58d4abd3e83..2caa02d2f9c 100644 --- a/cpp/src/arrow/compute/row/encode_internal.h +++ b/cpp/src/arrow/compute/row/encode_internal.h @@ -22,8 +22,6 @@ #include #include "arrow/array/data.h" -//#include "arrow/compute/exec.h" -// DIPO #include "arrow/compute/key_map.h" #include "arrow/compute/util.h" #include "arrow/compute/light_array.h" diff --git a/cpp/src/arrow/compute/row/grouper.cc b/cpp/src/arrow/compute/row/grouper.cc index c879e7c7403..1747bf73267 100644 --- a/cpp/src/arrow/compute/row/grouper.cc +++ b/cpp/src/arrow/compute/row/grouper.cc @@ -21,10 +21,6 @@ #include #include "arrow/compute/key_hash.h" -// DIPO -//#include "arrow/compute/exec/key_map.h" -//#include "arrow/compute/exec/options.h" -//#include "arrow/compute/exec_internal.h" #include "arrow/compute/function.h" #include "arrow/compute/kernels/row_encoder_internal.h" #include "arrow/compute/light_array.h" diff --git a/cpp/src/arrow/compute/row/grouper.h b/cpp/src/arrow/compute/row/grouper.h index 0ab5bba5f3e..8f43d2601d5 100644 --- a/cpp/src/arrow/compute/row/grouper.h +++ b/cpp/src/arrow/compute/row/grouper.h @@ -20,8 +20,6 @@ #include #include -//#include "arrow/compute/exec.h" -//#include "arrow/compute/exec/options.h" #include "arrow/compute/key_map.h" #include "arrow/compute/kernel.h" #include "arrow/datum.h" diff --git a/cpp/src/arrow/compute/row/row_internal.cc b/cpp/src/arrow/compute/row/row_internal.cc index 52e0c9cb6c7..f6a62c09fcf 100644 --- a/cpp/src/arrow/compute/row/row_internal.cc +++ b/cpp/src/arrow/compute/row/row_internal.cc @@ -17,7 +17,6 @@ #include "arrow/compute/row/row_internal.h" -// DIPO #include "arrow/compute/util.h" namespace arrow { From 817013dcb455c8e5c1c6d9012a0037c09d53266c Mon Sep 17 00:00:00 2001 From: Davide Pasetto Date: Wed, 8 Mar 2023 09:40:43 -0500 Subject: [PATCH 04/11] fix api test --- cpp/src/arrow/compute/key_map.cc | 2 +- cpp/src/arrow/compute/row/grouper.h | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/cpp/src/arrow/compute/key_map.cc b/cpp/src/arrow/compute/key_map.cc index dac1dfa2483..4161f1e75c3 100644 --- a/cpp/src/arrow/compute/key_map.cc +++ b/cpp/src/arrow/compute/key_map.cc @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. -#include "key_map.h" +#include "arrow/compute/key_map.h" #include #include diff --git a/cpp/src/arrow/compute/row/grouper.h b/cpp/src/arrow/compute/row/grouper.h index 8f43d2601d5..94c591687da 100644 --- a/cpp/src/arrow/compute/row/grouper.h +++ b/cpp/src/arrow/compute/row/grouper.h @@ -20,7 +20,6 @@ #include #include -#include "arrow/compute/key_map.h" #include "arrow/compute/kernel.h" #include "arrow/datum.h" #include "arrow/result.h" From 105cd3c9ad732092cf84ecafadff721607c76e9f Mon Sep 17 00:00:00 2001 From: Davide Pasetto Date: Wed, 8 Mar 2023 10:44:18 -0500 Subject: [PATCH 05/11] more include update --- cpp/src/arrow/compute/key_hash.h | 2 +- cpp/src/arrow/compute/row/grouper.cc | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/cpp/src/arrow/compute/key_hash.h b/cpp/src/arrow/compute/key_hash.h index 68197973e02..f0056af5cb1 100644 --- a/cpp/src/arrow/compute/key_hash.h +++ b/cpp/src/arrow/compute/key_hash.h @@ -23,7 +23,7 @@ #include -#include "arrow/compute/exec/util.h" +#include "arrow/compute/util.h" #include "arrow/compute/light_array.h" namespace arrow { diff --git a/cpp/src/arrow/compute/row/grouper.cc b/cpp/src/arrow/compute/row/grouper.cc index 1747bf73267..fce39261a73 100644 --- a/cpp/src/arrow/compute/row/grouper.cc +++ b/cpp/src/arrow/compute/row/grouper.cc @@ -25,6 +25,7 @@ #include "arrow/compute/kernels/row_encoder_internal.h" #include "arrow/compute/light_array.h" #include "arrow/compute/registry.h" +#include "arrow/compute/api_vector.h" #include "arrow/compute/row/compare_internal.h" #include "arrow/type.h" #include "arrow/util/bitmap_ops.h" From ea5eeb9a987bbd48364f6943bfdb24d3ebe7abed Mon Sep 17 00:00:00 2001 From: Davide Pasetto Date: Wed, 8 Mar 2023 12:30:40 -0500 Subject: [PATCH 06/11] more include update --- cpp/src/arrow/compute/kernels/codegen_internal.h | 1 - cpp/src/arrow/compute/kernels/common_internal.h | 3 ++- cpp/src/arrow/compute/kernels/hash_aggregate.cc | 4 ---- cpp/src/arrow/compute/kernels/row_encoder_internal.h | 1 - 4 files changed, 2 insertions(+), 7 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/codegen_internal.h b/cpp/src/arrow/compute/kernels/codegen_internal.h index b05a47b600c..3dd1f2b8112 100644 --- a/cpp/src/arrow/compute/kernels/codegen_internal.h +++ b/cpp/src/arrow/compute/kernels/codegen_internal.h @@ -30,7 +30,6 @@ #include "arrow/array/data.h" #include "arrow/buffer.h" #include "arrow/buffer_builder.h" -#include "arrow/compute/exec.h" #include "arrow/compute/kernel.h" #include "arrow/datum.h" #include "arrow/result.h" diff --git a/cpp/src/arrow/compute/kernels/common_internal.h b/cpp/src/arrow/compute/kernels/common_internal.h index bf90d114512..d5fe3e0376d 100644 --- a/cpp/src/arrow/compute/kernels/common_internal.h +++ b/cpp/src/arrow/compute/kernels/common_internal.h @@ -30,7 +30,8 @@ #include "arrow/array/data.h" #include "arrow/buffer.h" #include "arrow/chunked_array.h" -#include "arrow/compute/exec.h" + + #include "arrow/compute/function.h" #include "arrow/compute/kernel.h" #include "arrow/compute/kernels/codegen_internal.h" diff --git a/cpp/src/arrow/compute/kernels/hash_aggregate.cc b/cpp/src/arrow/compute/kernels/hash_aggregate.cc index 81fb2e871b5..380dde016ef 100644 --- a/cpp/src/arrow/compute/kernels/hash_aggregate.cc +++ b/cpp/src/arrow/compute/kernels/hash_aggregate.cc @@ -29,10 +29,6 @@ #include "arrow/buffer_builder.h" #include "arrow/compute/api_aggregate.h" #include "arrow/compute/api_vector.h" -#include "arrow/compute/key_hash.h" -#include "arrow/compute/key_map.h" -#include "arrow/compute/exec/util.h" -#include "arrow/compute/exec_internal.h" #include "arrow/compute/kernel.h" #include "arrow/compute/kernels/aggregate_internal.h" #include "arrow/compute/kernels/aggregate_var_std_internal.h" diff --git a/cpp/src/arrow/compute/kernels/row_encoder_internal.h b/cpp/src/arrow/compute/kernels/row_encoder_internal.h index 5fe80e0f506..9bf7c1d1c4f 100644 --- a/cpp/src/arrow/compute/kernels/row_encoder_internal.h +++ b/cpp/src/arrow/compute/kernels/row_encoder_internal.h @@ -19,7 +19,6 @@ #include -#include "arrow/compute/exec.h" #include "arrow/compute/kernels/codegen_internal.h" #include "arrow/visit_data_inline.h" From 2a25b29d39805079373de23474505a1eb27785ad Mon Sep 17 00:00:00 2001 From: Davide Pasetto Date: Wed, 8 Mar 2023 13:15:35 -0500 Subject: [PATCH 07/11] format files --- cpp/src/arrow/compute/exec/asof_join_node.cc | 2 +- .../arrow/compute/exec/bloom_filter_test.cc | 2 +- cpp/src/arrow/compute/exec/hash_join_node.cc | 2 +- cpp/src/arrow/compute/exec/swiss_join.cc | 2 +- .../arrow/compute/exec/swiss_join_internal.h | 2 +- .../arrow/compute/kernels/common_internal.h | 1 - cpp/src/arrow/compute/key_hash.h | 2 +- cpp/src/arrow/compute/key_hash_test.cc | 2 +- cpp/src/arrow/compute/key_map.cc | 1 - cpp/src/arrow/compute/key_map.h | 10 ++++----- cpp/src/arrow/compute/light_array_test.cc | 10 ++++----- cpp/src/arrow/compute/row/compare_internal.h | 2 +- cpp/src/arrow/compute/row/encode_internal.h | 2 +- cpp/src/arrow/compute/row/grouper.cc | 4 ++-- cpp/src/arrow/compute/util.cc | 22 +++++++++---------- cpp/src/arrow/compute/util.h | 13 +++++------ cpp/src/arrow/compute/util_internal.h | 9 ++++---- 17 files changed, 43 insertions(+), 45 deletions(-) diff --git a/cpp/src/arrow/compute/exec/asof_join_node.cc b/cpp/src/arrow/compute/exec/asof_join_node.cc index 4aa0fb72f05..47acc41e889 100644 --- a/cpp/src/arrow/compute/exec/asof_join_node.cc +++ b/cpp/src/arrow/compute/exec/asof_join_node.cc @@ -30,11 +30,11 @@ #include "arrow/array/builder_binary.h" #include "arrow/array/builder_primitive.h" #include "arrow/compute/exec/exec_plan.h" -#include "arrow/compute/key_hash.h" #include "arrow/compute/exec/options.h" #include "arrow/compute/exec/query_context.h" #include "arrow/compute/exec/schema_util.h" #include "arrow/compute/exec/util.h" +#include "arrow/compute/key_hash.h" #include "arrow/compute/light_array.h" #include "arrow/record_batch.h" #include "arrow/result.h" diff --git a/cpp/src/arrow/compute/exec/bloom_filter_test.cc b/cpp/src/arrow/compute/exec/bloom_filter_test.cc index 5dc35ed42ab..3a79c10be2a 100644 --- a/cpp/src/arrow/compute/exec/bloom_filter_test.cc +++ b/cpp/src/arrow/compute/exec/bloom_filter_test.cc @@ -23,10 +23,10 @@ #include #include #include "arrow/compute/exec/bloom_filter.h" -#include "arrow/compute/key_hash.h" #include "arrow/compute/exec/task_util.h" #include "arrow/compute/exec/test_util.h" #include "arrow/compute/exec/util.h" +#include "arrow/compute/key_hash.h" #include "arrow/util/bitmap_ops.h" #include "arrow/util/cpu_info.h" diff --git a/cpp/src/arrow/compute/exec/hash_join_node.cc b/cpp/src/arrow/compute/exec/hash_join_node.cc index c270b868ecc..6da58330e22 100644 --- a/cpp/src/arrow/compute/exec/hash_join_node.cc +++ b/cpp/src/arrow/compute/exec/hash_join_node.cc @@ -24,10 +24,10 @@ #include "arrow/compute/exec/hash_join.h" #include "arrow/compute/exec/hash_join_dict.h" #include "arrow/compute/exec/hash_join_node.h" -#include "arrow/compute/key_hash.h" #include "arrow/compute/exec/options.h" #include "arrow/compute/exec/schema_util.h" #include "arrow/compute/exec/util.h" +#include "arrow/compute/key_hash.h" #include "arrow/util/checked_cast.h" #include "arrow/util/future.h" #include "arrow/util/thread_pool.h" diff --git a/cpp/src/arrow/compute/exec/swiss_join.cc b/cpp/src/arrow/compute/exec/swiss_join.cc index 69479325bbb..8bf2ee1df47 100644 --- a/cpp/src/arrow/compute/exec/swiss_join.cc +++ b/cpp/src/arrow/compute/exec/swiss_join.cc @@ -22,10 +22,10 @@ #include #include "arrow/array/util.h" // MakeArrayFromScalar #include "arrow/compute/exec/hash_join.h" -#include "arrow/compute/key_hash.h" #include "arrow/compute/exec/swiss_join_internal.h" #include "arrow/compute/exec/util.h" #include "arrow/compute/kernels/row_encoder_internal.h" +#include "arrow/compute/key_hash.h" #include "arrow/compute/row/compare_internal.h" #include "arrow/compute/row/encode_internal.h" #include "arrow/util/bit_util.h" diff --git a/cpp/src/arrow/compute/exec/swiss_join_internal.h b/cpp/src/arrow/compute/exec/swiss_join_internal.h index 4c765874bea..766f40e131c 100644 --- a/cpp/src/arrow/compute/exec/swiss_join_internal.h +++ b/cpp/src/arrow/compute/exec/swiss_join_internal.h @@ -18,12 +18,12 @@ #pragma once #include -#include "arrow/compute/key_map.h" #include "arrow/compute/exec/options.h" #include "arrow/compute/exec/partition_util.h" #include "arrow/compute/exec/schema_util.h" #include "arrow/compute/exec/task_util.h" #include "arrow/compute/kernels/row_encoder_internal.h" +#include "arrow/compute/key_map.h" #include "arrow/compute/light_array.h" #include "arrow/compute/row/encode_internal.h" diff --git a/cpp/src/arrow/compute/kernels/common_internal.h b/cpp/src/arrow/compute/kernels/common_internal.h index d5fe3e0376d..744aee23795 100644 --- a/cpp/src/arrow/compute/kernels/common_internal.h +++ b/cpp/src/arrow/compute/kernels/common_internal.h @@ -31,7 +31,6 @@ #include "arrow/buffer.h" #include "arrow/chunked_array.h" - #include "arrow/compute/function.h" #include "arrow/compute/kernel.h" #include "arrow/compute/kernels/codegen_internal.h" diff --git a/cpp/src/arrow/compute/key_hash.h b/cpp/src/arrow/compute/key_hash.h index f0056af5cb1..ddf86dfcdc0 100644 --- a/cpp/src/arrow/compute/key_hash.h +++ b/cpp/src/arrow/compute/key_hash.h @@ -23,8 +23,8 @@ #include -#include "arrow/compute/util.h" #include "arrow/compute/light_array.h" +#include "arrow/compute/util.h" namespace arrow { namespace compute { diff --git a/cpp/src/arrow/compute/key_hash_test.cc b/cpp/src/arrow/compute/key_hash_test.cc index 1ee9eb25312..d030e622641 100644 --- a/cpp/src/arrow/compute/key_hash_test.cc +++ b/cpp/src/arrow/compute/key_hash_test.cc @@ -21,9 +21,9 @@ #include #include #include "arrow/array/builder_binary.h" -#include "arrow/compute/key_hash.h" #include "arrow/compute/exec/test_util.h" #include "arrow/compute/exec/util.h" +#include "arrow/compute/key_hash.h" #include "arrow/util/cpu_info.h" #include "arrow/util/pcg_random.h" diff --git a/cpp/src/arrow/compute/key_map.cc b/cpp/src/arrow/compute/key_map.cc index 4161f1e75c3..ebbf8a7b828 100644 --- a/cpp/src/arrow/compute/key_map.cc +++ b/cpp/src/arrow/compute/key_map.cc @@ -24,7 +24,6 @@ #include "arrow/util/bitmap_ops.h" #include "arrow/util/ubsan.h" - namespace arrow { using bit_util::CountLeadingZeros; diff --git a/cpp/src/arrow/compute/key_map.h b/cpp/src/arrow/compute/key_map.h index 790c90e6411..4702c5ecc8b 100644 --- a/cpp/src/arrow/compute/key_map.h +++ b/cpp/src/arrow/compute/key_map.h @@ -234,8 +234,8 @@ uint64_t SwissTable::extract_group_id(const uint8_t* block_ptr, int slot, // bytes. We assume here that the number of bits is rounded up to 8, 16, 32 or 64. In // that case we can extract group id using aligned 64-bit word access. int num_group_id_bits = static_cast(ARROW_POPCOUNT64(group_id_mask)); - ARROW_DCHECK(num_group_id_bits == 8 || num_group_id_bits == 16 || - num_group_id_bits == 32 || num_group_id_bits == 64); + ARROW_DCHECK(num_group_id_bits == 8 || num_group_id_bits == 16 || + num_group_id_bits == 32 || num_group_id_bits == 64); int bit_offset = slot * num_group_id_bits; const uint64_t* group_id_bytes = @@ -251,8 +251,8 @@ void SwissTable::insert_into_empty_slot(uint32_t slot_id, uint32_t hash, // We assume here that the number of bits is rounded up to 8, 16, 32 or 64. // In that case we can insert group id value using aligned 64-bit word access. - ARROW_DCHECK(num_groupid_bits == 8 || num_groupid_bits == 16 || - num_groupid_bits == 32 || num_groupid_bits == 64); + ARROW_DCHECK(num_groupid_bits == 8 || num_groupid_bits == 16 || + num_groupid_bits == 32 || num_groupid_bits == 64); const uint64_t num_block_bytes = (8 + num_groupid_bits); constexpr uint64_t stamp_mask = 0x7f; @@ -267,7 +267,7 @@ void SwissTable::insert_into_empty_slot(uint32_t slot_id, uint32_t hash, int groupid_bit_offset = static_cast(start_slot * num_groupid_bits); // Block status bytes should start at an address aligned to 8 bytes - ARROW_DCHECK((reinterpret_cast(blockbase) & 7) == 0); + ARROW_DCHECK((reinterpret_cast(blockbase) & 7) == 0); uint64_t* ptr = reinterpret_cast(blockbase) + 1 + (groupid_bit_offset >> 6); *ptr |= (static_cast(group_id) << (groupid_bit_offset & 63)); } diff --git a/cpp/src/arrow/compute/light_array_test.cc b/cpp/src/arrow/compute/light_array_test.cc index 015d407e810..8b0e38e64c8 100644 --- a/cpp/src/arrow/compute/light_array_test.cc +++ b/cpp/src/arrow/compute/light_array_test.cc @@ -217,7 +217,7 @@ TEST(KeyColumnArray, SliceBool) { } // DIPO -//TEST(KeyColumnArray, FromExecBatch) { +// TEST(KeyColumnArray, FromExecBatch) { // ExecBatch batch = // ExecBatchFromJSON({int64(), boolean()}, "[[1, true], [2, false], [null, null]]"); // std::vector arrays; @@ -316,7 +316,7 @@ TEST(ResizableArrayData, Binary) { } } // DIPO -//TEST(ExecBatchBuilder, AppendBatches) { +// TEST(ExecBatchBuilder, AppendBatches) { // std::unique_ptr owned_pool = MemoryPool::CreateDefault(); // MemoryPool* pool = owned_pool.get(); // ExecBatch batch_one = @@ -339,7 +339,7 @@ TEST(ResizableArrayData, Binary) { //} // DIPO -//TEST(ExecBatchBuilder, AppendBatchesSomeRows) { +// TEST(ExecBatchBuilder, AppendBatchesSomeRows) { // std::unique_ptr owned_pool = MemoryPool::CreateDefault(); // MemoryPool* pool = owned_pool.get(); // ExecBatch batch_one = @@ -360,7 +360,7 @@ TEST(ResizableArrayData, Binary) { // ASSERT_EQ(0, pool->bytes_allocated()); //} // DIPO -//TEST(ExecBatchBuilder, AppendBatchesSomeCols) { +// TEST(ExecBatchBuilder, AppendBatchesSomeCols) { // std::unique_ptr owned_pool = MemoryPool::CreateDefault(); // MemoryPool* pool = owned_pool.get(); // ExecBatch batch_one = @@ -408,7 +408,7 @@ TEST(ResizableArrayData, Binary) { // ASSERT_EQ(0, pool->bytes_allocated()); //} // -//TEST(ExecBatchBuilder, AppendNulls) { +// TEST(ExecBatchBuilder, AppendNulls) { // std::unique_ptr owned_pool = MemoryPool::CreateDefault(); // MemoryPool* pool = owned_pool.get(); // ExecBatch batch_one = diff --git a/cpp/src/arrow/compute/row/compare_internal.h b/cpp/src/arrow/compute/row/compare_internal.h index 85a4a4f68af..778485e5c46 100644 --- a/cpp/src/arrow/compute/row/compare_internal.h +++ b/cpp/src/arrow/compute/row/compare_internal.h @@ -19,10 +19,10 @@ #include -#include "arrow/compute/util.h" #include "arrow/compute/light_array.h" #include "arrow/compute/row/encode_internal.h" #include "arrow/compute/row/row_internal.h" +#include "arrow/compute/util.h" #include "arrow/memory_pool.h" #include "arrow/result.h" #include "arrow/status.h" diff --git a/cpp/src/arrow/compute/row/encode_internal.h b/cpp/src/arrow/compute/row/encode_internal.h index 2caa02d2f9c..bdf38df4fc3 100644 --- a/cpp/src/arrow/compute/row/encode_internal.h +++ b/cpp/src/arrow/compute/row/encode_internal.h @@ -23,9 +23,9 @@ #include "arrow/array/data.h" #include "arrow/compute/key_map.h" -#include "arrow/compute/util.h" #include "arrow/compute/light_array.h" #include "arrow/compute/row/row_internal.h" +#include "arrow/compute/util.h" #include "arrow/memory_pool.h" #include "arrow/result.h" #include "arrow/status.h" diff --git a/cpp/src/arrow/compute/row/grouper.cc b/cpp/src/arrow/compute/row/grouper.cc index fce39261a73..ca26600c98b 100644 --- a/cpp/src/arrow/compute/row/grouper.cc +++ b/cpp/src/arrow/compute/row/grouper.cc @@ -20,12 +20,12 @@ #include #include -#include "arrow/compute/key_hash.h" +#include "arrow/compute/api_vector.h" #include "arrow/compute/function.h" #include "arrow/compute/kernels/row_encoder_internal.h" +#include "arrow/compute/key_hash.h" #include "arrow/compute/light_array.h" #include "arrow/compute/registry.h" -#include "arrow/compute/api_vector.h" #include "arrow/compute/row/compare_internal.h" #include "arrow/type.h" #include "arrow/util/bitmap_ops.h" diff --git a/cpp/src/arrow/compute/util.cc b/cpp/src/arrow/compute/util.cc index 572a52a5f19..78f90ea37f7 100644 --- a/cpp/src/arrow/compute/util.cc +++ b/cpp/src/arrow/compute/util.cc @@ -20,9 +20,9 @@ #include "arrow/table.h" #include "arrow/util/bit_util.h" #include "arrow/util/bitmap_ops.h" +#include "arrow/util/logging.h" #include "arrow/util/tracing_internal.h" #include "arrow/util/ubsan.h" -#include "arrow/util/logging.h" namespace arrow { @@ -30,29 +30,29 @@ using bit_util::CountTrailingZeros; namespace util { -void TempVectorStack::alloc(uint32_t num_bytes, uint8_t **data, int *id) { +void TempVectorStack::alloc(uint32_t num_bytes, uint8_t** data, int* id) { int64_t old_top = top_; top_ += PaddedAllocationSize(num_bytes) + 2 * sizeof(uint64_t); // Stack overflow check - ARROW_DCHECK(top_ <= buffer_size_); + ARROW_DCHECK(top_ <= buffer_size_); *data = buffer_->mutable_data() + old_top + sizeof(uint64_t); // We set 8 bytes before the beginning of the allocated range and // 8 bytes after the end to check for stack overflow (which would // result in those known bytes being corrupted). - reinterpret_cast(buffer_->mutable_data() + old_top)[0] = kGuard1; - reinterpret_cast(buffer_->mutable_data() + top_)[-1] = kGuard2; + reinterpret_cast(buffer_->mutable_data() + old_top)[0] = kGuard1; + reinterpret_cast(buffer_->mutable_data() + top_)[-1] = kGuard2; *id = num_vectors_++; } void TempVectorStack::release(int id, uint32_t num_bytes) { - ARROW_DCHECK(num_vectors_ == id + 1); + ARROW_DCHECK(num_vectors_ == id + 1); int64_t size = PaddedAllocationSize(num_bytes) + 2 * sizeof(uint64_t); - ARROW_DCHECK(reinterpret_cast(buffer_->mutable_data() + top_)[-1] == - kGuard2); - ARROW_DCHECK(top_ >= size); + ARROW_DCHECK(reinterpret_cast(buffer_->mutable_data() + top_)[-1] == + kGuard2); + ARROW_DCHECK(top_ >= size); top_ -= size; - ARROW_DCHECK(reinterpret_cast(buffer_->mutable_data() + top_)[0] == - kGuard1); + ARROW_DCHECK(reinterpret_cast(buffer_->mutable_data() + top_)[0] == + kGuard1); --num_vectors_; } diff --git a/cpp/src/arrow/compute/util.h b/cpp/src/arrow/compute/util.h index 8447d28261c..66082afc0e2 100644 --- a/cpp/src/arrow/compute/util.h +++ b/cpp/src/arrow/compute/util.h @@ -81,12 +81,11 @@ class MiniBatch { /// but in the context of vectorized processing where we need to store a vector of /// temporaries instead of a single value. class TempVectorStack { - template - friend - class TempVectorHolder; + template + friend class TempVectorHolder; public: - Status Init(MemoryPool *pool, int64_t size) { + Status Init(MemoryPool* pool, int64_t size) { num_vectors_ = 0; top_ = 0; buffer_size_ = PaddedAllocationSize(size) + kPadding + 2 * sizeof(uint64_t); @@ -108,7 +107,7 @@ class TempVectorStack { // return ::arrow::bit_util::RoundUp(num_bytes, sizeof(int64_t)) + kPadding; } - void alloc(uint32_t num_bytes, uint8_t **data, int *id); + void alloc(uint32_t num_bytes, uint8_t** data, int* id); void release(int id, uint32_t num_bytes); static constexpr uint64_t kGuard1 = 0x3141592653589793ULL; static constexpr uint64_t kGuard2 = 0x0577215664901532ULL; @@ -202,5 +201,5 @@ class bit_util { #endif }; -} -} \ No newline at end of file +} // namespace util +} // namespace arrow \ No newline at end of file diff --git a/cpp/src/arrow/compute/util_internal.h b/cpp/src/arrow/compute/util_internal.h index 9d45ede940f..87e89a33507 100644 --- a/cpp/src/arrow/compute/util_internal.h +++ b/cpp/src/arrow/compute/util_internal.h @@ -22,9 +22,10 @@ namespace arrow { namespace util { -template void CheckAlignment(const void *ptr) { - ARROW_DCHECK(reinterpret_cast(ptr) % sizeof(T) == 0); +template +void CheckAlignment(const void* ptr) { + ARROW_DCHECK(reinterpret_cast(ptr) % sizeof(T) == 0); } -} -} +} // namespace util +} // namespace arrow From b81e2713fcf087edb3ca77bae6201bed1064262e Mon Sep 17 00:00:00 2001 From: Davide Pasetto Date: Wed, 8 Mar 2023 17:08:32 -0500 Subject: [PATCH 08/11] fix light_array test --- cpp/src/arrow/compute/exec/CMakeLists.txt | 6 +- .../compute/exec/light_array_exec_test.cc | 172 ++++++++++++++++++ cpp/src/arrow/compute/light_array_test.cc | 144 --------------- 3 files changed, 177 insertions(+), 145 deletions(-) create mode 100644 cpp/src/arrow/compute/exec/light_array_exec_test.cc diff --git a/cpp/src/arrow/compute/exec/CMakeLists.txt b/cpp/src/arrow/compute/exec/CMakeLists.txt index cc8e7175a2a..ec854c83936 100644 --- a/cpp/src/arrow/compute/exec/CMakeLists.txt +++ b/cpp/src/arrow/compute/exec/CMakeLists.txt @@ -53,7 +53,6 @@ add_arrow_compute_test(pivot_longer_node_test SOURCES pivot_longer_node_test.cc test_nodes.cc) - add_arrow_compute_test(asof_join_node_test REQUIRE_ALL_KERNELS PREFIX @@ -70,6 +69,11 @@ add_arrow_compute_test(util_test SOURCES util_test.cc task_util_test.cc) +add_arrow_compute_test(light_array_exec_test + PREFIX + "arrow-compute" + SOURCES + light_array_exec_test.cc) add_arrow_benchmark(expression_benchmark PREFIX "arrow-compute") diff --git a/cpp/src/arrow/compute/exec/light_array_exec_test.cc b/cpp/src/arrow/compute/exec/light_array_exec_test.cc new file mode 100644 index 00000000000..ff452fa91fe --- /dev/null +++ b/cpp/src/arrow/compute/exec/light_array_exec_test.cc @@ -0,0 +1,172 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "arrow/compute/light_array.h" + +#include + +#include "arrow/compute/exec/test_util.h" +#include "arrow/testing/gtest_util.h" +#include "arrow/type.h" +#include "arrow/util/checked_cast.h" + +namespace arrow { +namespace compute { + + TEST(KeyColumnArray, FromExecBatch) { + ExecBatch batch = + ExecBatchFromJSON({int64(), boolean()}, "[[1, true], [2, false], [null, null]]"); + std::vector arrays; + ASSERT_OK(ColumnArraysFromExecBatch(batch, &arrays)); + + ASSERT_EQ(2, arrays.size()); + ASSERT_EQ(8, arrays[0].metadata().fixed_length); + ASSERT_EQ(0, arrays[1].metadata().fixed_length); + ASSERT_EQ(3, arrays[0].length()); + ASSERT_EQ(3, arrays[1].length()); + + ASSERT_OK(ColumnArraysFromExecBatch(batch, 1, 1, &arrays)); + + ASSERT_EQ(2, arrays.size()); + ASSERT_EQ(8, arrays[0].metadata().fixed_length); + ASSERT_EQ(0, arrays[1].metadata().fixed_length); + ASSERT_EQ(1, arrays[0].length()); + ASSERT_EQ(1, arrays[1].length()); +} + +TEST(ExecBatchBuilder, AppendBatches) { + std::unique_ptr owned_pool = MemoryPool::CreateDefault(); + MemoryPool* pool = owned_pool.get(); + ExecBatch batch_one = + ExecBatchFromJSON({int64(), boolean()}, "[[1, true], [2, false], [null, null]]"); + ExecBatch batch_two = + ExecBatchFromJSON({int64(), boolean()}, "[[null, true], [5, true], [6, false]]"); + ExecBatch combined = ExecBatchFromJSON( + {int64(), boolean()}, + "[[1, true], [2, false], [null, null], [null, true], [5, true], [6, false]]"); + { + ExecBatchBuilder builder; + uint16_t row_ids[3] = {0, 1, 2}; + ASSERT_OK(builder.AppendSelected(pool, batch_one, 3, row_ids, /*num_cols=*/2)); + ASSERT_OK(builder.AppendSelected(pool, batch_two, 3, row_ids, /*num_cols=*/2)); + ExecBatch built = builder.Flush(); + ASSERT_EQ(combined, built); + ASSERT_NE(0, pool->bytes_allocated()); + } + ASSERT_EQ(0, pool->bytes_allocated()); +} + +TEST(ExecBatchBuilder, AppendBatchesSomeRows) { + std::unique_ptr owned_pool = MemoryPool::CreateDefault(); + MemoryPool* pool = owned_pool.get(); + ExecBatch batch_one = + ExecBatchFromJSON({int64(), boolean()}, "[[1, true], [2, false], [null, null]]"); + ExecBatch batch_two = + ExecBatchFromJSON({int64(), boolean()}, "[[null, true], [5, true], [6, false]]"); + ExecBatch combined = ExecBatchFromJSON( + {int64(), boolean()}, "[[1, true], [2, false], [null, true], [5, true]]"); + { + ExecBatchBuilder builder; + uint16_t row_ids[2] = {0, 1}; + ASSERT_OK(builder.AppendSelected(pool, batch_one, 2, row_ids, /*num_cols=*/2)); + ASSERT_OK(builder.AppendSelected(pool, batch_two, 2, row_ids, /*num_cols=*/2)); + ExecBatch built = builder.Flush(); + ASSERT_EQ(combined, built); + ASSERT_NE(0, pool->bytes_allocated()); + } + ASSERT_EQ(0, pool->bytes_allocated()); +} + +TEST(ExecBatchBuilder, AppendBatchesSomeCols) { + std::unique_ptr owned_pool = MemoryPool::CreateDefault(); + MemoryPool* pool = owned_pool.get(); + ExecBatch batch_one = + ExecBatchFromJSON({int64(), boolean()}, "[[1, true], [2, false], [null, null]]"); + ExecBatch batch_two = + ExecBatchFromJSON({int64(), boolean()}, "[[null, true], [5, true], [6, false]]"); + ExecBatch first_col_only = + ExecBatchFromJSON({int64()}, "[[1], [2], [null], [null], [5], [6]]"); + ExecBatch last_col_only = ExecBatchFromJSON( + {boolean()}, "[[true], [false], [null], [true], [true], [false]]"); + { + ExecBatchBuilder builder; + uint16_t row_ids[3] = {0, 1, 2}; + int first_col_ids[1] = {0}; + ASSERT_OK(builder.AppendSelected(pool, batch_one, 3, row_ids, /*num_cols=*/1, + first_col_ids)); + ASSERT_OK(builder.AppendSelected(pool, batch_two, 3, row_ids, /*num_cols=*/1, + first_col_ids)); + ExecBatch built = builder.Flush(); + ASSERT_EQ(first_col_only, built); + ASSERT_NE(0, pool->bytes_allocated()); + } + { + ExecBatchBuilder builder; + uint16_t row_ids[3] = {0, 1, 2}; + // If we don't specify col_ids and num_cols is 1 it is implicitly the first col + ASSERT_OK(builder.AppendSelected(pool, batch_one, 3, row_ids, /*num_cols=*/1)); + ASSERT_OK(builder.AppendSelected(pool, batch_two, 3, row_ids, /*num_cols=*/1)); + ExecBatch built = builder.Flush(); + ASSERT_EQ(first_col_only, built); + ASSERT_NE(0, pool->bytes_allocated()); + } + { + ExecBatchBuilder builder; + uint16_t row_ids[3] = {0, 1, 2}; + int last_col_ids[1] = {1}; + ASSERT_OK(builder.AppendSelected(pool, batch_one, 3, row_ids, /*num_cols=*/1, + last_col_ids)); + ASSERT_OK(builder.AppendSelected(pool, batch_two, 3, row_ids, /*num_cols=*/1, + last_col_ids)); + ExecBatch built = builder.Flush(); + ASSERT_EQ(last_col_only, built); + ASSERT_NE(0, pool->bytes_allocated()); + } + ASSERT_EQ(0, pool->bytes_allocated()); +} + + TEST(ExecBatchBuilder, AppendNulls) { + std::unique_ptr owned_pool = MemoryPool::CreateDefault(); + MemoryPool* pool = owned_pool.get(); + ExecBatch batch_one = + ExecBatchFromJSON({int64(), boolean()}, "[[1, true], [2, false], [null, null]]"); + ExecBatch combined = ExecBatchFromJSON( + {int64(), boolean()}, + "[[1, true], [2, false], [null, null], [null, null], [null, null]]"); + ExecBatch just_nulls = + ExecBatchFromJSON({int64(), boolean()}, "[[null, null], [null, null]]"); + { + ExecBatchBuilder builder; + uint16_t row_ids[3] = {0, 1, 2}; + ASSERT_OK(builder.AppendSelected(pool, batch_one, 3, row_ids, /*num_cols=*/2)); + ASSERT_OK(builder.AppendNulls(pool, {int64(), boolean()}, 2)); + ExecBatch built = builder.Flush(); + ASSERT_EQ(combined, built); + ASSERT_NE(0, pool->bytes_allocated()); + } + { + ExecBatchBuilder builder; + ASSERT_OK(builder.AppendNulls(pool, {int64(), boolean()}, 2)); + ExecBatch built = builder.Flush(); + ASSERT_EQ(just_nulls, built); + ASSERT_NE(0, pool->bytes_allocated()); + } + ASSERT_EQ(0, pool->bytes_allocated()); +} + +} // namespace compute +} // namespace arrow diff --git a/cpp/src/arrow/compute/light_array_test.cc b/cpp/src/arrow/compute/light_array_test.cc index 8b0e38e64c8..c5e83b546b6 100644 --- a/cpp/src/arrow/compute/light_array_test.cc +++ b/cpp/src/arrow/compute/light_array_test.cc @@ -20,7 +20,6 @@ #include #include -// DIPO #include "arrow/compute/exec/test_util.h" #include "arrow/testing/generator.h" #include "arrow/testing/gtest_util.h" #include "arrow/type.h" @@ -216,28 +215,6 @@ TEST(KeyColumnArray, SliceBool) { } } -// DIPO -// TEST(KeyColumnArray, FromExecBatch) { -// ExecBatch batch = -// ExecBatchFromJSON({int64(), boolean()}, "[[1, true], [2, false], [null, null]]"); -// std::vector arrays; -// ASSERT_OK(ColumnArraysFromExecBatch(batch, &arrays)); -// -// ASSERT_EQ(2, arrays.size()); -// ASSERT_EQ(8, arrays[0].metadata().fixed_length); -// ASSERT_EQ(0, arrays[1].metadata().fixed_length); -// ASSERT_EQ(3, arrays[0].length()); -// ASSERT_EQ(3, arrays[1].length()); -// -// ASSERT_OK(ColumnArraysFromExecBatch(batch, 1, 1, &arrays)); -// -// ASSERT_EQ(2, arrays.size()); -// ASSERT_EQ(8, arrays[0].metadata().fixed_length); -// ASSERT_EQ(0, arrays[1].metadata().fixed_length); -// ASSERT_EQ(1, arrays[0].length()); -// ASSERT_EQ(1, arrays[1].length()); -//} - TEST(ResizableArrayData, Basic) { std::unique_ptr pool = MemoryPool::CreateDefault(); for (const auto& type : kSampleFixedDataTypes) { @@ -315,127 +292,6 @@ TEST(ResizableArrayData, Binary) { ASSERT_EQ(0, pool->bytes_allocated()); } } -// DIPO -// TEST(ExecBatchBuilder, AppendBatches) { -// std::unique_ptr owned_pool = MemoryPool::CreateDefault(); -// MemoryPool* pool = owned_pool.get(); -// ExecBatch batch_one = -// ExecBatchFromJSON({int64(), boolean()}, "[[1, true], [2, false], [null, null]]"); -// ExecBatch batch_two = -// ExecBatchFromJSON({int64(), boolean()}, "[[null, true], [5, true], [6, false]]"); -// ExecBatch combined = ExecBatchFromJSON( -// {int64(), boolean()}, -// "[[1, true], [2, false], [null, null], [null, true], [5, true], [6, false]]"); -// { -// ExecBatchBuilder builder; -// uint16_t row_ids[3] = {0, 1, 2}; -// ASSERT_OK(builder.AppendSelected(pool, batch_one, 3, row_ids, /*num_cols=*/2)); -// ASSERT_OK(builder.AppendSelected(pool, batch_two, 3, row_ids, /*num_cols=*/2)); -// ExecBatch built = builder.Flush(); -// ASSERT_EQ(combined, built); -// ASSERT_NE(0, pool->bytes_allocated()); -// } -// ASSERT_EQ(0, pool->bytes_allocated()); -//} - -// DIPO -// TEST(ExecBatchBuilder, AppendBatchesSomeRows) { -// std::unique_ptr owned_pool = MemoryPool::CreateDefault(); -// MemoryPool* pool = owned_pool.get(); -// ExecBatch batch_one = -// ExecBatchFromJSON({int64(), boolean()}, "[[1, true], [2, false], [null, null]]"); -// ExecBatch batch_two = -// ExecBatchFromJSON({int64(), boolean()}, "[[null, true], [5, true], [6, false]]"); -// ExecBatch combined = ExecBatchFromJSON( -// {int64(), boolean()}, "[[1, true], [2, false], [null, true], [5, true]]"); -// { -// ExecBatchBuilder builder; -// uint16_t row_ids[2] = {0, 1}; -// ASSERT_OK(builder.AppendSelected(pool, batch_one, 2, row_ids, /*num_cols=*/2)); -// ASSERT_OK(builder.AppendSelected(pool, batch_two, 2, row_ids, /*num_cols=*/2)); -// ExecBatch built = builder.Flush(); -// ASSERT_EQ(combined, built); -// ASSERT_NE(0, pool->bytes_allocated()); -// } -// ASSERT_EQ(0, pool->bytes_allocated()); -//} -// DIPO -// TEST(ExecBatchBuilder, AppendBatchesSomeCols) { -// std::unique_ptr owned_pool = MemoryPool::CreateDefault(); -// MemoryPool* pool = owned_pool.get(); -// ExecBatch batch_one = -// ExecBatchFromJSON({int64(), boolean()}, "[[1, true], [2, false], [null, null]]"); -// ExecBatch batch_two = -// ExecBatchFromJSON({int64(), boolean()}, "[[null, true], [5, true], [6, false]]"); -// ExecBatch first_col_only = -// ExecBatchFromJSON({int64()}, "[[1], [2], [null], [null], [5], [6]]"); -// ExecBatch last_col_only = ExecBatchFromJSON( -// {boolean()}, "[[true], [false], [null], [true], [true], [false]]"); -// { -// ExecBatchBuilder builder; -// uint16_t row_ids[3] = {0, 1, 2}; -// int first_col_ids[1] = {0}; -// ASSERT_OK(builder.AppendSelected(pool, batch_one, 3, row_ids, /*num_cols=*/1, -// first_col_ids)); -// ASSERT_OK(builder.AppendSelected(pool, batch_two, 3, row_ids, /*num_cols=*/1, -// first_col_ids)); -// ExecBatch built = builder.Flush(); -// ASSERT_EQ(first_col_only, built); -// ASSERT_NE(0, pool->bytes_allocated()); -// } -// { -// ExecBatchBuilder builder; -// uint16_t row_ids[3] = {0, 1, 2}; -// // If we don't specify col_ids and num_cols is 1 it is implicitly the first col -// ASSERT_OK(builder.AppendSelected(pool, batch_one, 3, row_ids, /*num_cols=*/1)); -// ASSERT_OK(builder.AppendSelected(pool, batch_two, 3, row_ids, /*num_cols=*/1)); -// ExecBatch built = builder.Flush(); -// ASSERT_EQ(first_col_only, built); -// ASSERT_NE(0, pool->bytes_allocated()); -// } -// { -// ExecBatchBuilder builder; -// uint16_t row_ids[3] = {0, 1, 2}; -// int last_col_ids[1] = {1}; -// ASSERT_OK(builder.AppendSelected(pool, batch_one, 3, row_ids, /*num_cols=*/1, -// last_col_ids)); -// ASSERT_OK(builder.AppendSelected(pool, batch_two, 3, row_ids, /*num_cols=*/1, -// last_col_ids)); -// ExecBatch built = builder.Flush(); -// ASSERT_EQ(last_col_only, built); -// ASSERT_NE(0, pool->bytes_allocated()); -// } -// ASSERT_EQ(0, pool->bytes_allocated()); -//} -// -// TEST(ExecBatchBuilder, AppendNulls) { -// std::unique_ptr owned_pool = MemoryPool::CreateDefault(); -// MemoryPool* pool = owned_pool.get(); -// ExecBatch batch_one = -// ExecBatchFromJSON({int64(), boolean()}, "[[1, true], [2, false], [null, null]]"); -// ExecBatch combined = ExecBatchFromJSON( -// {int64(), boolean()}, -// "[[1, true], [2, false], [null, null], [null, null], [null, null]]"); -// ExecBatch just_nulls = -// ExecBatchFromJSON({int64(), boolean()}, "[[null, null], [null, null]]"); -// { -// ExecBatchBuilder builder; -// uint16_t row_ids[3] = {0, 1, 2}; -// ASSERT_OK(builder.AppendSelected(pool, batch_one, 3, row_ids, /*num_cols=*/2)); -// ASSERT_OK(builder.AppendNulls(pool, {int64(), boolean()}, 2)); -// ExecBatch built = builder.Flush(); -// ASSERT_EQ(combined, built); -// ASSERT_NE(0, pool->bytes_allocated()); -// } -// { -// ExecBatchBuilder builder; -// ASSERT_OK(builder.AppendNulls(pool, {int64(), boolean()}, 2)); -// ExecBatch built = builder.Flush(); -// ASSERT_EQ(just_nulls, built); -// ASSERT_NE(0, pool->bytes_allocated()); -// } -// ASSERT_EQ(0, pool->bytes_allocated()); -//} TEST(ExecBatchBuilder, AppendNullsBeyondLimit) { std::unique_ptr owned_pool = MemoryPool::CreateDefault(); From 2401904ed04db5bfbd2d2ca7985e2017fee4b3f4 Mon Sep 17 00:00:00 2001 From: Davide Pasetto Date: Wed, 8 Mar 2023 17:26:24 -0500 Subject: [PATCH 09/11] fix format --- cpp/src/arrow/compute/exec/light_array_exec_test.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/compute/exec/light_array_exec_test.cc b/cpp/src/arrow/compute/exec/light_array_exec_test.cc index ff452fa91fe..cbc93299a5b 100644 --- a/cpp/src/arrow/compute/exec/light_array_exec_test.cc +++ b/cpp/src/arrow/compute/exec/light_array_exec_test.cc @@ -27,7 +27,7 @@ namespace arrow { namespace compute { - TEST(KeyColumnArray, FromExecBatch) { +TEST(KeyColumnArray, FromExecBatch) { ExecBatch batch = ExecBatchFromJSON({int64(), boolean()}, "[[1, true], [2, false], [null, null]]"); std::vector arrays; @@ -139,7 +139,7 @@ TEST(ExecBatchBuilder, AppendBatchesSomeCols) { ASSERT_EQ(0, pool->bytes_allocated()); } - TEST(ExecBatchBuilder, AppendNulls) { +TEST(ExecBatchBuilder, AppendNulls) { std::unique_ptr owned_pool = MemoryPool::CreateDefault(); MemoryPool* pool = owned_pool.get(); ExecBatch batch_one = From 93778dbe4486f1415c51fdfcffb5e1a440f9ddec Mon Sep 17 00:00:00 2001 From: Davide Pasetto Date: Thu, 9 Mar 2023 09:00:02 -0500 Subject: [PATCH 10/11] fix lint --- cpp/src/arrow/compute/key_hash.cc | 2 +- cpp/src/arrow/compute/util.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/compute/key_hash.cc b/cpp/src/arrow/compute/key_hash.cc index 993d68b1fa3..3fcfbf3d831 100644 --- a/cpp/src/arrow/compute/key_hash.cc +++ b/cpp/src/arrow/compute/key_hash.cc @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. -#include "key_hash.h" +#include "arrow/compute/key_hash.h" #include diff --git a/cpp/src/arrow/compute/util.h b/cpp/src/arrow/compute/util.h index 66082afc0e2..60c20137c8c 100644 --- a/cpp/src/arrow/compute/util.h +++ b/cpp/src/arrow/compute/util.h @@ -202,4 +202,4 @@ class bit_util { }; } // namespace util -} // namespace arrow \ No newline at end of file +} // namespace arrow From 24edd64febbf0288a780b7e4e5c8600f5f50657f Mon Sep 17 00:00:00 2001 From: Davide Pasetto Date: Thu, 9 Mar 2023 09:22:29 -0500 Subject: [PATCH 11/11] fix cmake format --- cpp/src/arrow/compute/exec/CMakeLists.txt | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/cpp/src/arrow/compute/exec/CMakeLists.txt b/cpp/src/arrow/compute/exec/CMakeLists.txt index ec854c83936..914840b0396 100644 --- a/cpp/src/arrow/compute/exec/CMakeLists.txt +++ b/cpp/src/arrow/compute/exec/CMakeLists.txt @@ -70,10 +70,10 @@ add_arrow_compute_test(util_test util_test.cc task_util_test.cc) add_arrow_compute_test(light_array_exec_test - PREFIX - "arrow-compute" - SOURCES - light_array_exec_test.cc) + PREFIX + "arrow-compute" + SOURCES + light_array_exec_test.cc) add_arrow_benchmark(expression_benchmark PREFIX "arrow-compute")