From 3701b4700d7ed0ed6ff20778e5974afa14fcf4a1 Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Wed, 3 Sep 2025 17:36:34 +0200 Subject: [PATCH 01/17] Expose unpack32_default and unpack64_default --- cpp/src/arrow/util/bit_stream_utils_internal.h | 1 + cpp/src/arrow/util/bpacking.cc | 8 ++------ cpp/src/arrow/util/bpacking64_default_internal.h | 8 +++----- cpp/src/arrow/util/bpacking_internal.h | 15 +++++++++------ 4 files changed, 15 insertions(+), 17 deletions(-) diff --git a/cpp/src/arrow/util/bit_stream_utils_internal.h b/cpp/src/arrow/util/bit_stream_utils_internal.h index 9d67c278bcc..3d1afe88261 100644 --- a/cpp/src/arrow/util/bit_stream_utils_internal.h +++ b/cpp/src/arrow/util/bit_stream_utils_internal.h @@ -25,6 +25,7 @@ #include "arrow/util/bit_util.h" #include "arrow/util/bpacking_internal.h" +#include "arrow/util/endian.h" #include "arrow/util/logging.h" #include "arrow/util/macros.h" #include "arrow/util/ubsan.h" diff --git a/cpp/src/arrow/util/bpacking.cc b/cpp/src/arrow/util/bpacking.cc index 326dd050fe1..a38ffcf2a2e 100644 --- a/cpp/src/arrow/util/bpacking.cc +++ b/cpp/src/arrow/util/bpacking.cc @@ -36,8 +36,6 @@ namespace arrow { namespace internal { -namespace { - int unpack32_default(const uint32_t* in, uint32_t* out, int batch_size, int num_bits) { batch_size = batch_size / 32 * 32; int num_loops = batch_size / 32; @@ -149,6 +147,8 @@ int unpack32_default(const uint32_t* in, uint32_t* out, int batch_size, int num_ return batch_size; } +namespace { + struct Unpack32DynamicFunction { using FunctionType = decltype(&unpack32_default); @@ -177,8 +177,6 @@ int unpack32(const uint32_t* in, uint32_t* out, int batch_size, int num_bits) { #endif } -namespace { - int unpack64_default(const uint8_t* in, uint64_t* out, int batch_size, int num_bits) { batch_size = batch_size / 32 * 32; int num_loops = batch_size / 32; @@ -386,8 +384,6 @@ int unpack64_default(const uint8_t* in, uint64_t* out, int batch_size, int num_b return batch_size; } -} // namespace - int unpack64(const uint8_t* in, uint64_t* out, int batch_size, int num_bits) { // TODO: unpack64_neon, unpack64_avx2 and unpack64_avx512 return unpack64_default(in, out, batch_size, num_bits); diff --git a/cpp/src/arrow/util/bpacking64_default_internal.h b/cpp/src/arrow/util/bpacking64_default_internal.h index 4f45619b2a7..256cdda87e3 100644 --- a/cpp/src/arrow/util/bpacking64_default_internal.h +++ b/cpp/src/arrow/util/bpacking64_default_internal.h @@ -26,11 +26,10 @@ #pragma once -#include "arrow/util/bit_util.h" +#include "arrow/util/endian.h" #include "arrow/util/ubsan.h" -namespace arrow { -namespace internal { +namespace arrow::internal { inline const uint8_t* unpack0_64(const uint8_t* in, uint64_t* out) { for (int k = 0; k < 32; k += 1) { @@ -5638,5 +5637,4 @@ inline const uint8_t* unpack64_64(const uint8_t* in, uint64_t* out) { return in; } -} // namespace internal -} // namespace arrow +} // namespace arrow::internal diff --git a/cpp/src/arrow/util/bpacking_internal.h b/cpp/src/arrow/util/bpacking_internal.h index dd85c1638c7..dd3e3ca4c3f 100644 --- a/cpp/src/arrow/util/bpacking_internal.h +++ b/cpp/src/arrow/util/bpacking_internal.h @@ -17,18 +17,21 @@ #pragma once -#include "arrow/util/endian.h" #include "arrow/util/visibility.h" -#include +#include -namespace arrow { -namespace internal { +namespace arrow::internal { + +/// The scalar 32 bit unpacking. +int unpack32_default(const uint32_t* in, uint32_t* out, int batch_size, int num_bits); + +/// The scalar 64 bit unpacking. +int unpack64_default(const uint8_t* in, uint64_t* out, int batch_size, int num_bits); ARROW_EXPORT int unpack32(const uint32_t* in, uint32_t* out, int batch_size, int num_bits); ARROW_EXPORT int unpack64(const uint8_t* in, uint64_t* out, int batch_size, int num_bits); -} // namespace internal -} // namespace arrow +} // namespace arrow::internal From f23171072fddd869b5e6c3ffc63b904baa092e0b Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Wed, 3 Sep 2025 18:09:50 +0200 Subject: [PATCH 02/17] Uniform unpack signature --- cpp/src/arrow/util/bit_stream_utils_internal.h | 7 +++---- cpp/src/arrow/util/bpacking.cc | 6 ++++-- cpp/src/arrow/util/bpacking_avx2.cc | 12 +++++------- cpp/src/arrow/util/bpacking_avx2_internal.h | 10 ++++------ cpp/src/arrow/util/bpacking_avx512.cc | 12 +++++------- cpp/src/arrow/util/bpacking_avx512_internal.h | 10 ++++------ cpp/src/arrow/util/bpacking_internal.h | 4 ++-- cpp/src/arrow/util/bpacking_neon.cc | 12 +++++------- cpp/src/arrow/util/bpacking_neon_internal.h | 10 ++++------ 9 files changed, 36 insertions(+), 47 deletions(-) diff --git a/cpp/src/arrow/util/bit_stream_utils_internal.h b/cpp/src/arrow/util/bit_stream_utils_internal.h index 3d1afe88261..2b5ec3830ee 100644 --- a/cpp/src/arrow/util/bit_stream_utils_internal.h +++ b/cpp/src/arrow/util/bit_stream_utils_internal.h @@ -340,8 +340,8 @@ inline int BitReader::GetBatch(int num_bits, T* v, int batch_size) { if (sizeof(T) == 4) { int num_unpacked = - internal::unpack32(reinterpret_cast(buffer + byte_offset), - reinterpret_cast(v + i), batch_size - i, num_bits); + internal::unpack32(buffer + byte_offset, reinterpret_cast(v + i), + batch_size - i, num_bits); i += num_unpacked; byte_offset += num_unpacked * num_bits / 8; } else if (sizeof(T) == 8 && num_bits > 32) { @@ -361,8 +361,7 @@ inline int BitReader::GetBatch(int num_bits, T* v, int batch_size) { while (i < batch_size) { int unpack_size = std::min(buffer_size, batch_size - i); int num_unpacked = - internal::unpack32(reinterpret_cast(buffer + byte_offset), - unpack_buffer, unpack_size, num_bits); + internal::unpack32(buffer + byte_offset, unpack_buffer, unpack_size, num_bits); if (num_unpacked == 0) { break; } diff --git a/cpp/src/arrow/util/bpacking.cc b/cpp/src/arrow/util/bpacking.cc index a38ffcf2a2e..a54fb340393 100644 --- a/cpp/src/arrow/util/bpacking.cc +++ b/cpp/src/arrow/util/bpacking.cc @@ -36,7 +36,9 @@ namespace arrow { namespace internal { -int unpack32_default(const uint32_t* in, uint32_t* out, int batch_size, int num_bits) { +int unpack32_default(const uint8_t* in_, uint32_t* out, int batch_size, int num_bits) { + const uint32_t* in = reinterpret_cast(in_); + batch_size = batch_size / 32 * 32; int num_loops = batch_size / 32; @@ -168,7 +170,7 @@ struct Unpack32DynamicFunction { } // namespace -int unpack32(const uint32_t* in, uint32_t* out, int batch_size, int num_bits) { +int unpack32(const uint8_t* in, uint32_t* out, int batch_size, int num_bits) { #if defined(ARROW_HAVE_NEON) return unpack32_neon(in, out, batch_size, num_bits); #else diff --git a/cpp/src/arrow/util/bpacking_avx2.cc b/cpp/src/arrow/util/bpacking_avx2.cc index 02510a07b9f..84f091594c1 100644 --- a/cpp/src/arrow/util/bpacking_avx2.cc +++ b/cpp/src/arrow/util/bpacking_avx2.cc @@ -19,13 +19,11 @@ #include "arrow/util/bpacking_simd256_generated_internal.h" #include "arrow/util/bpacking_simd_internal.h" -namespace arrow { -namespace internal { +namespace arrow::internal { -int unpack32_avx2(const uint32_t* in, uint32_t* out, int batch_size, int num_bits) { - return unpack32_specialized>(in, out, batch_size, - num_bits); +int unpack32_avx2(const uint8_t* in, uint32_t* out, int batch_size, int num_bits) { + return unpack32_specialized>( + reinterpret_cast(in), out, batch_size, num_bits); } -} // namespace internal -} // namespace arrow +} // namespace arrow::internal diff --git a/cpp/src/arrow/util/bpacking_avx2_internal.h b/cpp/src/arrow/util/bpacking_avx2_internal.h index 7a7d8bf8c44..99676b6215f 100644 --- a/cpp/src/arrow/util/bpacking_avx2_internal.h +++ b/cpp/src/arrow/util/bpacking_avx2_internal.h @@ -17,12 +17,10 @@ #pragma once -#include +#include -namespace arrow { -namespace internal { +namespace arrow::internal { -int unpack32_avx2(const uint32_t* in, uint32_t* out, int batch_size, int num_bits); +int unpack32_avx2(const uint8_t* in, uint32_t* out, int batch_size, int num_bits); -} // namespace internal -} // namespace arrow +} // namespace arrow::internal diff --git a/cpp/src/arrow/util/bpacking_avx512.cc b/cpp/src/arrow/util/bpacking_avx512.cc index 6272ef1cde8..35de0dd5b47 100644 --- a/cpp/src/arrow/util/bpacking_avx512.cc +++ b/cpp/src/arrow/util/bpacking_avx512.cc @@ -19,13 +19,11 @@ #include "arrow/util/bpacking_simd512_generated_internal.h" #include "arrow/util/bpacking_simd_internal.h" -namespace arrow { -namespace internal { +namespace arrow::internal { -int unpack32_avx512(const uint32_t* in, uint32_t* out, int batch_size, int num_bits) { - return unpack32_specialized>(in, out, batch_size, - num_bits); +int unpack32_avx512(const uint8_t* in, uint32_t* out, int batch_size, int num_bits) { + return unpack32_specialized>( + reinterpret_cast(in), out, batch_size, num_bits); } -} // namespace internal -} // namespace arrow +} // namespace arrow::internal diff --git a/cpp/src/arrow/util/bpacking_avx512_internal.h b/cpp/src/arrow/util/bpacking_avx512_internal.h index 96723f803e0..67970c555ba 100644 --- a/cpp/src/arrow/util/bpacking_avx512_internal.h +++ b/cpp/src/arrow/util/bpacking_avx512_internal.h @@ -17,12 +17,10 @@ #pragma once -#include +#include -namespace arrow { -namespace internal { +namespace arrow::internal { -int unpack32_avx512(const uint32_t* in, uint32_t* out, int batch_size, int num_bits); +int unpack32_avx512(const uint8_t* in, uint32_t* out, int batch_size, int num_bits); -} // namespace internal -} // namespace arrow +} // namespace arrow::internal diff --git a/cpp/src/arrow/util/bpacking_internal.h b/cpp/src/arrow/util/bpacking_internal.h index dd3e3ca4c3f..60a2e7d16ac 100644 --- a/cpp/src/arrow/util/bpacking_internal.h +++ b/cpp/src/arrow/util/bpacking_internal.h @@ -24,13 +24,13 @@ namespace arrow::internal { /// The scalar 32 bit unpacking. -int unpack32_default(const uint32_t* in, uint32_t* out, int batch_size, int num_bits); +int unpack32_default(const uint8_t* in, uint32_t* out, int batch_size, int num_bits); /// The scalar 64 bit unpacking. int unpack64_default(const uint8_t* in, uint64_t* out, int batch_size, int num_bits); ARROW_EXPORT -int unpack32(const uint32_t* in, uint32_t* out, int batch_size, int num_bits); +int unpack32(const uint8_t* in, uint32_t* out, int batch_size, int num_bits); ARROW_EXPORT int unpack64(const uint8_t* in, uint64_t* out, int batch_size, int num_bits); diff --git a/cpp/src/arrow/util/bpacking_neon.cc b/cpp/src/arrow/util/bpacking_neon.cc index 72b520e8cf1..407b309b7e8 100644 --- a/cpp/src/arrow/util/bpacking_neon.cc +++ b/cpp/src/arrow/util/bpacking_neon.cc @@ -19,13 +19,11 @@ #include "arrow/util/bpacking_simd128_generated_internal.h" #include "arrow/util/bpacking_simd_internal.h" -namespace arrow { -namespace internal { +namespace arrow::internal { -int unpack32_neon(const uint32_t* in, uint32_t* out, int batch_size, int num_bits) { - return unpack32_specialized>(in, out, batch_size, - num_bits); +int unpack32_neon(const uint8_t* in, uint32_t* out, int batch_size, int num_bits) { + return unpack32_specialized>( + reinterpret_cast(in), out, batch_size, num_bits); } -} // namespace internal -} // namespace arrow +} // namespace arrow::internal diff --git a/cpp/src/arrow/util/bpacking_neon_internal.h b/cpp/src/arrow/util/bpacking_neon_internal.h index 9d02cd568ac..cd3636cc6b2 100644 --- a/cpp/src/arrow/util/bpacking_neon_internal.h +++ b/cpp/src/arrow/util/bpacking_neon_internal.h @@ -17,12 +17,10 @@ #pragma once -#include +#include -namespace arrow { -namespace internal { +namespace arrow::internal { -int unpack32_neon(const uint32_t* in, uint32_t* out, int batch_size, int num_bits); +int unpack32_neon(const uint8_t* in, uint32_t* out, int batch_size, int num_bits); -} // namespace internal -} // namespace arrow +} // namespace arrow::internal From 0a0f22b6650e7eeb09828828d2b1d5802dd56d76 Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Fri, 5 Sep 2025 10:16:56 +0200 Subject: [PATCH 03/17] Add unpack tests --- cpp/src/arrow/util/CMakeLists.txt | 1 + cpp/src/arrow/util/bpacking_test.cc | 141 ++++++++++++++++++++++++++++ 2 files changed, 142 insertions(+) create mode 100644 cpp/src/arrow/util/bpacking_test.cc diff --git a/cpp/src/arrow/util/CMakeLists.txt b/cpp/src/arrow/util/CMakeLists.txt index df47389240e..25ef25fa7d7 100644 --- a/cpp/src/arrow/util/CMakeLists.txt +++ b/cpp/src/arrow/util/CMakeLists.txt @@ -98,6 +98,7 @@ add_arrow_test(bit-utility-test SOURCES bit_block_counter_test.cc bit_util_test.cc + bpacking_test.cc rle_encoding_test.cc) add_arrow_test(threading-utility-test diff --git a/cpp/src/arrow/util/bpacking_test.cc b/cpp/src/arrow/util/bpacking_test.cc new file mode 100644 index 00000000000..090eb123fb7 --- /dev/null +++ b/cpp/src/arrow/util/bpacking_test.cc @@ -0,0 +1,141 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include +#include + +#include + +#include "arrow/testing/util.h" +#include "arrow/util/bit_stream_utils_internal.h" +#include "arrow/util/bpacking_internal.h" +#include "arrow/util/bpacking_neon_internal.h" +#include "arrow/util/logging.h" + +namespace arrow::internal { + +template +using UnpackFunc = int (*)(const uint8_t*, Int*, int, int); + +/// Generate random bytes as packed integers. +std::vector generateRandomPackedValues(int32_t num_values, int32_t bit_width) { + constexpr uint32_t kSeed = 3214; + + auto const num_bits = num_values * bit_width; + if (num_bits % 8 != 0) { + throw std::invalid_argument("Must generate a multiple of 8 bits."); + } + auto const num_bytes = num_bits / 8; + + std::vector out(num_bytes); + random_bytes(num_bytes, kSeed, out.data()); + + return out; +} + +/// Convenience wrapper to unpack into a vector +template +std::vector unpackValues(std::vector const& packed, int32_t num_values, + int32_t bit_width, UnpackFunc unpack) { + std::vector out(num_values); + int values_read = unpack(packed.data(), out.data(), num_values, bit_width); + ARROW_DCHECK_GE(values_read, 0); + out.resize(values_read); + return out; +} + +/// Use BitWriter to pack values into a vector. +template +std::vector packValues(std::vector const& values, int32_t num_values, + int32_t bit_width) { + auto const num_bits = num_values * bit_width; + if (num_bits % 8 != 0) { + throw std::invalid_argument("Must generate a multiple of 8 bits."); + } + auto const num_bytes = num_bits / 8; + + std::vector out(static_cast(num_bytes)); + bit_util::BitWriter writer(out.data(), num_bytes); + for (auto const& v : values) { + bool written = writer.PutValue(v, bit_width); + if (!written) { + throw std::runtime_error("Cannot write move values"); + } + } + + return out; +} + +template +void checkUnpackPackRoundtrip(std::vector const& packed, int32_t num_values, + int32_t bit_width, UnpackFunc unpack) { + auto const unpacked = unpackValues(packed, num_values, bit_width, unpack); + EXPECT_EQ(unpacked.size(), num_values); + auto const roundtrip = packValues(unpacked, num_values, bit_width); + EXPECT_EQ(packed.size(), roundtrip.size()); + EXPECT_EQ(packed, roundtrip); +} + +struct UnpackingData { + int32_t num_values; + int32_t bit_width; +}; + +class UnpackingRandomRoundTrip : public ::testing::TestWithParam { + protected: + template + void testRoundtrip(UnpackFunc unpack) { + auto [num_values, bit_width] = GetParam(); + auto const original = generateRandomPackedValues(num_values, bit_width); + checkUnpackPackRoundtrip(original, num_values, bit_width, unpack); + } +}; + +INSTANTIATE_TEST_SUITE_P( + MutpliesOf64Values, UnpackingRandomRoundTrip, + ::testing::Values(UnpackingData{64, 1}, UnpackingData{128, 1}, UnpackingData{2048, 1}, + UnpackingData{64, 31}, UnpackingData{128, 31}, + UnpackingData{2048, 31}, UnpackingData{64000, 7}, + UnpackingData{64000, 8}, UnpackingData{64000, 13}, + UnpackingData{64000, 16}, UnpackingData{64000, 31}, + UnpackingData{64000, 32})); + +TEST_P(UnpackingRandomRoundTrip, unpack32Default) { + this->testRoundtrip(&unpack32_default); +} +TEST_P(UnpackingRandomRoundTrip, unpack64Default) { + this->testRoundtrip(&unpack64_default); +} + +#if defined(ARROW_HAVE_RUNTIME_AVX2) +TEST_P(UnpackingRandomRoundTrip, unpack32Avx2) { this->testRoundtrip(&unpack32_avx2); } +#endif + +#if defined(ARROW_HAVE_RUNTIME_AVX512) +TEST_P(UnpackingRandomRoundTrip, unpack32Avx512) { + this->testRoundtrip(&unpack32_avx512); +} +#endif + +#if defined(ARROW_HAVE_NEON) +TEST_P(UnpackingRandomRoundTrip, unpack32Neon) { this->testRoundtrip(&unpack32_neon); } +#endif + +TEST_P(UnpackingRandomRoundTrip, unpack32) { this->testRoundtrip(&unpack32); } +TEST_P(UnpackingRandomRoundTrip, unpack64) { this->testRoundtrip(&unpack64); } + +} // namespace arrow::internal From d89722f7b8c6e7cbd93f1e78bbac39c825027d14 Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Fri, 5 Sep 2025 11:16:05 +0200 Subject: [PATCH 04/17] Add unaligned bpacking test --- cpp/src/arrow/util/bpacking_test.cc | 79 +++++++++++++++++++++-------- 1 file changed, 59 insertions(+), 20 deletions(-) diff --git a/cpp/src/arrow/util/bpacking_test.cc b/cpp/src/arrow/util/bpacking_test.cc index 090eb123fb7..2fc91a21275 100644 --- a/cpp/src/arrow/util/bpacking_test.cc +++ b/cpp/src/arrow/util/bpacking_test.cc @@ -23,23 +23,36 @@ #include "arrow/testing/util.h" #include "arrow/util/bit_stream_utils_internal.h" #include "arrow/util/bpacking_internal.h" -#include "arrow/util/bpacking_neon_internal.h" #include "arrow/util/logging.h" +#if defined(ARROW_HAVE_RUNTIME_AVX2) +# include "arrow/util/bpacking_avx2_internal.h" +#endif +#if defined(ARROW_HAVE_RUNTIME_AVX512) +# include "arrow/util/bpacking_avx512_internal.h" +#endif +#if defined(ARROW_HAVE_NEON) +# include "arrow/util/bpacking_neon_internal.h" +#endif + namespace arrow::internal { template using UnpackFunc = int (*)(const uint8_t*, Int*, int, int); -/// Generate random bytes as packed integers. -std::vector generateRandomPackedValues(int32_t num_values, int32_t bit_width) { - constexpr uint32_t kSeed = 3214; - +/// Get the number of bytes associate with a packing. +constexpr int32_t getNumBytes(int32_t num_values, int32_t bit_width) { auto const num_bits = num_values * bit_width; if (num_bits % 8 != 0) { - throw std::invalid_argument("Must generate a multiple of 8 bits."); + throw std::invalid_argument("Must pack a multiple of 8 bits."); } - auto const num_bytes = num_bits / 8; + return num_bits / 8; +} + +/// Generate random bytes as packed integers. +std::vector generateRandomPackedValues(int32_t num_values, int32_t bit_width) { + constexpr uint32_t kSeed = 3214; + auto const num_bytes = getNumBytes(num_values, bit_width); std::vector out(num_bytes); random_bytes(num_bytes, kSeed, out.data()); @@ -49,10 +62,10 @@ std::vector generateRandomPackedValues(int32_t num_values, int32_t bit_ /// Convenience wrapper to unpack into a vector template -std::vector unpackValues(std::vector const& packed, int32_t num_values, +std::vector unpackValues(const uint8_t* packed, int32_t num_values, int32_t bit_width, UnpackFunc unpack) { std::vector out(num_values); - int values_read = unpack(packed.data(), out.data(), num_values, bit_width); + int values_read = unpack(packed, out.data(), num_values, bit_width); ARROW_DCHECK_GE(values_read, 0); out.resize(values_read); return out; @@ -62,11 +75,7 @@ std::vector unpackValues(std::vector const& packed, int32_t num_va template std::vector packValues(std::vector const& values, int32_t num_values, int32_t bit_width) { - auto const num_bits = num_values * bit_width; - if (num_bits % 8 != 0) { - throw std::invalid_argument("Must generate a multiple of 8 bits."); - } - auto const num_bytes = num_bits / 8; + auto const num_bytes = getNumBytes(num_values, bit_width); std::vector out(static_cast(num_bytes)); bit_util::BitWriter writer(out.data(), num_bytes); @@ -81,13 +90,30 @@ std::vector packValues(std::vector const& values, int32_t num_valu } template -void checkUnpackPackRoundtrip(std::vector const& packed, int32_t num_values, +void checkUnpackPackRoundtrip(const uint8_t* packed, int32_t num_values, int32_t bit_width, UnpackFunc unpack) { + auto const num_bytes = getNumBytes(num_values, bit_width); + auto const unpacked = unpackValues(packed, num_values, bit_width, unpack); EXPECT_EQ(unpacked.size(), num_values); auto const roundtrip = packValues(unpacked, num_values, bit_width); - EXPECT_EQ(packed.size(), roundtrip.size()); - EXPECT_EQ(packed, roundtrip); + EXPECT_EQ(num_bytes, roundtrip.size()); + for (int i = 0; i < num_bytes; ++i) { + EXPECT_EQ(packed[i], roundtrip[i]) << "differ in position " << i; + } +} + +const uint8_t* getNextAlignedByte(const uint8_t* ptr, std::size_t alignment) { + auto addr = reinterpret_cast(ptr); + + if (addr % alignment == 0) { + return ptr; + } + + auto remainder = addr % alignment; + auto bytes_to_add = alignment - remainder; + + return ptr + bytes_to_add; } struct UnpackingData { @@ -98,10 +124,23 @@ struct UnpackingData { class UnpackingRandomRoundTrip : public ::testing::TestWithParam { protected: template - void testRoundtrip(UnpackFunc unpack) { + void testRoundtripAlignment(UnpackFunc unpack, std::size_t alignment_offset) { auto [num_values, bit_width] = GetParam(); - auto const original = generateRandomPackedValues(num_values, bit_width); - checkUnpackPackRoundtrip(original, num_values, bit_width, unpack); + constexpr int32_t kExtraValues = sizeof(Int) * 8; + + auto const packed = generateRandomPackedValues(num_values + kExtraValues, bit_width); + const uint8_t* packed_unaligned = + getNextAlignedByte(packed.data(), sizeof(Int)) + alignment_offset; + + checkUnpackPackRoundtrip(packed_unaligned, num_values, bit_width, unpack); + } + + template + void testRoundtrip(UnpackFunc unpack) { + // Aligned test + testRoundtripAlignment(unpack, 0); + // Unaligned test + testRoundtripAlignment(unpack, 1); } }; From 69c811a0af3fa429ccbb6e05a75d4b55ee11900b Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Fri, 5 Sep 2025 15:47:12 +0200 Subject: [PATCH 05/17] Add bpacking benchmark --- cpp/src/arrow/util/CMakeLists.txt | 3 +- cpp/src/arrow/util/bpacking_benchmark.cc | 140 +++++++++++++++++++++++ 2 files changed, 142 insertions(+), 1 deletion(-) create mode 100644 cpp/src/arrow/util/bpacking_benchmark.cc diff --git a/cpp/src/arrow/util/CMakeLists.txt b/cpp/src/arrow/util/CMakeLists.txt index 25ef25fa7d7..3fa1a74123a 100644 --- a/cpp/src/arrow/util/CMakeLists.txt +++ b/cpp/src/arrow/util/CMakeLists.txt @@ -117,7 +117,8 @@ add_arrow_test(crc32-test Boost::headers) add_arrow_benchmark(bit_block_counter_benchmark) -add_arrow_benchmark(bit_util_benchmark) +add_arrow_benchmark(bit_util_benchmark SOURCES bit_util_benchmark.cc + bpacking_benchmark.cc) add_arrow_benchmark(bitmap_reader_benchmark) add_arrow_benchmark(cache_benchmark) add_arrow_benchmark(compression_benchmark) diff --git a/cpp/src/arrow/util/bpacking_benchmark.cc b/cpp/src/arrow/util/bpacking_benchmark.cc new file mode 100644 index 00000000000..77975e46b2a --- /dev/null +++ b/cpp/src/arrow/util/bpacking_benchmark.cc @@ -0,0 +1,140 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include +#include + +#include + +#include "arrow/testing/util.h" +#include "arrow/util/bpacking_internal.h" + +#if defined(ARROW_HAVE_RUNTIME_AVX2) +# include "arrow/util/bpacking_avx2_internal.h" +#endif +#if defined(ARROW_HAVE_RUNTIME_AVX512) +# include "arrow/util/bpacking_avx512_internal.h" +#endif +#if defined(ARROW_HAVE_NEON) +# include "arrow/util/bpacking_neon_internal.h" +#endif + +namespace arrow::internal { +namespace { + +template +using UnpackFunc = int (*)(const uint8_t*, Int*, int, int); + +/// Get the number of bytes associate with a packing. +constexpr int32_t getNumBytes(int32_t num_values, int32_t bit_width) { + auto const num_bits = num_values * bit_width; + if (num_bits % 8 != 0) { + throw std::invalid_argument("Must pack a multiple of 8 bits."); + } + return num_bits / 8; +} + +/// Generate random bytes as packed integers. +std::vector generateRandomPackedValues(int32_t num_values, int32_t bit_width) { + constexpr uint32_t kSeed = 3214; + auto const num_bytes = getNumBytes(num_values, bit_width); + + std::vector out(num_bytes); + random_bytes(num_bytes, kSeed, out.data()); + + return out; +} + +const uint8_t* getNextAlignedByte(const uint8_t* ptr, std::size_t alignment) { + auto addr = reinterpret_cast(ptr); + + if (addr % alignment == 0) { + return ptr; + } + + auto remainder = addr % alignment; + auto bytes_to_add = alignment - remainder; + + return ptr + bytes_to_add; +} + +template +void BM_Unpack(benchmark::State& state, bool aligned, UnpackFunc unpack) { + auto const bit_width = static_cast(state.range(0)); + auto const num_values = static_cast(state.range(1)); + constexpr int32_t kExtraValues = sizeof(Int) * 8; + + auto const packed = generateRandomPackedValues(num_values + kExtraValues, bit_width); + const uint8_t* packed_ptr = + getNextAlignedByte(packed.data(), sizeof(Int)) + (aligned ? 0 : 1); + + std::vector unpacked(num_values, 0); + + for (auto _ : state) { + unpack(packed_ptr, unpacked.data(), num_values, bit_width); + benchmark::ClobberMemory(); + } + state.SetItemsProcessed(num_values); +} + +constexpr int32_t MIN_RANGE = 64; +constexpr int32_t MAX_RANGE = 32768; +constexpr std::initializer_list kBitWidths32 = {1, 8, 20}; +constexpr std::initializer_list kBitWidths64 = {1, 8, 20, 47}; +static const std::vector> bitWidthsNumValues32 = { + kBitWidths32, + benchmark::CreateRange(MIN_RANGE, MAX_RANGE, /*multi=*/8), +}; +static const std::vector> bitWidthsNumValues64 = { + kBitWidths64, + benchmark::CreateRange(MIN_RANGE, MAX_RANGE, /*multi=*/8), +}; + +BENCHMARK_CAPTURE(BM_Unpack, unpack32_default_unaligned, false, + unpack32_default) + ->ArgsProduct(bitWidthsNumValues32); +BENCHMARK_CAPTURE(BM_Unpack, unpack64_default_unaligned, false, + unpack64_default) + ->ArgsProduct(bitWidthsNumValues64); + +#if defined(ARROW_HAVE_RUNTIME_AVX2) +BENCHMARK_CAPTURE(BM_Unpack, unpack32_avx2_unaligned, false, unpack32_avx2) + ->ArgsProduct(bitWidthsNumValues32); +#endif + +#if defined(ARROW_HAVE_RUNTIME_AVX512) +BENCHMARK_CAPTURE(BM_Unpack, unpack32_avx512_unaligned, false, unpack32_avx512) + ->ArgsProduct(bitWidthsNumValues32); +#endif + +#if defined(ARROW_HAVE_NEON) +BENCHMARK_CAPTURE(BM_Unpack, unpack32_neon_unaligned, false, unpack32_neon) + ->ArgsProduct(bitWidthsNumValues32); +#endif + +BENCHMARK_CAPTURE(BM_Unpack, unpack32_aligned, true, unpack32) + ->ArgsProduct(bitWidthsNumValues32); +BENCHMARK_CAPTURE(BM_Unpack, unpack32_unaligned, false, unpack32) + ->ArgsProduct(bitWidthsNumValues32); + +BENCHMARK_CAPTURE(BM_Unpack, unpack64_aligned, true, unpack64) + ->ArgsProduct(bitWidthsNumValues64); +BENCHMARK_CAPTURE(BM_Unpack, unpack64_unaligned, false, unpack64) + ->ArgsProduct(bitWidthsNumValues64); + +} // namespace +} // namespace arrow::internal From 085d4f054d549462413a5ec7abcf90e81880d05b Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Fri, 5 Sep 2025 16:29:05 +0200 Subject: [PATCH 06/17] Fix simd detection macro --- cpp/src/arrow/util/bpacking_benchmark.cc | 8 ++++---- cpp/src/arrow/util/bpacking_test.cc | 8 ++++---- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/cpp/src/arrow/util/bpacking_benchmark.cc b/cpp/src/arrow/util/bpacking_benchmark.cc index 77975e46b2a..add644841ae 100644 --- a/cpp/src/arrow/util/bpacking_benchmark.cc +++ b/cpp/src/arrow/util/bpacking_benchmark.cc @@ -23,10 +23,10 @@ #include "arrow/testing/util.h" #include "arrow/util/bpacking_internal.h" -#if defined(ARROW_HAVE_RUNTIME_AVX2) +#if defined(ARROW_HAVE_AVX2) # include "arrow/util/bpacking_avx2_internal.h" #endif -#if defined(ARROW_HAVE_RUNTIME_AVX512) +#if defined(ARROW_HAVE_AVX512) # include "arrow/util/bpacking_avx512_internal.h" #endif #if defined(ARROW_HAVE_NEON) @@ -111,12 +111,12 @@ BENCHMARK_CAPTURE(BM_Unpack, unpack64_default_unaligned, false, unpack64_default) ->ArgsProduct(bitWidthsNumValues64); -#if defined(ARROW_HAVE_RUNTIME_AVX2) +#if defined(ARROW_HAVE_AVX2) BENCHMARK_CAPTURE(BM_Unpack, unpack32_avx2_unaligned, false, unpack32_avx2) ->ArgsProduct(bitWidthsNumValues32); #endif -#if defined(ARROW_HAVE_RUNTIME_AVX512) +#if defined(ARROW_HAVE_AVX512) BENCHMARK_CAPTURE(BM_Unpack, unpack32_avx512_unaligned, false, unpack32_avx512) ->ArgsProduct(bitWidthsNumValues32); #endif diff --git a/cpp/src/arrow/util/bpacking_test.cc b/cpp/src/arrow/util/bpacking_test.cc index 2fc91a21275..98f93b5dd31 100644 --- a/cpp/src/arrow/util/bpacking_test.cc +++ b/cpp/src/arrow/util/bpacking_test.cc @@ -25,10 +25,10 @@ #include "arrow/util/bpacking_internal.h" #include "arrow/util/logging.h" -#if defined(ARROW_HAVE_RUNTIME_AVX2) +#if defined(ARROW_HAVE_AVX2) # include "arrow/util/bpacking_avx2_internal.h" #endif -#if defined(ARROW_HAVE_RUNTIME_AVX512) +#if defined(ARROW_HAVE_AVX512) # include "arrow/util/bpacking_avx512_internal.h" #endif #if defined(ARROW_HAVE_NEON) @@ -160,11 +160,11 @@ TEST_P(UnpackingRandomRoundTrip, unpack64Default) { this->testRoundtrip(&unpack64_default); } -#if defined(ARROW_HAVE_RUNTIME_AVX2) +#if defined(ARROW_HAVE_AVX2) TEST_P(UnpackingRandomRoundTrip, unpack32Avx2) { this->testRoundtrip(&unpack32_avx2); } #endif -#if defined(ARROW_HAVE_RUNTIME_AVX512) +#if defined(ARROW_HAVE_AVX512) TEST_P(UnpackingRandomRoundTrip, unpack32Avx512) { this->testRoundtrip(&unpack32_avx512); } From ef9a9484fe136b3b581ad3fcac2b4a96654b7dc7 Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Mon, 8 Sep 2025 13:54:24 +0200 Subject: [PATCH 07/17] Nudge MSVC --- cpp/src/arrow/util/bpacking_benchmark.cc | 29 +++++++++++++++--------- 1 file changed, 18 insertions(+), 11 deletions(-) diff --git a/cpp/src/arrow/util/bpacking_benchmark.cc b/cpp/src/arrow/util/bpacking_benchmark.cc index add644841ae..7cd20627b23 100644 --- a/cpp/src/arrow/util/bpacking_benchmark.cc +++ b/cpp/src/arrow/util/bpacking_benchmark.cc @@ -104,36 +104,43 @@ static const std::vector> bitWidthsNumValues64 = { benchmark::CreateRange(MIN_RANGE, MAX_RANGE, /*multi=*/8), }; -BENCHMARK_CAPTURE(BM_Unpack, unpack32_default_unaligned, false, - unpack32_default) +/// Nudge for MSVC template inside BENCHMARK_CAPTURE macro. +void BM_UnpackUint32(benchmark::State& state, bool aligned, UnpackFunc unpack) { + return BM_Unpack(state, aligned, unpack); +} +/// Nudge for MSVC template inside BENCHMARK_CAPTURE macro. +void BM_UnpackUint64(benchmark::State& state, bool aligned, UnpackFunc unpack) { + return BM_Unpack(state, aligned, unpack); +} + +BENCHMARK_CAPTURE(BM_UnpackUint32, unpack32_default_unaligned, false, unpack32_default) ->ArgsProduct(bitWidthsNumValues32); -BENCHMARK_CAPTURE(BM_Unpack, unpack64_default_unaligned, false, - unpack64_default) +BENCHMARK_CAPTURE(BM_UnpackUint64, unpack64_default_unaligned, false, unpack64_default) ->ArgsProduct(bitWidthsNumValues64); #if defined(ARROW_HAVE_AVX2) -BENCHMARK_CAPTURE(BM_Unpack, unpack32_avx2_unaligned, false, unpack32_avx2) +BENCHMARK_CAPTURE(BM_UnpackUint32, unpack32_avx2_unaligned, false, unpack32_avx2) ->ArgsProduct(bitWidthsNumValues32); #endif #if defined(ARROW_HAVE_AVX512) -BENCHMARK_CAPTURE(BM_Unpack, unpack32_avx512_unaligned, false, unpack32_avx512) +BENCHMARK_CAPTURE(BM_UnpackUint32, unpack32_avx512_unaligned, false, unpack32_avx512) ->ArgsProduct(bitWidthsNumValues32); #endif #if defined(ARROW_HAVE_NEON) -BENCHMARK_CAPTURE(BM_Unpack, unpack32_neon_unaligned, false, unpack32_neon) +BENCHMARK_CAPTURE(BM_UnpackUint32, unpack32_neon_unaligned, false, unpack32_neon) ->ArgsProduct(bitWidthsNumValues32); #endif -BENCHMARK_CAPTURE(BM_Unpack, unpack32_aligned, true, unpack32) +BENCHMARK_CAPTURE(BM_UnpackUint32, unpack32_aligned, true, unpack32) ->ArgsProduct(bitWidthsNumValues32); -BENCHMARK_CAPTURE(BM_Unpack, unpack32_unaligned, false, unpack32) +BENCHMARK_CAPTURE(BM_UnpackUint32, unpack32_unaligned, false, unpack32) ->ArgsProduct(bitWidthsNumValues32); -BENCHMARK_CAPTURE(BM_Unpack, unpack64_aligned, true, unpack64) +BENCHMARK_CAPTURE(BM_UnpackUint64, unpack64_aligned, true, unpack64) ->ArgsProduct(bitWidthsNumValues64); -BENCHMARK_CAPTURE(BM_Unpack, unpack64_unaligned, false, unpack64) +BENCHMARK_CAPTURE(BM_UnpackUint64, unpack64_unaligned, false, unpack64) ->ArgsProduct(bitWidthsNumValues64); } // namespace From 2c4e4af062c521ecbab3e8c9b9622a55cee63214 Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Mon, 8 Sep 2025 14:11:14 +0200 Subject: [PATCH 08/17] Export scalar unpack functions --- cpp/src/arrow/util/bpacking_internal.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/util/bpacking_internal.h b/cpp/src/arrow/util/bpacking_internal.h index 60a2e7d16ac..53bb7a6b337 100644 --- a/cpp/src/arrow/util/bpacking_internal.h +++ b/cpp/src/arrow/util/bpacking_internal.h @@ -24,10 +24,12 @@ namespace arrow::internal { /// The scalar 32 bit unpacking. -int unpack32_default(const uint8_t* in, uint32_t* out, int batch_size, int num_bits); +ARROW_EXPORT int unpack32_default(const uint8_t* in, uint32_t* out, int batch_size, + int num_bits); /// The scalar 64 bit unpacking. -int unpack64_default(const uint8_t* in, uint64_t* out, int batch_size, int num_bits); +ARROW_EXPORT int unpack64_default(const uint8_t* in, uint64_t* out, int batch_size, + int num_bits); ARROW_EXPORT int unpack32(const uint8_t* in, uint32_t* out, int batch_size, int num_bits); From ad4c82761f4cec3346e152cde960758805de37ec Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Mon, 8 Sep 2025 15:03:14 +0200 Subject: [PATCH 09/17] Add export for SIMD bpacking functions --- cpp/src/arrow/util/bpacking_avx2_internal.h | 5 ++++- cpp/src/arrow/util/bpacking_avx512_internal.h | 5 ++++- cpp/src/arrow/util/bpacking_neon_internal.h | 5 ++++- 3 files changed, 12 insertions(+), 3 deletions(-) diff --git a/cpp/src/arrow/util/bpacking_avx2_internal.h b/cpp/src/arrow/util/bpacking_avx2_internal.h index 99676b6215f..b2c213fe2aa 100644 --- a/cpp/src/arrow/util/bpacking_avx2_internal.h +++ b/cpp/src/arrow/util/bpacking_avx2_internal.h @@ -17,10 +17,13 @@ #pragma once +#include "arrow/util/visibility.h" + #include namespace arrow::internal { -int unpack32_avx2(const uint8_t* in, uint32_t* out, int batch_size, int num_bits); +ARROW_EXPORT int unpack32_avx2(const uint8_t* in, uint32_t* out, int batch_size, + int num_bits); } // namespace arrow::internal diff --git a/cpp/src/arrow/util/bpacking_avx512_internal.h b/cpp/src/arrow/util/bpacking_avx512_internal.h index 67970c555ba..847aa981433 100644 --- a/cpp/src/arrow/util/bpacking_avx512_internal.h +++ b/cpp/src/arrow/util/bpacking_avx512_internal.h @@ -17,10 +17,13 @@ #pragma once +#include "arrow/util/visibility.h" + #include namespace arrow::internal { -int unpack32_avx512(const uint8_t* in, uint32_t* out, int batch_size, int num_bits); +ARROW_EXPORT int unpack32_avx512(const uint8_t* in, uint32_t* out, int batch_size, + int num_bits); } // namespace arrow::internal diff --git a/cpp/src/arrow/util/bpacking_neon_internal.h b/cpp/src/arrow/util/bpacking_neon_internal.h index cd3636cc6b2..683aa5cbc47 100644 --- a/cpp/src/arrow/util/bpacking_neon_internal.h +++ b/cpp/src/arrow/util/bpacking_neon_internal.h @@ -17,10 +17,13 @@ #pragma once +#include "arrow/util/visibility.h" + #include namespace arrow::internal { -int unpack32_neon(const uint8_t* in, uint32_t* out, int batch_size, int num_bits); +ARROW_EXPORT int unpack32_neon(const uint8_t* in, uint32_t* out, int batch_size, + int num_bits); } // namespace arrow::internal From 4b0e5157e8f2ad84dc2e8b0978b7c2d94662a5c9 Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Tue, 9 Sep 2025 12:15:33 +0200 Subject: [PATCH 10/17] Address review comments --- cpp/src/arrow/util/CMakeLists.txt | 4 +- cpp/src/arrow/util/bpacking_benchmark.cc | 31 ++++++----- cpp/src/arrow/util/bpacking_test.cc | 70 ++++++++++++++---------- 3 files changed, 60 insertions(+), 45 deletions(-) diff --git a/cpp/src/arrow/util/CMakeLists.txt b/cpp/src/arrow/util/CMakeLists.txt index 3fa1a74123a..56545f6aa79 100644 --- a/cpp/src/arrow/util/CMakeLists.txt +++ b/cpp/src/arrow/util/CMakeLists.txt @@ -117,8 +117,8 @@ add_arrow_test(crc32-test Boost::headers) add_arrow_benchmark(bit_block_counter_benchmark) -add_arrow_benchmark(bit_util_benchmark SOURCES bit_util_benchmark.cc - bpacking_benchmark.cc) +add_arrow_benchmark(bit_util_benchmark) +add_arrow_benchmark(bpacking_benchmark) add_arrow_benchmark(bitmap_reader_benchmark) add_arrow_benchmark(cache_benchmark) add_arrow_benchmark(compression_benchmark) diff --git a/cpp/src/arrow/util/bpacking_benchmark.cc b/cpp/src/arrow/util/bpacking_benchmark.cc index 7cd20627b23..62ac38ad096 100644 --- a/cpp/src/arrow/util/bpacking_benchmark.cc +++ b/cpp/src/arrow/util/bpacking_benchmark.cc @@ -40,7 +40,7 @@ template using UnpackFunc = int (*)(const uint8_t*, Int*, int, int); /// Get the number of bytes associate with a packing. -constexpr int32_t getNumBytes(int32_t num_values, int32_t bit_width) { +constexpr int32_t GetNumBytes(int32_t num_values, int32_t bit_width) { auto const num_bits = num_values * bit_width; if (num_bits % 8 != 0) { throw std::invalid_argument("Must pack a multiple of 8 bits."); @@ -49,9 +49,9 @@ constexpr int32_t getNumBytes(int32_t num_values, int32_t bit_width) { } /// Generate random bytes as packed integers. -std::vector generateRandomPackedValues(int32_t num_values, int32_t bit_width) { +std::vector GenerateRandomPackedValues(int32_t num_values, int32_t bit_width) { constexpr uint32_t kSeed = 3214; - auto const num_bytes = getNumBytes(num_values, bit_width); + auto const num_bytes = GetNumBytes(num_values, bit_width); std::vector out(num_bytes); random_bytes(num_bytes, kSeed, out.data()); @@ -59,7 +59,7 @@ std::vector generateRandomPackedValues(int32_t num_values, int32_t bit_ return out; } -const uint8_t* getNextAlignedByte(const uint8_t* ptr, std::size_t alignment) { +const uint8_t* GetNextAlignedByte(const uint8_t* ptr, std::size_t alignment) { auto addr = reinterpret_cast(ptr); if (addr % alignment == 0) { @@ -76,11 +76,14 @@ template void BM_Unpack(benchmark::State& state, bool aligned, UnpackFunc unpack) { auto const bit_width = static_cast(state.range(0)); auto const num_values = static_cast(state.range(1)); - constexpr int32_t kExtraValues = sizeof(Int) * 8; - auto const packed = generateRandomPackedValues(num_values + kExtraValues, bit_width); + // Assume std::vector allocation is likely be aligned for greater than a byte. + // So we allocate more values than necessary and skip to the next byte with the + // desired (non) alignment to test the proper condition. + constexpr int32_t kExtraValues = sizeof(Int) * 8; + auto const packed = GenerateRandomPackedValues(num_values + kExtraValues, bit_width); const uint8_t* packed_ptr = - getNextAlignedByte(packed.data(), sizeof(Int)) + (aligned ? 0 : 1); + GetNextAlignedByte(packed.data(), sizeof(Int)) + (aligned ? 0 : 1); std::vector unpacked(num_values, 0); @@ -88,20 +91,20 @@ void BM_Unpack(benchmark::State& state, bool aligned, UnpackFunc unpack) { unpack(packed_ptr, unpacked.data(), num_values, bit_width); benchmark::ClobberMemory(); } - state.SetItemsProcessed(num_values); + state.SetItemsProcessed(num_values * state.iterations()); } -constexpr int32_t MIN_RANGE = 64; -constexpr int32_t MAX_RANGE = 32768; -constexpr std::initializer_list kBitWidths32 = {1, 8, 20}; -constexpr std::initializer_list kBitWidths64 = {1, 8, 20, 47}; +constexpr int32_t kMinRange = 64; +constexpr int32_t kMaxRange = 32768; +constexpr std::initializer_list kBitWidths32 = {1, 2, 8, 20}; +constexpr std::initializer_list kBitWidths64 = {1, 2, 8, 20, 47}; static const std::vector> bitWidthsNumValues32 = { kBitWidths32, - benchmark::CreateRange(MIN_RANGE, MAX_RANGE, /*multi=*/8), + benchmark::CreateRange(kMinRange, kMaxRange, /*multi=*/32), }; static const std::vector> bitWidthsNumValues64 = { kBitWidths64, - benchmark::CreateRange(MIN_RANGE, MAX_RANGE, /*multi=*/8), + benchmark::CreateRange(kMinRange, kMaxRange, /*multi=*/32), }; /// Nudge for MSVC template inside BENCHMARK_CAPTURE macro. diff --git a/cpp/src/arrow/util/bpacking_test.cc b/cpp/src/arrow/util/bpacking_test.cc index 98f93b5dd31..90d2f1a8f45 100644 --- a/cpp/src/arrow/util/bpacking_test.cc +++ b/cpp/src/arrow/util/bpacking_test.cc @@ -25,10 +25,11 @@ #include "arrow/util/bpacking_internal.h" #include "arrow/util/logging.h" -#if defined(ARROW_HAVE_AVX2) +#if defined(ARROW_HAVE_RUNTIME_AVX2) # include "arrow/util/bpacking_avx2_internal.h" +# include "arrow/util/cpu_info.h" #endif -#if defined(ARROW_HAVE_AVX512) +#if defined(ARROW_HAVE_RUNTIME_AVX512) # include "arrow/util/bpacking_avx512_internal.h" #endif #if defined(ARROW_HAVE_NEON) @@ -41,7 +42,7 @@ template using UnpackFunc = int (*)(const uint8_t*, Int*, int, int); /// Get the number of bytes associate with a packing. -constexpr int32_t getNumBytes(int32_t num_values, int32_t bit_width) { +constexpr int32_t GetNumBytes(int32_t num_values, int32_t bit_width) { auto const num_bits = num_values * bit_width; if (num_bits % 8 != 0) { throw std::invalid_argument("Must pack a multiple of 8 bits."); @@ -50,9 +51,9 @@ constexpr int32_t getNumBytes(int32_t num_values, int32_t bit_width) { } /// Generate random bytes as packed integers. -std::vector generateRandomPackedValues(int32_t num_values, int32_t bit_width) { +std::vector GenerateRandomPackedValues(int32_t num_values, int32_t bit_width) { constexpr uint32_t kSeed = 3214; - auto const num_bytes = getNumBytes(num_values, bit_width); + auto const num_bytes = GetNumBytes(num_values, bit_width); std::vector out(num_bytes); random_bytes(num_bytes, kSeed, out.data()); @@ -62,7 +63,7 @@ std::vector generateRandomPackedValues(int32_t num_values, int32_t bit_ /// Convenience wrapper to unpack into a vector template -std::vector unpackValues(const uint8_t* packed, int32_t num_values, +std::vector UnpackValues(const uint8_t* packed, int32_t num_values, int32_t bit_width, UnpackFunc unpack) { std::vector out(num_values); int values_read = unpack(packed, out.data(), num_values, bit_width); @@ -73,9 +74,9 @@ std::vector unpackValues(const uint8_t* packed, int32_t num_values, /// Use BitWriter to pack values into a vector. template -std::vector packValues(std::vector const& values, int32_t num_values, +std::vector PackValues(std::vector const& values, int32_t num_values, int32_t bit_width) { - auto const num_bytes = getNumBytes(num_values, bit_width); + auto const num_bytes = GetNumBytes(num_values, bit_width); std::vector out(static_cast(num_bytes)); bit_util::BitWriter writer(out.data(), num_bytes); @@ -90,20 +91,20 @@ std::vector packValues(std::vector const& values, int32_t num_valu } template -void checkUnpackPackRoundtrip(const uint8_t* packed, int32_t num_values, +void CheckUnpackPackRoundtrip(const uint8_t* packed, int32_t num_values, int32_t bit_width, UnpackFunc unpack) { - auto const num_bytes = getNumBytes(num_values, bit_width); + auto const num_bytes = GetNumBytes(num_values, bit_width); - auto const unpacked = unpackValues(packed, num_values, bit_width, unpack); + auto const unpacked = UnpackValues(packed, num_values, bit_width, unpack); EXPECT_EQ(unpacked.size(), num_values); - auto const roundtrip = packValues(unpacked, num_values, bit_width); + auto const roundtrip = PackValues(unpacked, num_values, bit_width); EXPECT_EQ(num_bytes, roundtrip.size()); for (int i = 0; i < num_bytes; ++i) { EXPECT_EQ(packed[i], roundtrip[i]) << "differ in position " << i; } } -const uint8_t* getNextAlignedByte(const uint8_t* ptr, std::size_t alignment) { +const uint8_t* GetNextAlignedByte(const uint8_t* ptr, std::size_t alignment) { auto addr = reinterpret_cast(ptr); if (addr % alignment == 0) { @@ -124,23 +125,26 @@ struct UnpackingData { class UnpackingRandomRoundTrip : public ::testing::TestWithParam { protected: template - void testRoundtripAlignment(UnpackFunc unpack, std::size_t alignment_offset) { + void TestRoundtripAlignment(UnpackFunc unpack, std::size_t alignment_offset) { auto [num_values, bit_width] = GetParam(); - constexpr int32_t kExtraValues = sizeof(Int) * 8; - auto const packed = generateRandomPackedValues(num_values + kExtraValues, bit_width); + // Assume std::vector allocation is likely be aligned for greater than a byte. + // So we allocate more values than necessary and skip to the next byte with the + // desired (non) alignment to test the proper condition. + constexpr int32_t kExtraValues = sizeof(Int) * 8; + auto const packed = GenerateRandomPackedValues(num_values + kExtraValues, bit_width); const uint8_t* packed_unaligned = - getNextAlignedByte(packed.data(), sizeof(Int)) + alignment_offset; + GetNextAlignedByte(packed.data(), sizeof(Int)) + alignment_offset; - checkUnpackPackRoundtrip(packed_unaligned, num_values, bit_width, unpack); + CheckUnpackPackRoundtrip(packed_unaligned, num_values, bit_width, unpack); } template - void testRoundtrip(UnpackFunc unpack) { + void TestRoundtrip(UnpackFunc unpack) { // Aligned test - testRoundtripAlignment(unpack, 0); + TestRoundtripAlignment(unpack, 0); // Unaligned test - testRoundtripAlignment(unpack, 1); + TestRoundtripAlignment(unpack, 1); } }; @@ -154,27 +158,35 @@ INSTANTIATE_TEST_SUITE_P( UnpackingData{64000, 32})); TEST_P(UnpackingRandomRoundTrip, unpack32Default) { - this->testRoundtrip(&unpack32_default); + this->TestRoundtrip(&unpack32_default); } TEST_P(UnpackingRandomRoundTrip, unpack64Default) { - this->testRoundtrip(&unpack64_default); + this->TestRoundtrip(&unpack64_default); } -#if defined(ARROW_HAVE_AVX2) -TEST_P(UnpackingRandomRoundTrip, unpack32Avx2) { this->testRoundtrip(&unpack32_avx2); } +#if defined(ARROW_HAVE_RUNTIME_AVX2) +TEST_P(UnpackingRandomRoundTrip, unpack32Avx2) { + if (!CpuInfo::GetInstance()->IsSupported(CpuInfo::AVX2)) { + GTEST_SKIP() << "Test requires AVX2"; + } + this->testRoundtrip(&unpack32_avx2); +} #endif -#if defined(ARROW_HAVE_AVX512) +#if defined(ARROW_HAVE_RUNTIME_AVX512) TEST_P(UnpackingRandomRoundTrip, unpack32Avx512) { + if (!CpuInfo::GetInstance()->IsSupported(CpuInfo::AVX512)) { + GTEST_SKIP() << "Test requires AVX512"; + } this->testRoundtrip(&unpack32_avx512); } #endif #if defined(ARROW_HAVE_NEON) -TEST_P(UnpackingRandomRoundTrip, unpack32Neon) { this->testRoundtrip(&unpack32_neon); } +TEST_P(UnpackingRandomRoundTrip, unpack32Neon) { this->TestRoundtrip(&unpack32_neon); } #endif -TEST_P(UnpackingRandomRoundTrip, unpack32) { this->testRoundtrip(&unpack32); } -TEST_P(UnpackingRandomRoundTrip, unpack64) { this->testRoundtrip(&unpack64); } +TEST_P(UnpackingRandomRoundTrip, unpack32) { this->TestRoundtrip(&unpack32); } +TEST_P(UnpackingRandomRoundTrip, unpack64) { this->TestRoundtrip(&unpack64); } } // namespace arrow::internal From 7724082ba4b8c23de3e62d74270a8ede9787ae69 Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Tue, 9 Sep 2025 14:07:02 +0200 Subject: [PATCH 11/17] Benchmark skip based on runtime --- cpp/src/arrow/util/bpacking_benchmark.cc | 34 ++++++++++++++++-------- 1 file changed, 23 insertions(+), 11 deletions(-) diff --git a/cpp/src/arrow/util/bpacking_benchmark.cc b/cpp/src/arrow/util/bpacking_benchmark.cc index 62ac38ad096..7a971f86027 100644 --- a/cpp/src/arrow/util/bpacking_benchmark.cc +++ b/cpp/src/arrow/util/bpacking_benchmark.cc @@ -23,10 +23,11 @@ #include "arrow/testing/util.h" #include "arrow/util/bpacking_internal.h" -#if defined(ARROW_HAVE_AVX2) +#if defined(ARROW_HAVE_RUNTIME_AVX2) # include "arrow/util/bpacking_avx2_internal.h" +# include "arrow/util/cpu_info.h" #endif -#if defined(ARROW_HAVE_AVX512) +#if defined(ARROW_HAVE_RUNTIME_AVX512) # include "arrow/util/bpacking_avx512_internal.h" #endif #if defined(ARROW_HAVE_NEON) @@ -73,7 +74,12 @@ const uint8_t* GetNextAlignedByte(const uint8_t* ptr, std::size_t alignment) { } template -void BM_Unpack(benchmark::State& state, bool aligned, UnpackFunc unpack) { +void BM_Unpack(benchmark::State& state, bool aligned, UnpackFunc unpack, bool skip, + std::string skip_msg) { + if (skip) { + state.SkipWithMessage(skip_msg); + } + auto const bit_width = static_cast(state.range(0)); auto const num_values = static_cast(state.range(1)); @@ -108,12 +114,14 @@ static const std::vector> bitWidthsNumValues64 = { }; /// Nudge for MSVC template inside BENCHMARK_CAPTURE macro. -void BM_UnpackUint32(benchmark::State& state, bool aligned, UnpackFunc unpack) { - return BM_Unpack(state, aligned, unpack); +void BM_UnpackUint32(benchmark::State& state, bool aligned, UnpackFunc unpack, + bool skip = false, std::string skip_msg = "") { + return BM_Unpack(state, aligned, unpack, skip, std::move(skip_msg)); } /// Nudge for MSVC template inside BENCHMARK_CAPTURE macro. -void BM_UnpackUint64(benchmark::State& state, bool aligned, UnpackFunc unpack) { - return BM_Unpack(state, aligned, unpack); +void BM_UnpackUint64(benchmark::State& state, bool aligned, UnpackFunc unpack, + bool skip = false, std::string skip_msg = "") { + return BM_Unpack(state, aligned, unpack, skip, std::move(skip_msg)); } BENCHMARK_CAPTURE(BM_UnpackUint32, unpack32_default_unaligned, false, unpack32_default) @@ -121,13 +129,17 @@ BENCHMARK_CAPTURE(BM_UnpackUint32, unpack32_default_unaligned, false, unpack32_d BENCHMARK_CAPTURE(BM_UnpackUint64, unpack64_default_unaligned, false, unpack64_default) ->ArgsProduct(bitWidthsNumValues64); -#if defined(ARROW_HAVE_AVX2) -BENCHMARK_CAPTURE(BM_UnpackUint32, unpack32_avx2_unaligned, false, unpack32_avx2) +#if defined(ARROW_HAVE_RUNTIME_AVX2) +BENCHMARK_CAPTURE(BM_UnpackUint32, unpack32_avx2_unaligned, false, unpack32_avx2, + !CpuInfo::GetInstance()->IsSupported(CpuInfo::AVX2), + "Avx2 not available") ->ArgsProduct(bitWidthsNumValues32); #endif -#if defined(ARROW_HAVE_AVX512) -BENCHMARK_CAPTURE(BM_UnpackUint32, unpack32_avx512_unaligned, false, unpack32_avx512) +#if defined(ARROW_HAVE_RUNTIME_AVX512) +BENCHMARK_CAPTURE(BM_UnpackUint32, unpack32_avx512_unaligned, false, unpack32_avx512, + !CpuInfo::GetInstance()->IsSupported(CpuInfo::AVX512), + "Avx512 not available") ->ArgsProduct(bitWidthsNumValues32); #endif From f5846fbf2d1221c057840a1a8bb7b9c67b8e56c6 Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Tue, 9 Sep 2025 15:55:13 +0200 Subject: [PATCH 12/17] Add know value unpack --- cpp/src/arrow/util/bpacking_test.cc | 116 +++++++++++++++++++++------- 1 file changed, 87 insertions(+), 29 deletions(-) diff --git a/cpp/src/arrow/util/bpacking_test.cc b/cpp/src/arrow/util/bpacking_test.cc index 90d2f1a8f45..2d93325054c 100644 --- a/cpp/src/arrow/util/bpacking_test.cc +++ b/cpp/src/arrow/util/bpacking_test.cc @@ -117,12 +117,12 @@ const uint8_t* GetNextAlignedByte(const uint8_t* ptr, std::size_t alignment) { return ptr + bytes_to_add; } -struct UnpackingData { +struct TestUnpackSize { int32_t num_values; int32_t bit_width; }; -class UnpackingRandomRoundTrip : public ::testing::TestWithParam { +class TestUnpack : public ::testing::TestWithParam { protected: template void TestRoundtripAlignment(UnpackFunc unpack, std::size_t alignment_offset) { @@ -132,7 +132,7 @@ class UnpackingRandomRoundTrip : public ::testing::TestWithParam // So we allocate more values than necessary and skip to the next byte with the // desired (non) alignment to test the proper condition. constexpr int32_t kExtraValues = sizeof(Int) * 8; - auto const packed = GenerateRandomPackedValues(num_values + kExtraValues, bit_width); + const auto packed = GenerateRandomPackedValues(num_values + kExtraValues, bit_width); const uint8_t* packed_unaligned = GetNextAlignedByte(packed.data(), sizeof(Int)) + alignment_offset; @@ -140,53 +140,111 @@ class UnpackingRandomRoundTrip : public ::testing::TestWithParam } template - void TestRoundtrip(UnpackFunc unpack) { - // Aligned test - TestRoundtripAlignment(unpack, 0); - // Unaligned test - TestRoundtripAlignment(unpack, 1); + void TestUnpackZeros(UnpackFunc unpack) { + auto [num_values, bit_width] = GetParam(); + const auto num_bytes = GetNumBytes(num_values, bit_width); + + const std::vector packed(static_cast(num_bytes), uint8_t{0}); + const auto unpacked = UnpackValues(packed.data(), num_values, bit_width, unpack); + + const std::vector expected(static_cast(num_values), Int{0}); + EXPECT_EQ(unpacked, expected); + } + + template + void TestUnpackOnes(UnpackFunc unpack) { + auto [num_values, bit_width] = GetParam(); + const auto num_bytes = GetNumBytes(num_values, bit_width); + + const std::vector packed(static_cast(num_bytes), uint8_t{0xFF}); + const auto unpacked = UnpackValues(packed.data(), num_values, bit_width, unpack); + + // Generate bit_width ones + Int expected_value = 0; + for (int i = 0; i < bit_width; ++i) { + expected_value = (expected_value << 1) | 1; + } + const std::vector expected(static_cast(num_values), expected_value); + EXPECT_EQ(unpacked, expected); + } + + template + void TestUnpackAlternating(UnpackFunc unpack) { + const auto [num_values, bit_width] = GetParam(); + const auto num_bytes = GetNumBytes(num_values, bit_width); + + const std::vector packed(static_cast(num_bytes), uint8_t{0xAA}); + const auto unpacked = UnpackValues(packed.data(), num_values, bit_width, unpack); + + // Generate alternative bit sequence sratring with either 0 or 1 + Int one_zero_value = 0; + Int zero_one_value = 0; + for (int i = 0; i < bit_width; ++i) { + zero_one_value = (zero_one_value << 1) | (i % 2); + one_zero_value = (one_zero_value << 1) | ((i + 1) % 2); + } + + std::vector expected; + if (bit_width % 2 == 0) { + // For even bit_width, the same pattern repeats every time + expected.resize(static_cast(num_values), one_zero_value); + } else { + // For odd bit_width, we alternate a pattern leading with 0 and 1 + for (int i = 0; i < num_values; ++i) { + expected.push_back(i % 2 == 0 ? zero_one_value : one_zero_value); + } + } + EXPECT_EQ(unpacked, expected); + } + + template + void TestAll(UnpackFunc unpack) { + // Known values + TestUnpackZeros(unpack); + TestUnpackOnes(unpack); + TestUnpackAlternating(unpack); + + // Roundtrips + TestRoundtripAlignment(unpack, /* alignment_offset= */ 0); + TestRoundtripAlignment(unpack, /* alignment_offset= */ 1); } }; INSTANTIATE_TEST_SUITE_P( - MutpliesOf64Values, UnpackingRandomRoundTrip, - ::testing::Values(UnpackingData{64, 1}, UnpackingData{128, 1}, UnpackingData{2048, 1}, - UnpackingData{64, 31}, UnpackingData{128, 31}, - UnpackingData{2048, 31}, UnpackingData{64000, 7}, - UnpackingData{64000, 8}, UnpackingData{64000, 13}, - UnpackingData{64000, 16}, UnpackingData{64000, 31}, - UnpackingData{64000, 32})); - -TEST_P(UnpackingRandomRoundTrip, unpack32Default) { - this->TestRoundtrip(&unpack32_default); -} -TEST_P(UnpackingRandomRoundTrip, unpack64Default) { - this->TestRoundtrip(&unpack64_default); -} + MutpliesOf64Values, TestUnpack, + ::testing::Values(TestUnpackSize{64, 1}, TestUnpackSize{128, 1}, + TestUnpackSize{2048, 1}, TestUnpackSize{64, 31}, + TestUnpackSize{128, 31}, TestUnpackSize{2048, 31}, + TestUnpackSize{64000, 7}, TestUnpackSize{64000, 8}, + TestUnpackSize{64000, 13}, TestUnpackSize{64000, 16}, + TestUnpackSize{64000, 31}, TestUnpackSize{64000, 32})); + +TEST_P(TestUnpack, unpack32Default) { this->TestAll(&unpack32_default); } +TEST_P(TestUnpack, unpack64Default) { this->TestAll(&unpack64_default); } #if defined(ARROW_HAVE_RUNTIME_AVX2) -TEST_P(UnpackingRandomRoundTrip, unpack32Avx2) { +TEST_P(TestUnpack, unpack32Avx2) { if (!CpuInfo::GetInstance()->IsSupported(CpuInfo::AVX2)) { GTEST_SKIP() << "Test requires AVX2"; } - this->testRoundtrip(&unpack32_avx2); + this->TestAll(&unpack32_avx2); } #endif #if defined(ARROW_HAVE_RUNTIME_AVX512) -TEST_P(UnpackingRandomRoundTrip, unpack32Avx512) { +TEST_P(TestUnpack, unpack32Avx512) { if (!CpuInfo::GetInstance()->IsSupported(CpuInfo::AVX512)) { GTEST_SKIP() << "Test requires AVX512"; } - this->testRoundtrip(&unpack32_avx512); + this->TestAll(&unpack32_avx512); } #endif #if defined(ARROW_HAVE_NEON) -TEST_P(UnpackingRandomRoundTrip, unpack32Neon) { this->TestRoundtrip(&unpack32_neon); } +TEST_P(TestUnpack, unpack32Neon) { this->TestAll(&unpack32_neon); } #endif -TEST_P(UnpackingRandomRoundTrip, unpack32) { this->TestRoundtrip(&unpack32); } -TEST_P(UnpackingRandomRoundTrip, unpack64) { this->TestRoundtrip(&unpack64); } +TEST_P(TestUnpack, unpack32) { this->TestAll(&unpack32); } +TEST_P(TestUnpack, unpack64) { this->TestAll(&unpack64); } } // namespace arrow::internal From 7e3b2795730afa1a10faad38c37438b583ea4cf2 Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Tue, 16 Sep 2025 10:37:30 +0200 Subject: [PATCH 13/17] Address reviewer comments --- cpp/src/arrow/util/bpacking_benchmark.cc | 28 +++++++++++----------- cpp/src/arrow/util/bpacking_test.cc | 30 +++++++++++++----------- 2 files changed, 30 insertions(+), 28 deletions(-) diff --git a/cpp/src/arrow/util/bpacking_benchmark.cc b/cpp/src/arrow/util/bpacking_benchmark.cc index 7a971f86027..6f95b9e8048 100644 --- a/cpp/src/arrow/util/bpacking_benchmark.cc +++ b/cpp/src/arrow/util/bpacking_benchmark.cc @@ -104,7 +104,7 @@ constexpr int32_t kMinRange = 64; constexpr int32_t kMaxRange = 32768; constexpr std::initializer_list kBitWidths32 = {1, 2, 8, 20}; constexpr std::initializer_list kBitWidths64 = {1, 2, 8, 20, 47}; -static const std::vector> bitWidthsNumValues32 = { +static const std::vector> kBitWidthsNumValues32 = { kBitWidths32, benchmark::CreateRange(kMinRange, kMaxRange, /*multi=*/32), }; @@ -124,38 +124,38 @@ void BM_UnpackUint64(benchmark::State& state, bool aligned, UnpackFunc return BM_Unpack(state, aligned, unpack, skip, std::move(skip_msg)); } -BENCHMARK_CAPTURE(BM_UnpackUint32, unpack32_default_unaligned, false, unpack32_default) - ->ArgsProduct(bitWidthsNumValues32); -BENCHMARK_CAPTURE(BM_UnpackUint64, unpack64_default_unaligned, false, unpack64_default) +BENCHMARK_CAPTURE(BM_UnpackUint32, ScalarUnaligned, false, unpack32_default) + ->ArgsProduct(kBitWidthsNumValues32); +BENCHMARK_CAPTURE(BM_UnpackUint64, ScalarUnaligned, false, unpack64_default) ->ArgsProduct(bitWidthsNumValues64); #if defined(ARROW_HAVE_RUNTIME_AVX2) -BENCHMARK_CAPTURE(BM_UnpackUint32, unpack32_avx2_unaligned, false, unpack32_avx2, +BENCHMARK_CAPTURE(BM_UnpackUint32, Avx2Unaligned, false, unpack32_avx2, !CpuInfo::GetInstance()->IsSupported(CpuInfo::AVX2), "Avx2 not available") ->ArgsProduct(bitWidthsNumValues32); #endif #if defined(ARROW_HAVE_RUNTIME_AVX512) -BENCHMARK_CAPTURE(BM_UnpackUint32, unpack32_avx512_unaligned, false, unpack32_avx512, +BENCHMARK_CAPTURE(BM_UnpackUint32, Avx512Unaligned, false, unpack32_avx512, !CpuInfo::GetInstance()->IsSupported(CpuInfo::AVX512), "Avx512 not available") ->ArgsProduct(bitWidthsNumValues32); #endif #if defined(ARROW_HAVE_NEON) -BENCHMARK_CAPTURE(BM_UnpackUint32, unpack32_neon_unaligned, false, unpack32_neon) - ->ArgsProduct(bitWidthsNumValues32); +BENCHMARK_CAPTURE(BM_UnpackUint32, NeonUnaligned, false, unpack32_neon) + ->ArgsProduct(kBitWidthsNumValues32); #endif -BENCHMARK_CAPTURE(BM_UnpackUint32, unpack32_aligned, true, unpack32) - ->ArgsProduct(bitWidthsNumValues32); -BENCHMARK_CAPTURE(BM_UnpackUint32, unpack32_unaligned, false, unpack32) - ->ArgsProduct(bitWidthsNumValues32); +BENCHMARK_CAPTURE(BM_UnpackUint32, DynamicAligned, true, unpack32) + ->ArgsProduct(kBitWidthsNumValues32); +BENCHMARK_CAPTURE(BM_UnpackUint32, DynamicUnaligned, false, unpack32) + ->ArgsProduct(kBitWidthsNumValues32); -BENCHMARK_CAPTURE(BM_UnpackUint64, unpack64_aligned, true, unpack64) +BENCHMARK_CAPTURE(BM_UnpackUint64, DynamicAligned, true, unpack64) ->ArgsProduct(bitWidthsNumValues64); -BENCHMARK_CAPTURE(BM_UnpackUint64, unpack64_unaligned, false, unpack64) +BENCHMARK_CAPTURE(BM_UnpackUint64, DynamicUnaligned, false, unpack64) ->ArgsProduct(bitWidthsNumValues64); } // namespace diff --git a/cpp/src/arrow/util/bpacking_test.cc b/cpp/src/arrow/util/bpacking_test.cc index 2d93325054c..1d60ad58072 100644 --- a/cpp/src/arrow/util/bpacking_test.cc +++ b/cpp/src/arrow/util/bpacking_test.cc @@ -15,11 +15,12 @@ // specific language governing permissions and limitations // under the License. -#include #include #include +#include "arrow/result.h" +#include "arrow/testing/gtest_util.h" #include "arrow/testing/util.h" #include "arrow/util/bit_stream_utils_internal.h" #include "arrow/util/bpacking_internal.h" @@ -42,10 +43,11 @@ template using UnpackFunc = int (*)(const uint8_t*, Int*, int, int); /// Get the number of bytes associate with a packing. -constexpr int32_t GetNumBytes(int32_t num_values, int32_t bit_width) { +Result GetNumBytes(int32_t num_values, int32_t bit_width) { auto const num_bits = num_values * bit_width; if (num_bits % 8 != 0) { - throw std::invalid_argument("Must pack a multiple of 8 bits."); + return Status::NotImplemented( + "The unpack functions only work on a multiple of 8 bits."); } return num_bits / 8; } @@ -53,7 +55,7 @@ constexpr int32_t GetNumBytes(int32_t num_values, int32_t bit_width) { /// Generate random bytes as packed integers. std::vector GenerateRandomPackedValues(int32_t num_values, int32_t bit_width) { constexpr uint32_t kSeed = 3214; - auto const num_bytes = GetNumBytes(num_values, bit_width); + EXPECT_OK_AND_ASSIGN(const auto num_bytes, GetNumBytes(num_values, bit_width)); std::vector out(num_bytes); random_bytes(num_bytes, kSeed, out.data()); @@ -76,7 +78,7 @@ std::vector UnpackValues(const uint8_t* packed, int32_t num_values, template std::vector PackValues(std::vector const& values, int32_t num_values, int32_t bit_width) { - auto const num_bytes = GetNumBytes(num_values, bit_width); + EXPECT_OK_AND_ASSIGN(const auto num_bytes, GetNumBytes(num_values, bit_width)); std::vector out(static_cast(num_bytes)); bit_util::BitWriter writer(out.data(), num_bytes); @@ -93,7 +95,7 @@ std::vector PackValues(std::vector const& values, int32_t num_valu template void CheckUnpackPackRoundtrip(const uint8_t* packed, int32_t num_values, int32_t bit_width, UnpackFunc unpack) { - auto const num_bytes = GetNumBytes(num_values, bit_width); + EXPECT_OK_AND_ASSIGN(const auto num_bytes, GetNumBytes(num_values, bit_width)); auto const unpacked = UnpackValues(packed, num_values, bit_width, unpack); EXPECT_EQ(unpacked.size(), num_values); @@ -142,7 +144,7 @@ class TestUnpack : public ::testing::TestWithParam { template void TestUnpackZeros(UnpackFunc unpack) { auto [num_values, bit_width] = GetParam(); - const auto num_bytes = GetNumBytes(num_values, bit_width); + EXPECT_OK_AND_ASSIGN(const auto num_bytes, GetNumBytes(num_values, bit_width)); const std::vector packed(static_cast(num_bytes), uint8_t{0}); const auto unpacked = UnpackValues(packed.data(), num_values, bit_width, unpack); @@ -154,7 +156,7 @@ class TestUnpack : public ::testing::TestWithParam { template void TestUnpackOnes(UnpackFunc unpack) { auto [num_values, bit_width] = GetParam(); - const auto num_bytes = GetNumBytes(num_values, bit_width); + EXPECT_OK_AND_ASSIGN(const auto num_bytes, GetNumBytes(num_values, bit_width)); const std::vector packed(static_cast(num_bytes), uint8_t{0xFF}); const auto unpacked = UnpackValues(packed.data(), num_values, bit_width, unpack); @@ -171,7 +173,7 @@ class TestUnpack : public ::testing::TestWithParam { template void TestUnpackAlternating(UnpackFunc unpack) { const auto [num_values, bit_width] = GetParam(); - const auto num_bytes = GetNumBytes(num_values, bit_width); + EXPECT_OK_AND_ASSIGN(const auto num_bytes, GetNumBytes(num_values, bit_width)); const std::vector packed(static_cast(num_bytes), uint8_t{0xAA}); const auto unpacked = UnpackValues(packed.data(), num_values, bit_width, unpack); @@ -211,13 +213,13 @@ class TestUnpack : public ::testing::TestWithParam { }; INSTANTIATE_TEST_SUITE_P( - MutpliesOf64Values, TestUnpack, + UnpackMultiplesOf64Values, TestUnpack, ::testing::Values(TestUnpackSize{64, 1}, TestUnpackSize{128, 1}, TestUnpackSize{2048, 1}, TestUnpackSize{64, 31}, - TestUnpackSize{128, 31}, TestUnpackSize{2048, 31}, - TestUnpackSize{64000, 7}, TestUnpackSize{64000, 8}, - TestUnpackSize{64000, 13}, TestUnpackSize{64000, 16}, - TestUnpackSize{64000, 31}, TestUnpackSize{64000, 32})); + TestUnpackSize{128, 31}, TestUnpackSize{2048, 1}, + TestUnpackSize{2048, 8}, TestUnpackSize{2048, 13}, + TestUnpackSize{2048, 16}, TestUnpackSize{2048, 31}, + TestUnpackSize{2048, 32})); TEST_P(TestUnpack, unpack32Default) { this->TestAll(&unpack32_default); } TEST_P(TestUnpack, unpack64Default) { this->TestAll(&unpack64_default); } From 93204eef88cf07a1414587fef3c7e7ee8df4dd5e Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Tue, 16 Sep 2025 10:38:26 +0200 Subject: [PATCH 14/17] Reformat --- cpp/src/arrow/util/bpacking_benchmark.cc | 10 +++++----- cpp/src/arrow/util/bpacking_test.cc | 10 +++++----- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/cpp/src/arrow/util/bpacking_benchmark.cc b/cpp/src/arrow/util/bpacking_benchmark.cc index 6f95b9e8048..fe7314cec07 100644 --- a/cpp/src/arrow/util/bpacking_benchmark.cc +++ b/cpp/src/arrow/util/bpacking_benchmark.cc @@ -42,7 +42,7 @@ using UnpackFunc = int (*)(const uint8_t*, Int*, int, int); /// Get the number of bytes associate with a packing. constexpr int32_t GetNumBytes(int32_t num_values, int32_t bit_width) { - auto const num_bits = num_values * bit_width; + const auto num_bits = num_values * bit_width; if (num_bits % 8 != 0) { throw std::invalid_argument("Must pack a multiple of 8 bits."); } @@ -52,7 +52,7 @@ constexpr int32_t GetNumBytes(int32_t num_values, int32_t bit_width) { /// Generate random bytes as packed integers. std::vector GenerateRandomPackedValues(int32_t num_values, int32_t bit_width) { constexpr uint32_t kSeed = 3214; - auto const num_bytes = GetNumBytes(num_values, bit_width); + const auto num_bytes = GetNumBytes(num_values, bit_width); std::vector out(num_bytes); random_bytes(num_bytes, kSeed, out.data()); @@ -80,14 +80,14 @@ void BM_Unpack(benchmark::State& state, bool aligned, UnpackFunc unpack, bo state.SkipWithMessage(skip_msg); } - auto const bit_width = static_cast(state.range(0)); - auto const num_values = static_cast(state.range(1)); + const auto bit_width = static_cast(state.range(0)); + const auto num_values = static_cast(state.range(1)); // Assume std::vector allocation is likely be aligned for greater than a byte. // So we allocate more values than necessary and skip to the next byte with the // desired (non) alignment to test the proper condition. constexpr int32_t kExtraValues = sizeof(Int) * 8; - auto const packed = GenerateRandomPackedValues(num_values + kExtraValues, bit_width); + const auto packed = GenerateRandomPackedValues(num_values + kExtraValues, bit_width); const uint8_t* packed_ptr = GetNextAlignedByte(packed.data(), sizeof(Int)) + (aligned ? 0 : 1); diff --git a/cpp/src/arrow/util/bpacking_test.cc b/cpp/src/arrow/util/bpacking_test.cc index 1d60ad58072..586f8f1baaa 100644 --- a/cpp/src/arrow/util/bpacking_test.cc +++ b/cpp/src/arrow/util/bpacking_test.cc @@ -44,7 +44,7 @@ using UnpackFunc = int (*)(const uint8_t*, Int*, int, int); /// Get the number of bytes associate with a packing. Result GetNumBytes(int32_t num_values, int32_t bit_width) { - auto const num_bits = num_values * bit_width; + const auto num_bits = num_values * bit_width; if (num_bits % 8 != 0) { return Status::NotImplemented( "The unpack functions only work on a multiple of 8 bits."); @@ -76,13 +76,13 @@ std::vector UnpackValues(const uint8_t* packed, int32_t num_values, /// Use BitWriter to pack values into a vector. template -std::vector PackValues(std::vector const& values, int32_t num_values, +std::vector PackValues(const std::vector& values, int32_t num_values, int32_t bit_width) { EXPECT_OK_AND_ASSIGN(const auto num_bytes, GetNumBytes(num_values, bit_width)); std::vector out(static_cast(num_bytes)); bit_util::BitWriter writer(out.data(), num_bytes); - for (auto const& v : values) { + for (const auto& v : values) { bool written = writer.PutValue(v, bit_width); if (!written) { throw std::runtime_error("Cannot write move values"); @@ -97,9 +97,9 @@ void CheckUnpackPackRoundtrip(const uint8_t* packed, int32_t num_values, int32_t bit_width, UnpackFunc unpack) { EXPECT_OK_AND_ASSIGN(const auto num_bytes, GetNumBytes(num_values, bit_width)); - auto const unpacked = UnpackValues(packed, num_values, bit_width, unpack); + const auto unpacked = UnpackValues(packed, num_values, bit_width, unpack); EXPECT_EQ(unpacked.size(), num_values); - auto const roundtrip = PackValues(unpacked, num_values, bit_width); + const auto roundtrip = PackValues(unpacked, num_values, bit_width); EXPECT_EQ(num_bytes, roundtrip.size()); for (int i = 0; i < num_bytes; ++i) { EXPECT_EQ(packed[i], roundtrip[i]) << "differ in position " << i; From 90a449558dc03e60eaa1e1a7537c25be9b8817df Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Tue, 16 Sep 2025 10:49:04 +0200 Subject: [PATCH 15/17] Rename unpackNN_default to unpackNN_scalar --- cpp/src/arrow/util/bpacking.cc | 10 +++++----- cpp/src/arrow/util/bpacking_benchmark.cc | 4 ++-- cpp/src/arrow/util/bpacking_internal.h | 8 ++++---- cpp/src/arrow/util/bpacking_test.cc | 4 ++-- 4 files changed, 13 insertions(+), 13 deletions(-) diff --git a/cpp/src/arrow/util/bpacking.cc b/cpp/src/arrow/util/bpacking.cc index a54fb340393..990f76875aa 100644 --- a/cpp/src/arrow/util/bpacking.cc +++ b/cpp/src/arrow/util/bpacking.cc @@ -36,7 +36,7 @@ namespace arrow { namespace internal { -int unpack32_default(const uint8_t* in_, uint32_t* out, int batch_size, int num_bits) { +int unpack32_scalar(const uint8_t* in_, uint32_t* out, int batch_size, int num_bits) { const uint32_t* in = reinterpret_cast(in_); batch_size = batch_size / 32 * 32; @@ -152,10 +152,10 @@ int unpack32_default(const uint8_t* in_, uint32_t* out, int batch_size, int num_ namespace { struct Unpack32DynamicFunction { - using FunctionType = decltype(&unpack32_default); + using FunctionType = decltype(&unpack32_scalar); static std::vector> implementations() { - return {{DispatchLevel::NONE, unpack32_default} + return {{DispatchLevel::NONE, unpack32_scalar} #if defined(ARROW_HAVE_RUNTIME_AVX2) , {DispatchLevel::AVX2, unpack32_avx2} @@ -179,7 +179,7 @@ int unpack32(const uint8_t* in, uint32_t* out, int batch_size, int num_bits) { #endif } -int unpack64_default(const uint8_t* in, uint64_t* out, int batch_size, int num_bits) { +int unpack64_scalar(const uint8_t* in, uint64_t* out, int batch_size, int num_bits) { batch_size = batch_size / 32 * 32; int num_loops = batch_size / 32; @@ -388,7 +388,7 @@ int unpack64_default(const uint8_t* in, uint64_t* out, int batch_size, int num_b int unpack64(const uint8_t* in, uint64_t* out, int batch_size, int num_bits) { // TODO: unpack64_neon, unpack64_avx2 and unpack64_avx512 - return unpack64_default(in, out, batch_size, num_bits); + return unpack64_scalar(in, out, batch_size, num_bits); } } // namespace internal diff --git a/cpp/src/arrow/util/bpacking_benchmark.cc b/cpp/src/arrow/util/bpacking_benchmark.cc index fe7314cec07..e5ad3636ef3 100644 --- a/cpp/src/arrow/util/bpacking_benchmark.cc +++ b/cpp/src/arrow/util/bpacking_benchmark.cc @@ -124,9 +124,9 @@ void BM_UnpackUint64(benchmark::State& state, bool aligned, UnpackFunc return BM_Unpack(state, aligned, unpack, skip, std::move(skip_msg)); } -BENCHMARK_CAPTURE(BM_UnpackUint32, ScalarUnaligned, false, unpack32_default) +BENCHMARK_CAPTURE(BM_UnpackUint32, ScalarUnaligned, false, unpack32_scalar) ->ArgsProduct(kBitWidthsNumValues32); -BENCHMARK_CAPTURE(BM_UnpackUint64, ScalarUnaligned, false, unpack64_default) +BENCHMARK_CAPTURE(BM_UnpackUint64, ScalarUnaligned, false, unpack64_scalar) ->ArgsProduct(bitWidthsNumValues64); #if defined(ARROW_HAVE_RUNTIME_AVX2) diff --git a/cpp/src/arrow/util/bpacking_internal.h b/cpp/src/arrow/util/bpacking_internal.h index 53bb7a6b337..e003cd8c0c6 100644 --- a/cpp/src/arrow/util/bpacking_internal.h +++ b/cpp/src/arrow/util/bpacking_internal.h @@ -24,12 +24,12 @@ namespace arrow::internal { /// The scalar 32 bit unpacking. -ARROW_EXPORT int unpack32_default(const uint8_t* in, uint32_t* out, int batch_size, - int num_bits); +ARROW_EXPORT int unpack32_scalar(const uint8_t* in, uint32_t* out, int batch_size, + int num_bits); /// The scalar 64 bit unpacking. -ARROW_EXPORT int unpack64_default(const uint8_t* in, uint64_t* out, int batch_size, - int num_bits); +ARROW_EXPORT int unpack64_scalar(const uint8_t* in, uint64_t* out, int batch_size, + int num_bits); ARROW_EXPORT int unpack32(const uint8_t* in, uint32_t* out, int batch_size, int num_bits); diff --git a/cpp/src/arrow/util/bpacking_test.cc b/cpp/src/arrow/util/bpacking_test.cc index 586f8f1baaa..7562b6162c4 100644 --- a/cpp/src/arrow/util/bpacking_test.cc +++ b/cpp/src/arrow/util/bpacking_test.cc @@ -221,8 +221,8 @@ INSTANTIATE_TEST_SUITE_P( TestUnpackSize{2048, 16}, TestUnpackSize{2048, 31}, TestUnpackSize{2048, 32})); -TEST_P(TestUnpack, unpack32Default) { this->TestAll(&unpack32_default); } -TEST_P(TestUnpack, unpack64Default) { this->TestAll(&unpack64_default); } +TEST_P(TestUnpack, unpack32Default) { this->TestAll(&unpack32_scalar); } +TEST_P(TestUnpack, unpack64Default) { this->TestAll(&unpack64_scalar); } #if defined(ARROW_HAVE_RUNTIME_AVX2) TEST_P(TestUnpack, unpack32Avx2) { From fc4b54bda9ffc9236d964449e8587eb3b9fe4ba0 Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Tue, 16 Sep 2025 10:59:28 +0200 Subject: [PATCH 16/17] Fix renaming omission --- cpp/src/arrow/util/bpacking_benchmark.cc | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/cpp/src/arrow/util/bpacking_benchmark.cc b/cpp/src/arrow/util/bpacking_benchmark.cc index e5ad3636ef3..f0ac22910c6 100644 --- a/cpp/src/arrow/util/bpacking_benchmark.cc +++ b/cpp/src/arrow/util/bpacking_benchmark.cc @@ -108,7 +108,7 @@ static const std::vector> kBitWidthsNumValues32 = { kBitWidths32, benchmark::CreateRange(kMinRange, kMaxRange, /*multi=*/32), }; -static const std::vector> bitWidthsNumValues64 = { +static const std::vector> kBitWidthsNumValues64 = { kBitWidths64, benchmark::CreateRange(kMinRange, kMaxRange, /*multi=*/32), }; @@ -127,20 +127,20 @@ void BM_UnpackUint64(benchmark::State& state, bool aligned, UnpackFunc BENCHMARK_CAPTURE(BM_UnpackUint32, ScalarUnaligned, false, unpack32_scalar) ->ArgsProduct(kBitWidthsNumValues32); BENCHMARK_CAPTURE(BM_UnpackUint64, ScalarUnaligned, false, unpack64_scalar) - ->ArgsProduct(bitWidthsNumValues64); + ->ArgsProduct(kBitWidthsNumValues64); #if defined(ARROW_HAVE_RUNTIME_AVX2) BENCHMARK_CAPTURE(BM_UnpackUint32, Avx2Unaligned, false, unpack32_avx2, !CpuInfo::GetInstance()->IsSupported(CpuInfo::AVX2), "Avx2 not available") - ->ArgsProduct(bitWidthsNumValues32); + ->ArgsProduct(kBitWidthsNumValues32); #endif #if defined(ARROW_HAVE_RUNTIME_AVX512) BENCHMARK_CAPTURE(BM_UnpackUint32, Avx512Unaligned, false, unpack32_avx512, !CpuInfo::GetInstance()->IsSupported(CpuInfo::AVX512), "Avx512 not available") - ->ArgsProduct(bitWidthsNumValues32); + ->ArgsProduct(kBitWidthsNumValues32); #endif #if defined(ARROW_HAVE_NEON) @@ -154,9 +154,9 @@ BENCHMARK_CAPTURE(BM_UnpackUint32, DynamicUnaligned, false, unpack32) ->ArgsProduct(kBitWidthsNumValues32); BENCHMARK_CAPTURE(BM_UnpackUint64, DynamicAligned, true, unpack64) - ->ArgsProduct(bitWidthsNumValues64); + ->ArgsProduct(kBitWidthsNumValues64); BENCHMARK_CAPTURE(BM_UnpackUint64, DynamicUnaligned, false, unpack64) - ->ArgsProduct(bitWidthsNumValues64); + ->ArgsProduct(kBitWidthsNumValues64); } // namespace } // namespace arrow::internal From caeec86fd24fbe0cb5971baceecae8b91230129b Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Tue, 16 Sep 2025 11:45:34 +0200 Subject: [PATCH 17/17] Naming nits --- cpp/src/arrow/util/bpacking_test.cc | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/cpp/src/arrow/util/bpacking_test.cc b/cpp/src/arrow/util/bpacking_test.cc index 7562b6162c4..c2dd4748a44 100644 --- a/cpp/src/arrow/util/bpacking_test.cc +++ b/cpp/src/arrow/util/bpacking_test.cc @@ -221,11 +221,11 @@ INSTANTIATE_TEST_SUITE_P( TestUnpackSize{2048, 16}, TestUnpackSize{2048, 31}, TestUnpackSize{2048, 32})); -TEST_P(TestUnpack, unpack32Default) { this->TestAll(&unpack32_scalar); } -TEST_P(TestUnpack, unpack64Default) { this->TestAll(&unpack64_scalar); } +TEST_P(TestUnpack, Unpack32Scalar) { this->TestAll(&unpack32_scalar); } +TEST_P(TestUnpack, Unpack64Scalar) { this->TestAll(&unpack64_scalar); } #if defined(ARROW_HAVE_RUNTIME_AVX2) -TEST_P(TestUnpack, unpack32Avx2) { +TEST_P(TestUnpack, Unpack32Avx2) { if (!CpuInfo::GetInstance()->IsSupported(CpuInfo::AVX2)) { GTEST_SKIP() << "Test requires AVX2"; } @@ -234,7 +234,7 @@ TEST_P(TestUnpack, unpack32Avx2) { #endif #if defined(ARROW_HAVE_RUNTIME_AVX512) -TEST_P(TestUnpack, unpack32Avx512) { +TEST_P(TestUnpack, Unpack32Avx512) { if (!CpuInfo::GetInstance()->IsSupported(CpuInfo::AVX512)) { GTEST_SKIP() << "Test requires AVX512"; } @@ -243,10 +243,10 @@ TEST_P(TestUnpack, unpack32Avx512) { #endif #if defined(ARROW_HAVE_NEON) -TEST_P(TestUnpack, unpack32Neon) { this->TestAll(&unpack32_neon); } +TEST_P(TestUnpack, Unpack32Neon) { this->TestAll(&unpack32_neon); } #endif -TEST_P(TestUnpack, unpack32) { this->TestAll(&unpack32); } -TEST_P(TestUnpack, unpack64) { this->TestAll(&unpack64); } +TEST_P(TestUnpack, Unpack32) { this->TestAll(&unpack32); } +TEST_P(TestUnpack, Unpack64) { this->TestAll(&unpack64); } } // namespace arrow::internal