diff --git a/cpp/src/arrow/util/CMakeLists.txt b/cpp/src/arrow/util/CMakeLists.txt index df47389240e..56545f6aa79 100644 --- a/cpp/src/arrow/util/CMakeLists.txt +++ b/cpp/src/arrow/util/CMakeLists.txt @@ -98,6 +98,7 @@ add_arrow_test(bit-utility-test SOURCES bit_block_counter_test.cc bit_util_test.cc + bpacking_test.cc rle_encoding_test.cc) add_arrow_test(threading-utility-test @@ -117,6 +118,7 @@ add_arrow_test(crc32-test add_arrow_benchmark(bit_block_counter_benchmark) add_arrow_benchmark(bit_util_benchmark) +add_arrow_benchmark(bpacking_benchmark) add_arrow_benchmark(bitmap_reader_benchmark) add_arrow_benchmark(cache_benchmark) add_arrow_benchmark(compression_benchmark) diff --git a/cpp/src/arrow/util/bit_stream_utils_internal.h b/cpp/src/arrow/util/bit_stream_utils_internal.h index 9d67c278bcc..2b5ec3830ee 100644 --- a/cpp/src/arrow/util/bit_stream_utils_internal.h +++ b/cpp/src/arrow/util/bit_stream_utils_internal.h @@ -25,6 +25,7 @@ #include "arrow/util/bit_util.h" #include "arrow/util/bpacking_internal.h" +#include "arrow/util/endian.h" #include "arrow/util/logging.h" #include "arrow/util/macros.h" #include "arrow/util/ubsan.h" @@ -339,8 +340,8 @@ inline int BitReader::GetBatch(int num_bits, T* v, int batch_size) { if (sizeof(T) == 4) { int num_unpacked = - internal::unpack32(reinterpret_cast(buffer + byte_offset), - reinterpret_cast(v + i), batch_size - i, num_bits); + internal::unpack32(buffer + byte_offset, reinterpret_cast(v + i), + batch_size - i, num_bits); i += num_unpacked; byte_offset += num_unpacked * num_bits / 8; } else if (sizeof(T) == 8 && num_bits > 32) { @@ -360,8 +361,7 @@ inline int BitReader::GetBatch(int num_bits, T* v, int batch_size) { while (i < batch_size) { int unpack_size = std::min(buffer_size, batch_size - i); int num_unpacked = - internal::unpack32(reinterpret_cast(buffer + byte_offset), - unpack_buffer, unpack_size, num_bits); + internal::unpack32(buffer + byte_offset, unpack_buffer, unpack_size, num_bits); if (num_unpacked == 
0) { break; } diff --git a/cpp/src/arrow/util/bpacking.cc b/cpp/src/arrow/util/bpacking.cc index 326dd050fe1..990f76875aa 100644 --- a/cpp/src/arrow/util/bpacking.cc +++ b/cpp/src/arrow/util/bpacking.cc @@ -36,9 +36,9 @@ namespace arrow { namespace internal { -namespace { +int unpack32_scalar(const uint8_t* in_, uint32_t* out, int batch_size, int num_bits) { + const uint32_t* in = reinterpret_cast(in_); -int unpack32_default(const uint32_t* in, uint32_t* out, int batch_size, int num_bits) { batch_size = batch_size / 32 * 32; int num_loops = batch_size / 32; @@ -149,11 +149,13 @@ int unpack32_default(const uint32_t* in, uint32_t* out, int batch_size, int num_ return batch_size; } +namespace { + struct Unpack32DynamicFunction { - using FunctionType = decltype(&unpack32_default); + using FunctionType = decltype(&unpack32_scalar); static std::vector> implementations() { - return {{DispatchLevel::NONE, unpack32_default} + return {{DispatchLevel::NONE, unpack32_scalar} #if defined(ARROW_HAVE_RUNTIME_AVX2) , {DispatchLevel::AVX2, unpack32_avx2} @@ -168,7 +170,7 @@ struct Unpack32DynamicFunction { } // namespace -int unpack32(const uint32_t* in, uint32_t* out, int batch_size, int num_bits) { +int unpack32(const uint8_t* in, uint32_t* out, int batch_size, int num_bits) { #if defined(ARROW_HAVE_NEON) return unpack32_neon(in, out, batch_size, num_bits); #else @@ -177,9 +179,7 @@ int unpack32(const uint32_t* in, uint32_t* out, int batch_size, int num_bits) { #endif } -namespace { - -int unpack64_default(const uint8_t* in, uint64_t* out, int batch_size, int num_bits) { +int unpack64_scalar(const uint8_t* in, uint64_t* out, int batch_size, int num_bits) { batch_size = batch_size / 32 * 32; int num_loops = batch_size / 32; @@ -386,11 +386,9 @@ int unpack64_default(const uint8_t* in, uint64_t* out, int batch_size, int num_b return batch_size; } -} // namespace - int unpack64(const uint8_t* in, uint64_t* out, int batch_size, int num_bits) { // TODO: unpack64_neon, unpack64_avx2 and 
unpack64_avx512 - return unpack64_default(in, out, batch_size, num_bits); + return unpack64_scalar(in, out, batch_size, num_bits); } } // namespace internal diff --git a/cpp/src/arrow/util/bpacking64_default_internal.h b/cpp/src/arrow/util/bpacking64_default_internal.h index 4f45619b2a7..256cdda87e3 100644 --- a/cpp/src/arrow/util/bpacking64_default_internal.h +++ b/cpp/src/arrow/util/bpacking64_default_internal.h @@ -26,11 +26,10 @@ #pragma once -#include "arrow/util/bit_util.h" +#include "arrow/util/endian.h" #include "arrow/util/ubsan.h" -namespace arrow { -namespace internal { +namespace arrow::internal { inline const uint8_t* unpack0_64(const uint8_t* in, uint64_t* out) { for (int k = 0; k < 32; k += 1) { @@ -5638,5 +5637,4 @@ inline const uint8_t* unpack64_64(const uint8_t* in, uint64_t* out) { return in; } -} // namespace internal -} // namespace arrow +} // namespace arrow::internal diff --git a/cpp/src/arrow/util/bpacking_avx2.cc b/cpp/src/arrow/util/bpacking_avx2.cc index 02510a07b9f..84f091594c1 100644 --- a/cpp/src/arrow/util/bpacking_avx2.cc +++ b/cpp/src/arrow/util/bpacking_avx2.cc @@ -19,13 +19,11 @@ #include "arrow/util/bpacking_simd256_generated_internal.h" #include "arrow/util/bpacking_simd_internal.h" -namespace arrow { -namespace internal { +namespace arrow::internal { -int unpack32_avx2(const uint32_t* in, uint32_t* out, int batch_size, int num_bits) { - return unpack32_specialized>(in, out, batch_size, - num_bits); +int unpack32_avx2(const uint8_t* in, uint32_t* out, int batch_size, int num_bits) { + return unpack32_specialized>( + reinterpret_cast(in), out, batch_size, num_bits); } -} // namespace internal -} // namespace arrow +} // namespace arrow::internal diff --git a/cpp/src/arrow/util/bpacking_avx2_internal.h b/cpp/src/arrow/util/bpacking_avx2_internal.h index 7a7d8bf8c44..b2c213fe2aa 100644 --- a/cpp/src/arrow/util/bpacking_avx2_internal.h +++ b/cpp/src/arrow/util/bpacking_avx2_internal.h @@ -17,12 +17,13 @@ #pragma once -#include 
+#include "arrow/util/visibility.h" -namespace arrow { -namespace internal { +#include -int unpack32_avx2(const uint32_t* in, uint32_t* out, int batch_size, int num_bits); +namespace arrow::internal { -} // namespace internal -} // namespace arrow +ARROW_EXPORT int unpack32_avx2(const uint8_t* in, uint32_t* out, int batch_size, + int num_bits); + +} // namespace arrow::internal diff --git a/cpp/src/arrow/util/bpacking_avx512.cc b/cpp/src/arrow/util/bpacking_avx512.cc index 6272ef1cde8..35de0dd5b47 100644 --- a/cpp/src/arrow/util/bpacking_avx512.cc +++ b/cpp/src/arrow/util/bpacking_avx512.cc @@ -19,13 +19,11 @@ #include "arrow/util/bpacking_simd512_generated_internal.h" #include "arrow/util/bpacking_simd_internal.h" -namespace arrow { -namespace internal { +namespace arrow::internal { -int unpack32_avx512(const uint32_t* in, uint32_t* out, int batch_size, int num_bits) { - return unpack32_specialized>(in, out, batch_size, - num_bits); +int unpack32_avx512(const uint8_t* in, uint32_t* out, int batch_size, int num_bits) { + return unpack32_specialized>( + reinterpret_cast(in), out, batch_size, num_bits); } -} // namespace internal -} // namespace arrow +} // namespace arrow::internal diff --git a/cpp/src/arrow/util/bpacking_avx512_internal.h b/cpp/src/arrow/util/bpacking_avx512_internal.h index 96723f803e0..847aa981433 100644 --- a/cpp/src/arrow/util/bpacking_avx512_internal.h +++ b/cpp/src/arrow/util/bpacking_avx512_internal.h @@ -17,12 +17,13 @@ #pragma once -#include +#include "arrow/util/visibility.h" -namespace arrow { -namespace internal { +#include -int unpack32_avx512(const uint32_t* in, uint32_t* out, int batch_size, int num_bits); +namespace arrow::internal { -} // namespace internal -} // namespace arrow +ARROW_EXPORT int unpack32_avx512(const uint8_t* in, uint32_t* out, int batch_size, + int num_bits); + +} // namespace arrow::internal diff --git a/cpp/src/arrow/util/bpacking_benchmark.cc b/cpp/src/arrow/util/bpacking_benchmark.cc new file mode 100644 
index 00000000000..f0ac22910c6 --- /dev/null +++ b/cpp/src/arrow/util/bpacking_benchmark.cc @@ -0,0 +1,162 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include +#include + +#include + +#include "arrow/testing/util.h" +#include "arrow/util/bpacking_internal.h" + +#if defined(ARROW_HAVE_RUNTIME_AVX2) +# include "arrow/util/bpacking_avx2_internal.h" +# include "arrow/util/cpu_info.h" +#endif +#if defined(ARROW_HAVE_RUNTIME_AVX512) +# include "arrow/util/bpacking_avx512_internal.h" +#endif +#if defined(ARROW_HAVE_NEON) +# include "arrow/util/bpacking_neon_internal.h" +#endif + +namespace arrow::internal { +namespace { + +template +using UnpackFunc = int (*)(const uint8_t*, Int*, int, int); + +/// Get the number of bytes associated with a packing. +constexpr int32_t GetNumBytes(int32_t num_values, int32_t bit_width) { + const auto num_bits = num_values * bit_width; + if (num_bits % 8 != 0) { + throw std::invalid_argument("Must pack a multiple of 8 bits."); + } + return num_bits / 8; +} + +/// Generate random bytes as packed integers.
+std::vector GenerateRandomPackedValues(int32_t num_values, int32_t bit_width) { + constexpr uint32_t kSeed = 3214; + const auto num_bytes = GetNumBytes(num_values, bit_width); + + std::vector out(num_bytes); + random_bytes(num_bytes, kSeed, out.data()); + + return out; +} + +const uint8_t* GetNextAlignedByte(const uint8_t* ptr, std::size_t alignment) { + auto addr = reinterpret_cast(ptr); + + if (addr % alignment == 0) { + return ptr; + } + + auto remainder = addr % alignment; + auto bytes_to_add = alignment - remainder; + + return ptr + bytes_to_add; +} + +template +void BM_Unpack(benchmark::State& state, bool aligned, UnpackFunc unpack, bool skip, + std::string skip_msg) { + if (skip) { + state.SkipWithMessage(skip_msg); + } + + const auto bit_width = static_cast(state.range(0)); + const auto num_values = static_cast(state.range(1)); + + // Assume std::vector allocation is likely to be aligned for greater than a byte. + // So we allocate more values than necessary and skip to the next byte with the + // desired (non) alignment to test the proper condition. + constexpr int32_t kExtraValues = sizeof(Int) * 8; + const auto packed = GenerateRandomPackedValues(num_values + kExtraValues, bit_width); + const uint8_t* packed_ptr = + GetNextAlignedByte(packed.data(), sizeof(Int)) + (aligned ? 
0 : 1); + + std::vector unpacked(num_values, 0); + + for (auto _ : state) { + unpack(packed_ptr, unpacked.data(), num_values, bit_width); + benchmark::ClobberMemory(); + } + state.SetItemsProcessed(num_values * state.iterations()); +} + +constexpr int32_t kMinRange = 64; +constexpr int32_t kMaxRange = 32768; +constexpr std::initializer_list kBitWidths32 = {1, 2, 8, 20}; +constexpr std::initializer_list kBitWidths64 = {1, 2, 8, 20, 47}; +static const std::vector> kBitWidthsNumValues32 = { + kBitWidths32, + benchmark::CreateRange(kMinRange, kMaxRange, /*multi=*/32), +}; +static const std::vector> kBitWidthsNumValues64 = { + kBitWidths64, + benchmark::CreateRange(kMinRange, kMaxRange, /*multi=*/32), +}; + +/// Nudge for MSVC template inside BENCHMARK_CAPTURE macro. +void BM_UnpackUint32(benchmark::State& state, bool aligned, UnpackFunc unpack, + bool skip = false, std::string skip_msg = "") { + return BM_Unpack(state, aligned, unpack, skip, std::move(skip_msg)); +} +/// Nudge for MSVC template inside BENCHMARK_CAPTURE macro. 
+void BM_UnpackUint64(benchmark::State& state, bool aligned, UnpackFunc unpack, + bool skip = false, std::string skip_msg = "") { + return BM_Unpack(state, aligned, unpack, skip, std::move(skip_msg)); +} + +BENCHMARK_CAPTURE(BM_UnpackUint32, ScalarUnaligned, false, unpack32_scalar) + ->ArgsProduct(kBitWidthsNumValues32); +BENCHMARK_CAPTURE(BM_UnpackUint64, ScalarUnaligned, false, unpack64_scalar) + ->ArgsProduct(kBitWidthsNumValues64); + +#if defined(ARROW_HAVE_RUNTIME_AVX2) +BENCHMARK_CAPTURE(BM_UnpackUint32, Avx2Unaligned, false, unpack32_avx2, + !CpuInfo::GetInstance()->IsSupported(CpuInfo::AVX2), + "Avx2 not available") + ->ArgsProduct(kBitWidthsNumValues32); +#endif + +#if defined(ARROW_HAVE_RUNTIME_AVX512) +BENCHMARK_CAPTURE(BM_UnpackUint32, Avx512Unaligned, false, unpack32_avx512, + !CpuInfo::GetInstance()->IsSupported(CpuInfo::AVX512), + "Avx512 not available") + ->ArgsProduct(kBitWidthsNumValues32); +#endif + +#if defined(ARROW_HAVE_NEON) +BENCHMARK_CAPTURE(BM_UnpackUint32, NeonUnaligned, false, unpack32_neon) + ->ArgsProduct(kBitWidthsNumValues32); +#endif + +BENCHMARK_CAPTURE(BM_UnpackUint32, DynamicAligned, true, unpack32) + ->ArgsProduct(kBitWidthsNumValues32); +BENCHMARK_CAPTURE(BM_UnpackUint32, DynamicUnaligned, false, unpack32) + ->ArgsProduct(kBitWidthsNumValues32); + +BENCHMARK_CAPTURE(BM_UnpackUint64, DynamicAligned, true, unpack64) + ->ArgsProduct(kBitWidthsNumValues64); +BENCHMARK_CAPTURE(BM_UnpackUint64, DynamicUnaligned, false, unpack64) + ->ArgsProduct(kBitWidthsNumValues64); + +} // namespace +} // namespace arrow::internal diff --git a/cpp/src/arrow/util/bpacking_internal.h b/cpp/src/arrow/util/bpacking_internal.h index dd85c1638c7..e003cd8c0c6 100644 --- a/cpp/src/arrow/util/bpacking_internal.h +++ b/cpp/src/arrow/util/bpacking_internal.h @@ -17,18 +17,23 @@ #pragma once -#include "arrow/util/endian.h" #include "arrow/util/visibility.h" -#include +#include -namespace arrow { -namespace internal { +namespace arrow::internal { + +/// The 
scalar 32 bit unpacking. +ARROW_EXPORT int unpack32_scalar(const uint8_t* in, uint32_t* out, int batch_size, + int num_bits); + +/// The scalar 64 bit unpacking. +ARROW_EXPORT int unpack64_scalar(const uint8_t* in, uint64_t* out, int batch_size, + int num_bits); ARROW_EXPORT -int unpack32(const uint32_t* in, uint32_t* out, int batch_size, int num_bits); +int unpack32(const uint8_t* in, uint32_t* out, int batch_size, int num_bits); ARROW_EXPORT int unpack64(const uint8_t* in, uint64_t* out, int batch_size, int num_bits); -} // namespace internal -} // namespace arrow +} // namespace arrow::internal diff --git a/cpp/src/arrow/util/bpacking_neon.cc b/cpp/src/arrow/util/bpacking_neon.cc index 72b520e8cf1..407b309b7e8 100644 --- a/cpp/src/arrow/util/bpacking_neon.cc +++ b/cpp/src/arrow/util/bpacking_neon.cc @@ -19,13 +19,11 @@ #include "arrow/util/bpacking_simd128_generated_internal.h" #include "arrow/util/bpacking_simd_internal.h" -namespace arrow { -namespace internal { +namespace arrow::internal { -int unpack32_neon(const uint32_t* in, uint32_t* out, int batch_size, int num_bits) { - return unpack32_specialized>(in, out, batch_size, - num_bits); +int unpack32_neon(const uint8_t* in, uint32_t* out, int batch_size, int num_bits) { + return unpack32_specialized>( + reinterpret_cast(in), out, batch_size, num_bits); } -} // namespace internal -} // namespace arrow +} // namespace arrow::internal diff --git a/cpp/src/arrow/util/bpacking_neon_internal.h b/cpp/src/arrow/util/bpacking_neon_internal.h index 9d02cd568ac..683aa5cbc47 100644 --- a/cpp/src/arrow/util/bpacking_neon_internal.h +++ b/cpp/src/arrow/util/bpacking_neon_internal.h @@ -17,12 +17,13 @@ #pragma once -#include +#include "arrow/util/visibility.h" -namespace arrow { -namespace internal { +#include -int unpack32_neon(const uint32_t* in, uint32_t* out, int batch_size, int num_bits); +namespace arrow::internal { -} // namespace internal -} // namespace arrow +ARROW_EXPORT int unpack32_neon(const uint8_t* in, 
uint32_t* out, int batch_size, + int num_bits); + +} // namespace arrow::internal diff --git a/cpp/src/arrow/util/bpacking_test.cc b/cpp/src/arrow/util/bpacking_test.cc new file mode 100644 index 00000000000..c2dd4748a44 --- /dev/null +++ b/cpp/src/arrow/util/bpacking_test.cc @@ -0,0 +1,252 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include + +#include + +#include "arrow/result.h" +#include "arrow/testing/gtest_util.h" +#include "arrow/testing/util.h" +#include "arrow/util/bit_stream_utils_internal.h" +#include "arrow/util/bpacking_internal.h" +#include "arrow/util/logging.h" + +#if defined(ARROW_HAVE_RUNTIME_AVX2) +# include "arrow/util/bpacking_avx2_internal.h" +# include "arrow/util/cpu_info.h" +#endif +#if defined(ARROW_HAVE_RUNTIME_AVX512) +# include "arrow/util/bpacking_avx512_internal.h" +#endif +#if defined(ARROW_HAVE_NEON) +# include "arrow/util/bpacking_neon_internal.h" +#endif + +namespace arrow::internal { + +template +using UnpackFunc = int (*)(const uint8_t*, Int*, int, int); + +/// Get the number of bytes associate with a packing. 
+Result GetNumBytes(int32_t num_values, int32_t bit_width) { + const auto num_bits = num_values * bit_width; + if (num_bits % 8 != 0) { + return Status::NotImplemented( + "The unpack functions only work on a multiple of 8 bits."); + } + return num_bits / 8; +} + +/// Generate random bytes as packed integers. +std::vector GenerateRandomPackedValues(int32_t num_values, int32_t bit_width) { + constexpr uint32_t kSeed = 3214; + EXPECT_OK_AND_ASSIGN(const auto num_bytes, GetNumBytes(num_values, bit_width)); + + std::vector out(num_bytes); + random_bytes(num_bytes, kSeed, out.data()); + + return out; +} + +/// Convenience wrapper to unpack into a vector +template +std::vector UnpackValues(const uint8_t* packed, int32_t num_values, + int32_t bit_width, UnpackFunc unpack) { + std::vector out(num_values); + int values_read = unpack(packed, out.data(), num_values, bit_width); + ARROW_DCHECK_GE(values_read, 0); + out.resize(values_read); + return out; +} + +/// Use BitWriter to pack values into a vector. 
+template +std::vector PackValues(const std::vector& values, int32_t num_values, + int32_t bit_width) { + EXPECT_OK_AND_ASSIGN(const auto num_bytes, GetNumBytes(num_values, bit_width)); + + std::vector out(static_cast(num_bytes)); + bit_util::BitWriter writer(out.data(), num_bytes); + for (const auto& v : values) { + bool written = writer.PutValue(v, bit_width); + if (!written) { + throw std::runtime_error("Cannot write more values"); + } + } + + return out; +} + +template +void CheckUnpackPackRoundtrip(const uint8_t* packed, int32_t num_values, + int32_t bit_width, UnpackFunc unpack) { + EXPECT_OK_AND_ASSIGN(const auto num_bytes, GetNumBytes(num_values, bit_width)); + + const auto unpacked = UnpackValues(packed, num_values, bit_width, unpack); + EXPECT_EQ(unpacked.size(), num_values); + const auto roundtrip = PackValues(unpacked, num_values, bit_width); + EXPECT_EQ(num_bytes, roundtrip.size()); + for (int i = 0; i < num_bytes; ++i) { + EXPECT_EQ(packed[i], roundtrip[i]) << "differ in position " << i; + } +} + +const uint8_t* GetNextAlignedByte(const uint8_t* ptr, std::size_t alignment) { + auto addr = reinterpret_cast(ptr); + + if (addr % alignment == 0) { + return ptr; + } + + auto remainder = addr % alignment; + auto bytes_to_add = alignment - remainder; + + return ptr + bytes_to_add; +} + +struct TestUnpackSize { + int32_t num_values; + int32_t bit_width; +}; + +class TestUnpack : public ::testing::TestWithParam { + protected: + template + void TestRoundtripAlignment(UnpackFunc unpack, std::size_t alignment_offset) { + auto [num_values, bit_width] = GetParam(); + + // Assume std::vector allocation is likely to be aligned for greater than a byte. + // So we allocate more values than necessary and skip to the next byte with the + // desired (non) alignment to test the proper condition. 
+ constexpr int32_t kExtraValues = sizeof(Int) * 8; + const auto packed = GenerateRandomPackedValues(num_values + kExtraValues, bit_width); + const uint8_t* packed_unaligned = + GetNextAlignedByte(packed.data(), sizeof(Int)) + alignment_offset; + + CheckUnpackPackRoundtrip(packed_unaligned, num_values, bit_width, unpack); + } + + template + void TestUnpackZeros(UnpackFunc unpack) { + auto [num_values, bit_width] = GetParam(); + EXPECT_OK_AND_ASSIGN(const auto num_bytes, GetNumBytes(num_values, bit_width)); + + const std::vector packed(static_cast(num_bytes), uint8_t{0}); + const auto unpacked = UnpackValues(packed.data(), num_values, bit_width, unpack); + + const std::vector expected(static_cast(num_values), Int{0}); + EXPECT_EQ(unpacked, expected); + } + + template + void TestUnpackOnes(UnpackFunc unpack) { + auto [num_values, bit_width] = GetParam(); + EXPECT_OK_AND_ASSIGN(const auto num_bytes, GetNumBytes(num_values, bit_width)); + + const std::vector packed(static_cast(num_bytes), uint8_t{0xFF}); + const auto unpacked = UnpackValues(packed.data(), num_values, bit_width, unpack); + + // Generate bit_width ones + Int expected_value = 0; + for (int i = 0; i < bit_width; ++i) { + expected_value = (expected_value << 1) | 1; + } + const std::vector expected(static_cast(num_values), expected_value); + EXPECT_EQ(unpacked, expected); + } + + template + void TestUnpackAlternating(UnpackFunc unpack) { + const auto [num_values, bit_width] = GetParam(); + EXPECT_OK_AND_ASSIGN(const auto num_bytes, GetNumBytes(num_values, bit_width)); + + const std::vector packed(static_cast(num_bytes), uint8_t{0xAA}); + const auto unpacked = UnpackValues(packed.data(), num_values, bit_width, unpack); + + // Generate an alternating bit sequence starting with either 0 or 1 + Int one_zero_value = 0; + Int zero_one_value = 0; + for (int i = 0; i < bit_width; ++i) { + zero_one_value = (zero_one_value << 1) | (i % 2); + one_zero_value = (one_zero_value << 1) | ((i + 1) % 2); + } + + std::vector 
expected; + if (bit_width % 2 == 0) { + // For even bit_width, the same pattern repeats every time + expected.resize(static_cast(num_values), one_zero_value); + } else { + // For odd bit_width, we alternate a pattern leading with 0 and 1 + for (int i = 0; i < num_values; ++i) { + expected.push_back(i % 2 == 0 ? zero_one_value : one_zero_value); + } + } + EXPECT_EQ(unpacked, expected); + } + + template + void TestAll(UnpackFunc unpack) { + // Known values + TestUnpackZeros(unpack); + TestUnpackOnes(unpack); + TestUnpackAlternating(unpack); + + // Roundtrips + TestRoundtripAlignment(unpack, /* alignment_offset= */ 0); + TestRoundtripAlignment(unpack, /* alignment_offset= */ 1); + } +}; + +INSTANTIATE_TEST_SUITE_P( + UnpackMultiplesOf64Values, TestUnpack, + ::testing::Values(TestUnpackSize{64, 1}, TestUnpackSize{128, 1}, + TestUnpackSize{2048, 1}, TestUnpackSize{64, 31}, + TestUnpackSize{128, 31}, TestUnpackSize{2048, 1}, + TestUnpackSize{2048, 8}, TestUnpackSize{2048, 13}, + TestUnpackSize{2048, 16}, TestUnpackSize{2048, 31}, + TestUnpackSize{2048, 32})); + +TEST_P(TestUnpack, Unpack32Scalar) { this->TestAll(&unpack32_scalar); } +TEST_P(TestUnpack, Unpack64Scalar) { this->TestAll(&unpack64_scalar); } + +#if defined(ARROW_HAVE_RUNTIME_AVX2) +TEST_P(TestUnpack, Unpack32Avx2) { + if (!CpuInfo::GetInstance()->IsSupported(CpuInfo::AVX2)) { + GTEST_SKIP() << "Test requires AVX2"; + } + this->TestAll(&unpack32_avx2); +} +#endif + +#if defined(ARROW_HAVE_RUNTIME_AVX512) +TEST_P(TestUnpack, Unpack32Avx512) { + if (!CpuInfo::GetInstance()->IsSupported(CpuInfo::AVX512)) { + GTEST_SKIP() << "Test requires AVX512"; + } + this->TestAll(&unpack32_avx512); +} +#endif + +#if defined(ARROW_HAVE_NEON) +TEST_P(TestUnpack, Unpack32Neon) { this->TestAll(&unpack32_neon); } +#endif + +TEST_P(TestUnpack, Unpack32) { this->TestAll(&unpack32); } +TEST_P(TestUnpack, Unpack64) { this->TestAll(&unpack64); } + +} // namespace arrow::internal