From 5816eaf34853f768353ae7f43be9267a8d7a221d Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Tue, 10 Jun 2025 16:09:41 +0200 Subject: [PATCH 01/19] Enable ByteStreamSplitDecodeSimd128<2> --- .../arrow/util/byte_stream_split_internal.h | 20 ++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/cpp/src/arrow/util/byte_stream_split_internal.h b/cpp/src/arrow/util/byte_stream_split_internal.h index d3214239ff9..7d091e38f83 100644 --- a/cpp/src/arrow/util/byte_stream_split_internal.h +++ b/cpp/src/arrow/util/byte_stream_split_internal.h @@ -39,6 +39,20 @@ namespace arrow::util::internal { // SIMD implementations // +template +constexpr T ReversePow2(T x) { + for (T n = 0, y = 1; n <= (8 * static_cast(sizeof(T))); ++n, y = y * 2) { + if (y == x) { + return n; + } + } + return 0; +} + +static_assert(ReversePow2(8) == 3); +static_assert(ReversePow2(4) == 2); +static_assert(ReversePow2(2) == 1); + #if defined(ARROW_HAVE_NEON) || defined(ARROW_HAVE_SSE4_2) template void ByteStreamSplitDecodeSimd128(const uint8_t* data, int width, int64_t num_values, @@ -46,8 +60,8 @@ void ByteStreamSplitDecodeSimd128(const uint8_t* data, int width, int64_t num_va using simd_batch = xsimd::make_sized_batch_t; assert(width == kNumStreams); - static_assert(kNumStreams == 4 || kNumStreams == 8, "Invalid number of streams."); - constexpr int kNumStreamsLog2 = (kNumStreams == 8 ? 
3 : 2); + constexpr int kNumStreamsLog2 = ReversePow2(kNumStreams); + static_assert(kNumStreamsLog2 != 0); constexpr int64_t kBlockSize = sizeof(simd_batch) * kNumStreams; const int64_t size = num_values * kNumStreams; @@ -579,7 +593,7 @@ inline void ByteStreamSplitDecode(const uint8_t* data, int width, int64_t num_va memcpy(out, data, num_values); return; case 2: - return ByteStreamSplitDecodeScalar<2>(data, width, num_values, stride, out); + return ByteStreamSplitDecodePerhapsSimd<2>(data, width, num_values, stride, out); case 4: return ByteStreamSplitDecodePerhapsSimd<4>(data, width, num_values, stride, out); case 8: From 0b68f4432e9210ac403c58270cdc40a481725f0e Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Wed, 11 Jun 2025 18:03:10 +0200 Subject: [PATCH 02/19] Refactor ByteStreamSplitEncodeSimd128 to be generic over num_streams --- .../arrow/util/byte_stream_split_internal.h | 170 +++++++++++------- 1 file changed, 102 insertions(+), 68 deletions(-) diff --git a/cpp/src/arrow/util/byte_stream_split_internal.h b/cpp/src/arrow/util/byte_stream_split_internal.h index 7d091e38f83..cd17e16bc46 100644 --- a/cpp/src/arrow/util/byte_stream_split_internal.h +++ b/cpp/src/arrow/util/byte_stream_split_internal.h @@ -109,6 +109,51 @@ void ByteStreamSplitDecodeSimd128(const uint8_t* data, int width, int64_t num_va } } +template +struct grouped_bytes_impl; + +template <> +struct grouped_bytes_impl<1> { + using type = int8_t; +}; + +template <> +struct grouped_bytes_impl<2> { + using type = int16_t; +}; + +template <> +struct grouped_bytes_impl<4> { + using type = int32_t; +}; + +template <> +struct grouped_bytes_impl<8> { + using type = int64_t; +}; + +// Map a number of bytes to a type +template +using grouped_bytes_t = typename grouped_bytes_impl::type; + +// Like xsimd::zlip_lo, but zip groups of NBytes at once +template > +auto zip_lo_n(Batch const& a, Batch const& b) -> Batch { + return xsimd::bitwise_cast( + xsimd::zip_lo(xsimd::bitwise_cast>(a), + 
xsimd::bitwise_cast>(b))); +} + +// Like xsimd::zlip_hi, but zip groups of NBytes at once +template > +auto zip_hi_n(Batch const& a, Batch const& b) -> Batch { + return xsimd::bitwise_cast( + xsimd::zip_hi(xsimd::bitwise_cast>(a), + xsimd::bitwise_cast>(b))); +} + template void ByteStreamSplitEncodeSimd128(const uint8_t* raw_values, int width, const int64_t num_values, uint8_t* output_buffer_raw) { @@ -118,9 +163,6 @@ void ByteStreamSplitEncodeSimd128(const uint8_t* raw_values, int width, static_assert(kNumStreams == 4 || kNumStreams == 8, "Invalid number of streams."); constexpr int kBlockSize = sizeof(simd_batch) * kNumStreams; - simd_batch stage[3][kNumStreams]; - simd_batch final_result[kNumStreams]; - const int64_t size = num_values * kNumStreams; const int64_t num_blocks = size / kBlockSize; int8_t* output_buffer_streams[kNumStreams]; @@ -137,92 +179,84 @@ void ByteStreamSplitEncodeSimd128(const uint8_t* raw_values, int width, output_buffer_raw[j * num_values + i] = byte_in_value; } } - // The current shuffling algorithm diverges for float and double types but the compiler - // should be able to remove the branch since only one path is taken for each template - // instantiation. - // Example run for 32-bit variables: - // Step 0: copy from unaligned input bytes: - // 0: ABCD ABCD ABCD ABCD 1: ABCD ABCD ABCD ABCD ... - // Step 1: simd_batch::zip_lo and simd_batch::zip_hi: - // 0: AABB CCDD AABB CCDD 1: AABB CCDD AABB CCDD ... - // Step 2: apply simd_batch::zip_lo and simd_batch::zip_hi again: - // 0: AAAA BBBB CCCC DDDD 1: AAAA BBBB CCCC DDDD ... - // Step 3: simd_batch::zip_lo and simd_batch::zip_hi: - // 0: AAAA AAAA BBBB BBBB 1: CCCC CCCC DDDD DDDD ... - // Step 4: simd_batch::zip_lo and simd_batch::zip_hi: - // 0: AAAA AAAA AAAA AAAA 1: BBBB BBBB BBBB BBBB ... 
+ + // Number of input values we can fit in a simd register + constexpr int NumValuesInBatch = sizeof(simd_batch) / kNumStreams; + static_assert(NumValuesInBatch > 0); + // Number of bytes we'll bring together in the first byte-level part of the algorithm. + // Since we zip with the next batch, the number of values in a batch determines how many + // bytes end up together before we can use a larger type + constexpr int NumBytes = 2 * NumValuesInBatch; + // Number of steps in the first part of the algorithm with byte-level zipping + constexpr int NumStepsByte = ReversePow2(NumValuesInBatch) + 1; + + simd_batch stage[NumStepsByte + 1][kNumStreams]; + + // Two step shuffling algorithm that starts with bytes and ends with a larger data type. + // An algorithm similar to the decoding one with log2(sizeof(simd_batch)) + 1 stages is + // also valid but not as performant. for (int64_t block_index = 0; block_index < num_blocks; ++block_index) { // First copy the data to stage 0. for (int i = 0; i < kNumStreams; ++i) { stage[0][i] = simd_batch::load_unaligned( - reinterpret_cast(raw_values) + - (block_index * kNumStreams + i) * sizeof(simd_batch)); + &raw_values[(block_index * kNumStreams + i) * sizeof(simd_batch)]); } + // We first make byte-level shuffling, until we have gather enough bytes together + // and in the correct order to use a bigger data type. + // + // clang-format off + // Stage 0: A0B0C0D0 A1B1C1D1 A2B2C2D2 A3B3C3D3 | A4B4C4D4 A5B5C5D5 A6B6C6D6 A7B7C7D7 | ... + // Stage 1: A0A4B0B4 C0C4D0D4 A1A5B1B5 C1C5D1D5 | A2A6B2B6 C2C6D2D6 A3A7B3B7 C3C7D3D7 | ... + // Stage 2: A0A2A4A6 B0B2B4B6 C0C2C4C6 D0D2D4D6 | A1A3A5A7 B1B3B5B7 C1C3C5C7 D1D3D5D7 | ... + // Stage 3: A0A1A2A3 A4A5A6A7 B0B1B2B3 B4B5B6B7 | C0C1C2C3 C4C5C6C7 D0D1D2D3 D4D5D6D7 | ... + // clang-format on + // // The shuffling of bytes is performed through the unpack intrinsics. // In my measurements this gives better performance then an implementation // which uses the shuffle intrinsics. 
- for (int stage_lvl = 0; stage_lvl < 2; ++stage_lvl) { - for (int i = 0; i < kNumStreams / 2; ++i) { - stage[stage_lvl + 1][i * 2] = - xsimd::zip_lo(stage[stage_lvl][i * 2], stage[stage_lvl][i * 2 + 1]); - stage[stage_lvl + 1][i * 2 + 1] = - xsimd::zip_hi(stage[stage_lvl][i * 2], stage[stage_lvl][i * 2 + 1]); + // + // Loop order does not matter so we prefer higher locality + for (int i = 0; i < kNumStreams / 2; ++i) { + for (int step = 0; step < NumStepsByte; ++step) { + stage[step + 1][i * 2] = + xsimd::zip_lo(stage[step][i * 2], stage[step][i * 2 + 1]); + stage[step + 1][i * 2 + 1] = + xsimd::zip_hi(stage[step][i * 2], stage[step][i * 2 + 1]); } } + + // We know have the bytes packed in a larger data type and in the correct order to + // start using a bigger data type + // + // Example run for 32-bit variables it's int64_t with NumBytes=8 bytes: + // + // clang-format off + // Stage 4: A0A1A2A3 A4A5A6A7 A8A9AAAB ACADAEAF | B0B1B2B3 B4B5B6B7 B8B9BABB BCBDBEBF | ... + // clang-format on + simd_batch final_result[kNumStreams]; if constexpr (kNumStreams == 8) { - // This is the path for 64bits data. 
simd_batch tmp[8]; - using int32_batch = xsimd::make_sized_batch_t; - // This is a workaround, see: https://github.com/xtensor-stack/xsimd/issues/735 - auto from_int32_batch = [](int32_batch from) -> simd_batch { - simd_batch dest; - memcpy(&dest, &from, sizeof(simd_batch)); - return dest; - }; - auto to_int32_batch = [](simd_batch from) -> int32_batch { - int32_batch dest; - memcpy(&dest, &from, sizeof(simd_batch)); - return dest; - }; for (int i = 0; i < 4; ++i) { - tmp[i * 2] = from_int32_batch( - xsimd::zip_lo(to_int32_batch(stage[2][i]), to_int32_batch(stage[2][i + 4]))); - tmp[i * 2 + 1] = from_int32_batch( - xsimd::zip_hi(to_int32_batch(stage[2][i]), to_int32_batch(stage[2][i + 4]))); + tmp[i * 2] = + zip_lo_n(stage[NumStepsByte][i], stage[NumStepsByte][i + 4]); + tmp[i * 2 + 1] = + zip_hi_n(stage[NumStepsByte][i], stage[NumStepsByte][i + 4]); } for (int i = 0; i < 4; ++i) { - final_result[i * 2] = from_int32_batch( - xsimd::zip_lo(to_int32_batch(tmp[i]), to_int32_batch(tmp[i + 4]))); - final_result[i * 2 + 1] = from_int32_batch( - xsimd::zip_hi(to_int32_batch(tmp[i]), to_int32_batch(tmp[i + 4]))); + final_result[i * 2] = zip_lo_n(tmp[i], tmp[i + 4]); + final_result[i * 2 + 1] = zip_hi_n(tmp[i], tmp[i + 4]); } } else { - // This is the path for 32bits data. 
- using int64_batch = xsimd::make_sized_batch_t; - // This is a workaround, see: https://github.com/xtensor-stack/xsimd/issues/735 - auto from_int64_batch = [](int64_batch from) -> simd_batch { - simd_batch dest; - memcpy(&dest, &from, sizeof(simd_batch)); - return dest; - }; - auto to_int64_batch = [](simd_batch from) -> int64_batch { - int64_batch dest; - memcpy(&dest, &from, sizeof(simd_batch)); - return dest; - }; - simd_batch tmp[4]; for (int i = 0; i < 2; ++i) { - tmp[i * 2] = xsimd::zip_lo(stage[2][i * 2], stage[2][i * 2 + 1]); - tmp[i * 2 + 1] = xsimd::zip_hi(stage[2][i * 2], stage[2][i * 2 + 1]); - } - for (int i = 0; i < 2; ++i) { - final_result[i * 2] = from_int64_batch( - xsimd::zip_lo(to_int64_batch(tmp[i]), to_int64_batch(tmp[i + 2]))); - final_result[i * 2 + 1] = from_int64_batch( - xsimd::zip_hi(to_int64_batch(tmp[i]), to_int64_batch(tmp[i + 2]))); + final_result[i * 2] = + zip_lo_n(stage[NumStepsByte][i], stage[NumStepsByte][i + 2]); + final_result[i * 2 + 1] = + zip_hi_n(stage[NumStepsByte][i], stage[NumStepsByte][i + 2]); } } + + // Save the encoded data to the output buffer for (int i = 0; i < kNumStreams; ++i) { xsimd::store_unaligned(&output_buffer_streams[i][block_index * sizeof(simd_batch)], final_result[i]); From 82ac6dcbb222ddc77a245f16ac85251ee5a64bba Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Thu, 12 Jun 2025 09:51:34 +0200 Subject: [PATCH 03/19] Remove disjunction in ByteStreamSplitEncodeSimd128 --- .../arrow/util/byte_stream_split_internal.h | 36 ++++++++----------- 1 file changed, 14 insertions(+), 22 deletions(-) diff --git a/cpp/src/arrow/util/byte_stream_split_internal.h b/cpp/src/arrow/util/byte_stream_split_internal.h index cd17e16bc46..4b5cdcaf035 100644 --- a/cpp/src/arrow/util/byte_stream_split_internal.h +++ b/cpp/src/arrow/util/byte_stream_split_internal.h @@ -189,13 +189,17 @@ void ByteStreamSplitEncodeSimd128(const uint8_t* raw_values, int width, constexpr int NumBytes = 2 * NumValuesInBatch; // Number of steps in the 
first part of the algorithm with byte-level zipping constexpr int NumStepsByte = ReversePow2(NumValuesInBatch) + 1; - - simd_batch stage[NumStepsByte + 1][kNumStreams]; + // Number of steps in the first part of the algorithm with large data type zipping + constexpr int NumStepsLarge = ReversePow2(sizeof(simd_batch) / NumBytes); + // Total number of steps + constexpr int NumSteps = NumStepsByte + NumStepsLarge; // Two step shuffling algorithm that starts with bytes and ends with a larger data type. // An algorithm similar to the decoding one with log2(sizeof(simd_batch)) + 1 stages is // also valid but not as performant. for (int64_t block_index = 0; block_index < num_blocks; ++block_index) { + simd_batch stage[NumSteps + 1][kNumStreams]; + // First copy the data to stage 0. for (int i = 0; i < kNumStreams; ++i) { stage[0][i] = simd_batch::load_unaligned( @@ -234,32 +238,20 @@ void ByteStreamSplitEncodeSimd128(const uint8_t* raw_values, int width, // clang-format off // Stage 4: A0A1A2A3 A4A5A6A7 A8A9AAAB ACADAEAF | B0B1B2B3 B4B5B6B7 B8B9BABB BCBDBEBF | ... 
// clang-format on - simd_batch final_result[kNumStreams]; - if constexpr (kNumStreams == 8) { - simd_batch tmp[8]; - for (int i = 0; i < 4; ++i) { - tmp[i * 2] = - zip_lo_n(stage[NumStepsByte][i], stage[NumStepsByte][i + 4]); - tmp[i * 2 + 1] = - zip_hi_n(stage[NumStepsByte][i], stage[NumStepsByte][i + 4]); - } - for (int i = 0; i < 4; ++i) { - final_result[i * 2] = zip_lo_n(tmp[i], tmp[i + 4]); - final_result[i * 2 + 1] = zip_hi_n(tmp[i], tmp[i + 4]); - } - } else { - for (int i = 0; i < 2; ++i) { - final_result[i * 2] = - zip_lo_n(stage[NumStepsByte][i], stage[NumStepsByte][i + 2]); - final_result[i * 2 + 1] = - zip_hi_n(stage[NumStepsByte][i], stage[NumStepsByte][i + 2]); + constexpr int kNumStreamsHalf = kNumStreams / 2; + for (int step = NumStepsByte; step < NumSteps; ++step) { + for (int i = 0; i < kNumStreamsHalf; ++i) { + stage[step + 1][i * 2] = + zip_lo_n(stage[step][i], stage[step][i + kNumStreamsHalf]); + stage[step + 1][i * 2 + 1] = + zip_hi_n(stage[step][i], stage[step][i + kNumStreamsHalf]); } } // Save the encoded data to the output buffer for (int i = 0; i < kNumStreams; ++i) { xsimd::store_unaligned(&output_buffer_streams[i][block_index * sizeof(simd_batch)], - final_result[i]); + stage[NumSteps][i]); } } } From 2ebe8a5a0bdd1587978327e231383bc0ee14883e Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Thu, 12 Jun 2025 10:10:03 +0200 Subject: [PATCH 04/19] Enable ByteStreamSplitEncodeSimd128<2> --- .../arrow/util/byte_stream_split_internal.h | 25 ++++++++++++------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/cpp/src/arrow/util/byte_stream_split_internal.h b/cpp/src/arrow/util/byte_stream_split_internal.h index 4b5cdcaf035..0af5e850600 100644 --- a/cpp/src/arrow/util/byte_stream_split_internal.h +++ b/cpp/src/arrow/util/byte_stream_split_internal.h @@ -140,18 +140,26 @@ using grouped_bytes_t = typename grouped_bytes_impl::type; template > auto zip_lo_n(Batch const& a, Batch const& b) -> Batch { - return xsimd::bitwise_cast( - 
xsimd::zip_lo(xsimd::bitwise_cast>(a), - xsimd::bitwise_cast>(b))); + if constexpr (NBytes == BatchSize) { + return a; + } else { + return xsimd::bitwise_cast( + xsimd::zip_lo(xsimd::bitwise_cast>(a), + xsimd::bitwise_cast>(b))); + } } // Like xsimd::zlip_hi, but zip groups of NBytes at once template > auto zip_hi_n(Batch const& a, Batch const& b) -> Batch { - return xsimd::bitwise_cast( - xsimd::zip_hi(xsimd::bitwise_cast>(a), - xsimd::bitwise_cast>(b))); + if constexpr (NBytes == BatchSize) { + return b; + } else { + return xsimd::bitwise_cast( + xsimd::zip_hi(xsimd::bitwise_cast>(a), + xsimd::bitwise_cast>(b))); + } } template @@ -160,7 +168,6 @@ void ByteStreamSplitEncodeSimd128(const uint8_t* raw_values, int width, using simd_batch = xsimd::make_sized_batch_t; assert(width == kNumStreams); - static_assert(kNumStreams == 4 || kNumStreams == 8, "Invalid number of streams."); constexpr int kBlockSize = sizeof(simd_batch) * kNumStreams; const int64_t size = num_values * kNumStreams; @@ -595,7 +602,7 @@ inline void ByteStreamSplitEncode(const uint8_t* raw_values, int width, memcpy(out, raw_values, num_values); return; case 2: - return ByteStreamSplitEncodeScalar<2>(raw_values, width, num_values, out); + return ByteStreamSplitEncodeSimd128<2>(raw_values, width, num_values, out); case 4: return ByteStreamSplitEncodePerhapsSimd<4>(raw_values, width, num_values, out); case 8: @@ -619,7 +626,7 @@ inline void ByteStreamSplitDecode(const uint8_t* data, int width, int64_t num_va memcpy(out, data, num_values); return; case 2: - return ByteStreamSplitDecodePerhapsSimd<2>(data, width, num_values, stride, out); + return ByteStreamSplitDecodeSimd128<2>(data, width, num_values, stride, out); case 4: return ByteStreamSplitDecodePerhapsSimd<4>(data, width, num_values, stride, out); case 8: From ca35643e5f217f3ad0451bcaec364fc2ddbcfd5b Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Thu, 12 Jun 2025 10:49:48 +0200 Subject: [PATCH 05/19] Fix conversion warning --- 
cpp/src/arrow/util/byte_stream_split_internal.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/arrow/util/byte_stream_split_internal.h b/cpp/src/arrow/util/byte_stream_split_internal.h index 0af5e850600..40a4632434e 100644 --- a/cpp/src/arrow/util/byte_stream_split_internal.h +++ b/cpp/src/arrow/util/byte_stream_split_internal.h @@ -197,7 +197,7 @@ void ByteStreamSplitEncodeSimd128(const uint8_t* raw_values, int width, // Number of steps in the first part of the algorithm with byte-level zipping constexpr int NumStepsByte = ReversePow2(NumValuesInBatch) + 1; // Number of steps in the first part of the algorithm with large data type zipping - constexpr int NumStepsLarge = ReversePow2(sizeof(simd_batch) / NumBytes); + constexpr int NumStepsLarge = ReversePow2(static_cast(sizeof(simd_batch)) / NumBytes); // Total number of steps constexpr int NumSteps = NumStepsByte + NumStepsLarge; From 17fe80b5ef6d9c8a9cffecac9df3303f1d129e5f Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Thu, 12 Jun 2025 11:49:13 +0200 Subject: [PATCH 06/19] Fmt --- cpp/src/arrow/util/byte_stream_split_internal.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cpp/src/arrow/util/byte_stream_split_internal.h b/cpp/src/arrow/util/byte_stream_split_internal.h index 40a4632434e..66927b60806 100644 --- a/cpp/src/arrow/util/byte_stream_split_internal.h +++ b/cpp/src/arrow/util/byte_stream_split_internal.h @@ -197,7 +197,8 @@ void ByteStreamSplitEncodeSimd128(const uint8_t* raw_values, int width, // Number of steps in the first part of the algorithm with byte-level zipping constexpr int NumStepsByte = ReversePow2(NumValuesInBatch) + 1; // Number of steps in the first part of the algorithm with large data type zipping - constexpr int NumStepsLarge = ReversePow2(static_cast(sizeof(simd_batch)) / NumBytes); + constexpr int NumStepsLarge = + ReversePow2(static_cast(sizeof(simd_batch)) / NumBytes); // Total number of steps constexpr int NumSteps = NumStepsByte + 
NumStepsLarge; From dae5965b5537f2efac41a48f9083ca18a9ff611d Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Thu, 12 Jun 2025 12:29:36 +0200 Subject: [PATCH 07/19] Shorten comments --- .../arrow/util/byte_stream_split_internal.h | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/cpp/src/arrow/util/byte_stream_split_internal.h b/cpp/src/arrow/util/byte_stream_split_internal.h index 66927b60806..151566bd932 100644 --- a/cpp/src/arrow/util/byte_stream_split_internal.h +++ b/cpp/src/arrow/util/byte_stream_split_internal.h @@ -217,12 +217,12 @@ void ByteStreamSplitEncodeSimd128(const uint8_t* raw_values, int width, // We first make byte-level shuffling, until we have gather enough bytes together // and in the correct order to use a bigger data type. // - // clang-format off - // Stage 0: A0B0C0D0 A1B1C1D1 A2B2C2D2 A3B3C3D3 | A4B4C4D4 A5B5C5D5 A6B6C6D6 A7B7C7D7 | ... - // Stage 1: A0A4B0B4 C0C4D0D4 A1A5B1B5 C1C5D1D5 | A2A6B2B6 C2C6D2D6 A3A7B3B7 C3C7D3D7 | ... - // Stage 2: A0A2A4A6 B0B2B4B6 C0C2C4C6 D0D2D4D6 | A1A3A5A7 B1B3B5B7 C1C3C5C7 D1D3D5D7 | ... - // Stage 3: A0A1A2A3 A4A5A6A7 B0B1B2B3 B4B5B6B7 | C0C1C2C3 C4C5C6C7 D0D1D2D3 D4D5D6D7 | ... - // clang-format on + // Example with 32bit data on 128 bit register: + // + // 0: A0B0C0D0 A1B1C1D1 A2B2C2D2 A3B3C3D3 | A4B4C4D4 A5B5C5D5 A6B6C6D6 A7B7C7D7 | ... + // 1: A0A4B0B4 C0C4D0D4 A1A5B1B5 C1C5D1D5 | A2A6B2B6 C2C6D2D6 A3A7B3B7 C3C7D3D7 | ... + // 2: A0A2A4A6 B0B2B4B6 C0C2C4C6 D0D2D4D6 | A1A3A5A7 B1B3B5B7 C1C3C5C7 D1D3D5D7 | ... + // 3: A0A1A2A3 A4A5A6A7 B0B1B2B3 B4B5B6B7 | C0C1C2C3 C4C5C6C7 D0D1D2D3 D4D5D6D7 | ... // // The shuffling of bytes is performed through the unpack intrinsics. 
// In my measurements this gives better performance then an implementation @@ -241,11 +241,10 @@ void ByteStreamSplitEncodeSimd128(const uint8_t* raw_values, int width, // We know have the bytes packed in a larger data type and in the correct order to // start using a bigger data type // - // Example run for 32-bit variables it's int64_t with NumBytes=8 bytes: + // Example with 32bit data on 128 bit register. + // The large data type is int64_t with NumBytes=8 bytes: // - // clang-format off - // Stage 4: A0A1A2A3 A4A5A6A7 A8A9AAAB ACADAEAF | B0B1B2B3 B4B5B6B7 B8B9BABB BCBDBEBF | ... - // clang-format on + // 4: A0A1A2A3 A4A5A6A7 A8A9AAAB ACADAEAF | B0B1B2B3 B4B5B6B7 B8B9BABB BCBDBEBF | ... constexpr int kNumStreamsHalf = kNumStreams / 2; for (int step = NumStepsByte; step < NumSteps; ++step) { for (int i = 0; i < kNumStreamsHalf; ++i) { From df3cc7fdcd85abfb1f1c767c3bc8d1fd7134f6b4 Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Thu, 12 Jun 2025 14:00:07 +0200 Subject: [PATCH 08/19] Use kPascalCase for constants --- .../arrow/util/byte_stream_split_internal.h | 48 +++++++++---------- 1 file changed, 24 insertions(+), 24 deletions(-) diff --git a/cpp/src/arrow/util/byte_stream_split_internal.h b/cpp/src/arrow/util/byte_stream_split_internal.h index 151566bd932..c433e869783 100644 --- a/cpp/src/arrow/util/byte_stream_split_internal.h +++ b/cpp/src/arrow/util/byte_stream_split_internal.h @@ -109,7 +109,7 @@ void ByteStreamSplitDecodeSimd128(const uint8_t* data, int width, int64_t num_va } } -template +template struct grouped_bytes_impl; template <> @@ -133,32 +133,32 @@ struct grouped_bytes_impl<8> { }; // Map a number of bytes to a type -template -using grouped_bytes_t = typename grouped_bytes_impl::type; +template +using grouped_bytes_t = typename grouped_bytes_impl::type; // Like xsimd::zlip_lo, but zip groups of NBytes at once -template > +template > auto zip_lo_n(Batch const& a, Batch const& b) -> Batch { - if constexpr (NBytes == BatchSize) { + if constexpr 
(kNumBytes == kBatchSize) { return a; } else { return xsimd::bitwise_cast( - xsimd::zip_lo(xsimd::bitwise_cast>(a), - xsimd::bitwise_cast>(b))); + xsimd::zip_lo(xsimd::bitwise_cast>(a), + xsimd::bitwise_cast>(b))); } } // Like xsimd::zlip_hi, but zip groups of NBytes at once -template > +template > auto zip_hi_n(Batch const& a, Batch const& b) -> Batch { - if constexpr (NBytes == BatchSize) { + if constexpr (kNumBytes == kBatchSize) { return b; } else { return xsimd::bitwise_cast( - xsimd::zip_hi(xsimd::bitwise_cast>(a), - xsimd::bitwise_cast>(b))); + xsimd::zip_hi(xsimd::bitwise_cast>(a), + xsimd::bitwise_cast>(b))); } } @@ -188,19 +188,19 @@ void ByteStreamSplitEncodeSimd128(const uint8_t* raw_values, int width, } // Number of input values we can fit in a simd register - constexpr int NumValuesInBatch = sizeof(simd_batch) / kNumStreams; - static_assert(NumValuesInBatch > 0); + constexpr int kNumValuesInBatch = sizeof(simd_batch) / kNumStreams; + static_assert(kNumValuesInBatch > 0); // Number of bytes we'll bring together in the first byte-level part of the algorithm. 
// Since we zip with the next batch, the number of values in a batch determines how many // bytes end up together before we can use a larger type - constexpr int NumBytes = 2 * NumValuesInBatch; + constexpr int kNumBytes = 2 * kNumValuesInBatch; // Number of steps in the first part of the algorithm with byte-level zipping - constexpr int NumStepsByte = ReversePow2(NumValuesInBatch) + 1; + constexpr int kNumStepsByte = ReversePow2(kNumValuesInBatch) + 1; // Number of steps in the first part of the algorithm with large data type zipping - constexpr int NumStepsLarge = - ReversePow2(static_cast(sizeof(simd_batch)) / NumBytes); + constexpr int kNumStepsLarge = + ReversePow2(static_cast(sizeof(simd_batch)) / kNumBytes); // Total number of steps - constexpr int NumSteps = NumStepsByte + NumStepsLarge; + constexpr int NumSteps = kNumStepsByte + kNumStepsLarge; // Two step shuffling algorithm that starts with bytes and ends with a larger data type. // An algorithm similar to the decoding one with log2(sizeof(simd_batch)) + 1 stages is @@ -230,7 +230,7 @@ void ByteStreamSplitEncodeSimd128(const uint8_t* raw_values, int width, // // Loop order does not matter so we prefer higher locality for (int i = 0; i < kNumStreams / 2; ++i) { - for (int step = 0; step < NumStepsByte; ++step) { + for (int step = 0; step < kNumStepsByte; ++step) { stage[step + 1][i * 2] = xsimd::zip_lo(stage[step][i * 2], stage[step][i * 2 + 1]); stage[step + 1][i * 2 + 1] = @@ -246,12 +246,12 @@ void ByteStreamSplitEncodeSimd128(const uint8_t* raw_values, int width, // // 4: A0A1A2A3 A4A5A6A7 A8A9AAAB ACADAEAF | B0B1B2B3 B4B5B6B7 B8B9BABB BCBDBEBF | ... 
constexpr int kNumStreamsHalf = kNumStreams / 2; - for (int step = NumStepsByte; step < NumSteps; ++step) { + for (int step = kNumStepsByte; step < NumSteps; ++step) { for (int i = 0; i < kNumStreamsHalf; ++i) { stage[step + 1][i * 2] = - zip_lo_n(stage[step][i], stage[step][i + kNumStreamsHalf]); + zip_lo_n(stage[step][i], stage[step][i + kNumStreamsHalf]); stage[step + 1][i * 2 + 1] = - zip_hi_n(stage[step][i], stage[step][i + kNumStreamsHalf]); + zip_hi_n(stage[step][i], stage[step][i + kNumStreamsHalf]); } } From d4fb898947cc0d52884701cec8768be58f42f6ab Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Thu, 12 Jun 2025 14:08:14 +0200 Subject: [PATCH 09/19] Use static enabling of ByteStreamSplitEncodeSimd128<2> --- cpp/src/arrow/util/byte_stream_split_internal.h | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/util/byte_stream_split_internal.h b/cpp/src/arrow/util/byte_stream_split_internal.h index c433e869783..f4c35a6f616 100644 --- a/cpp/src/arrow/util/byte_stream_split_internal.h +++ b/cpp/src/arrow/util/byte_stream_split_internal.h @@ -431,6 +431,10 @@ template void inline ByteStreamSplitDecodeSimd(const uint8_t* data, int width, int64_t num_values, int64_t stride, uint8_t* out) { # if defined(ARROW_HAVE_AVX2) + // Not implemented + if constexpr (kNumStreams == 2) { + return ByteStreamSplitDecodeSimd128<2>(data, width, num_values, stride, out); + } return ByteStreamSplitDecodeAvx2(data, width, num_values, stride, out); # elif defined(ARROW_HAVE_SSE4_2) || defined(ARROW_HAVE_NEON) return ByteStreamSplitDecodeSimd128(data, width, num_values, stride, out); @@ -444,6 +448,11 @@ void inline ByteStreamSplitEncodeSimd(const uint8_t* raw_values, int width, const int64_t num_values, uint8_t* output_buffer_raw) { # if defined(ARROW_HAVE_AVX2) + // Not implemented + if constexpr (kNumStreams == 2) { + return ByteStreamSplitEncodeSimd128(raw_values, width, num_values, + output_buffer_raw); + } return 
ByteStreamSplitEncodeAvx2(raw_values, width, num_values, output_buffer_raw); # elif defined(ARROW_HAVE_SSE4_2) || defined(ARROW_HAVE_NEON) @@ -602,7 +611,7 @@ inline void ByteStreamSplitEncode(const uint8_t* raw_values, int width, memcpy(out, raw_values, num_values); return; case 2: - return ByteStreamSplitEncodeSimd128<2>(raw_values, width, num_values, out); + return ByteStreamSplitEncodePerhapsSimd<2>(raw_values, width, num_values, out); case 4: return ByteStreamSplitEncodePerhapsSimd<4>(raw_values, width, num_values, out); case 8: @@ -626,7 +635,7 @@ inline void ByteStreamSplitDecode(const uint8_t* data, int width, int64_t num_va memcpy(out, data, num_values); return; case 2: - return ByteStreamSplitDecodeSimd128<2>(data, width, num_values, stride, out); + return ByteStreamSplitDecodePerhapsSimd<2>(data, width, num_values, stride, out); case 4: return ByteStreamSplitDecodePerhapsSimd<4>(data, width, num_values, stride, out); case 8: From 2f37deb8f9e13d9bdb3c0f6409a3535e3f344752 Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Thu, 12 Jun 2025 14:16:39 +0200 Subject: [PATCH 10/19] Safer computation of simd batch size --- .../arrow/util/byte_stream_split_internal.h | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/cpp/src/arrow/util/byte_stream_split_internal.h b/cpp/src/arrow/util/byte_stream_split_internal.h index f4c35a6f616..71f51c5dabc 100644 --- a/cpp/src/arrow/util/byte_stream_split_internal.h +++ b/cpp/src/arrow/util/byte_stream_split_internal.h @@ -62,7 +62,7 @@ void ByteStreamSplitDecodeSimd128(const uint8_t* data, int width, int64_t num_va assert(width == kNumStreams); constexpr int kNumStreamsLog2 = ReversePow2(kNumStreams); static_assert(kNumStreamsLog2 != 0); - constexpr int64_t kBlockSize = sizeof(simd_batch) * kNumStreams; + constexpr int64_t kBlockSize = simd_batch::size * kNumStreams; const int64_t size = num_values * kNumStreams; const int64_t num_blocks = size / kBlockSize; @@ -90,8 +90,7 @@ void 
ByteStreamSplitDecodeSimd128(const uint8_t* data, int width, int64_t num_va for (int64_t i = 0; i < num_blocks; ++i) { for (int j = 0; j < kNumStreams; ++j) { - stage[0][j] = - simd_batch::load_unaligned(&data[i * sizeof(simd_batch) + j * stride]); + stage[0][j] = simd_batch::load_unaligned(&data[i * simd_batch::size + j * stride]); } for (int step = 0; step < kNumStreamsLog2; ++step) { for (int j = 0; j < kNumStreamsHalf; ++j) { @@ -103,7 +102,7 @@ void ByteStreamSplitDecodeSimd128(const uint8_t* data, int width, int64_t num_va } for (int j = 0; j < kNumStreams; ++j) { xsimd::store_unaligned( - reinterpret_cast(out + (i * kNumStreams + j) * sizeof(simd_batch)), + reinterpret_cast(out + (i * kNumStreams + j) * simd_batch::size), stage[kNumStreamsLog2][j]); } } @@ -168,7 +167,7 @@ void ByteStreamSplitEncodeSimd128(const uint8_t* raw_values, int width, using simd_batch = xsimd::make_sized_batch_t; assert(width == kNumStreams); - constexpr int kBlockSize = sizeof(simd_batch) * kNumStreams; + constexpr int kBlockSize = simd_batch::size * kNumStreams; const int64_t size = num_values * kNumStreams; const int64_t num_blocks = size / kBlockSize; @@ -188,7 +187,7 @@ void ByteStreamSplitEncodeSimd128(const uint8_t* raw_values, int width, } // Number of input values we can fit in a simd register - constexpr int kNumValuesInBatch = sizeof(simd_batch) / kNumStreams; + constexpr int kNumValuesInBatch = simd_batch::size / kNumStreams; static_assert(kNumValuesInBatch > 0); // Number of bytes we'll bring together in the first byte-level part of the algorithm. 
// Since we zip with the next batch, the number of values in a batch determines how many @@ -198,12 +197,12 @@ void ByteStreamSplitEncodeSimd128(const uint8_t* raw_values, int width, constexpr int kNumStepsByte = ReversePow2(kNumValuesInBatch) + 1; // Number of steps in the first part of the algorithm with large data type zipping constexpr int kNumStepsLarge = - ReversePow2(static_cast(sizeof(simd_batch)) / kNumBytes); + ReversePow2(static_cast(simd_batch::size) / kNumBytes); // Total number of steps constexpr int NumSteps = kNumStepsByte + kNumStepsLarge; // Two step shuffling algorithm that starts with bytes and ends with a larger data type. - // An algorithm similar to the decoding one with log2(sizeof(simd_batch)) + 1 stages is + // An algorithm similar to the decoding one with log2(simd_batch::size) + 1 stages is // also valid but not as performant. for (int64_t block_index = 0; block_index < num_blocks; ++block_index) { simd_batch stage[NumSteps + 1][kNumStreams]; @@ -211,7 +210,7 @@ void ByteStreamSplitEncodeSimd128(const uint8_t* raw_values, int width, // First copy the data to stage 0. 
for (int i = 0; i < kNumStreams; ++i) { stage[0][i] = simd_batch::load_unaligned( - &raw_values[(block_index * kNumStreams + i) * sizeof(simd_batch)]); + &raw_values[(block_index * kNumStreams + i) * simd_batch::size]); } // We first make byte-level shuffling, until we have gather enough bytes together @@ -257,7 +256,7 @@ void ByteStreamSplitEncodeSimd128(const uint8_t* raw_values, int width, // Save the encoded data to the output buffer for (int i = 0; i < kNumStreams; ++i) { - xsimd::store_unaligned(&output_buffer_streams[i][block_index * sizeof(simd_batch)], + xsimd::store_unaligned(&output_buffer_streams[i][block_index * simd_batch::size], stage[NumSteps][i]); } } From a7ff16a2c85df3d24b82cc36ab52a97315334a72 Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Thu, 12 Jun 2025 14:18:57 +0200 Subject: [PATCH 11/19] Fix failed compilation --- cpp/src/arrow/util/byte_stream_split_internal.h | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/cpp/src/arrow/util/byte_stream_split_internal.h b/cpp/src/arrow/util/byte_stream_split_internal.h index 71f51c5dabc..023f1e1c7cf 100644 --- a/cpp/src/arrow/util/byte_stream_split_internal.h +++ b/cpp/src/arrow/util/byte_stream_split_internal.h @@ -433,8 +433,9 @@ void inline ByteStreamSplitDecodeSimd(const uint8_t* data, int width, int64_t nu // Not implemented if constexpr (kNumStreams == 2) { return ByteStreamSplitDecodeSimd128<2>(data, width, num_values, stride, out); + } else { + return ByteStreamSplitDecodeAvx2(data, width, num_values, stride, out); } - return ByteStreamSplitDecodeAvx2(data, width, num_values, stride, out); # elif defined(ARROW_HAVE_SSE4_2) || defined(ARROW_HAVE_NEON) return ByteStreamSplitDecodeSimd128(data, width, num_values, stride, out); # else @@ -451,9 +452,10 @@ void inline ByteStreamSplitEncodeSimd(const uint8_t* raw_values, int width, if constexpr (kNumStreams == 2) { return ByteStreamSplitEncodeSimd128(raw_values, width, num_values, output_buffer_raw); + } else { + return 
ByteStreamSplitEncodeAvx2(raw_values, width, num_values, + output_buffer_raw); } - return ByteStreamSplitEncodeAvx2(raw_values, width, num_values, - output_buffer_raw); # elif defined(ARROW_HAVE_SSE4_2) || defined(ARROW_HAVE_NEON) return ByteStreamSplitEncodeSimd128(raw_values, width, num_values, output_buffer_raw); From 83ebd5caf439d08a572c6b235ca92a8717e57c66 Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Fri, 13 Jun 2025 11:11:58 +0200 Subject: [PATCH 12/19] Small Encode improvement --- cpp/src/arrow/util/byte_stream_split_internal.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/arrow/util/byte_stream_split_internal.h b/cpp/src/arrow/util/byte_stream_split_internal.h index 023f1e1c7cf..5223c9f2a95 100644 --- a/cpp/src/arrow/util/byte_stream_split_internal.h +++ b/cpp/src/arrow/util/byte_stream_split_internal.h @@ -85,10 +85,10 @@ void ByteStreamSplitDecodeSimd128(const uint8_t* data, int width, int64_t num_va // Stage 1: AAAA BBBB CCCC DDDD // Stage 2: ACAC ACAC BDBD BDBD // Stage 3: ABCD ABCD ABCD ABCD - simd_batch stage[kNumStreamsLog2 + 1][kNumStreams]; constexpr int kNumStreamsHalf = kNumStreams / 2U; for (int64_t i = 0; i < num_blocks; ++i) { + simd_batch stage[kNumStreamsLog2 + 1][kNumStreams]; for (int j = 0; j < kNumStreams; ++j) { stage[0][j] = simd_batch::load_unaligned(&data[i * simd_batch::size + j * stride]); } From 59ddd69f6e776e8aed3dad7a5f1f189b8be1fae5 Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Mon, 16 Jun 2025 11:58:23 +0200 Subject: [PATCH 13/19] Add int16_t benchmarks --- cpp/src/parquet/encoding_benchmark.cc | 37 +++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/cpp/src/parquet/encoding_benchmark.cc b/cpp/src/parquet/encoding_benchmark.cc index 28cfcd98a3c..6392f533785 100644 --- a/cpp/src/parquet/encoding_benchmark.cc +++ b/cpp/src/parquet/encoding_benchmark.cc @@ -19,6 +19,7 @@ #include #include +#include #include #include @@ -508,6 +509,11 @@ 
BENCHMARK(BM_ByteStreamSplitEncode_Float_Scalar)->Apply(ByteStreamSplitApply); BENCHMARK(BM_ByteStreamSplitEncode_Double_Scalar)->Apply(ByteStreamSplitApply); #if defined(ARROW_HAVE_SSE4_2) +static void BM_ByteStreamSplitDecode_Int16_Sse2(benchmark::State& state) { + BM_ByteStreamSplitDecode( + state, ::arrow::util::internal::ByteStreamSplitDecodeSimd128); +} + static void BM_ByteStreamSplitDecode_Float_Sse2(benchmark::State& state) { BM_ByteStreamSplitDecode( state, ::arrow::util::internal::ByteStreamSplitDecodeSimd128); @@ -518,6 +524,11 @@ static void BM_ByteStreamSplitDecode_Double_Sse2(benchmark::State& state) { state, ::arrow::util::internal::ByteStreamSplitDecodeSimd128); } +static void BM_ByteStreamSplitEncode_Int16_Sse2(benchmark::State& state) { + BM_ByteStreamSplitEncode( + state, ::arrow::util::internal::ByteStreamSplitEncodeSimd128); +} + static void BM_ByteStreamSplitEncode_Float_Sse2(benchmark::State& state) { BM_ByteStreamSplitEncode( state, ::arrow::util::internal::ByteStreamSplitEncodeSimd128); @@ -528,13 +539,20 @@ static void BM_ByteStreamSplitEncode_Double_Sse2(benchmark::State& state) { state, ::arrow::util::internal::ByteStreamSplitEncodeSimd128); } +BENCHMARK(BM_ByteStreamSplitDecode_Int16_Sse2)->Apply(ByteStreamSplitApply); BENCHMARK(BM_ByteStreamSplitDecode_Float_Sse2)->Apply(ByteStreamSplitApply); BENCHMARK(BM_ByteStreamSplitDecode_Double_Sse2)->Apply(ByteStreamSplitApply); +BENCHMARK(BM_ByteStreamSplitEncode_Int16_Sse2)->Apply(ByteStreamSplitApply); BENCHMARK(BM_ByteStreamSplitEncode_Float_Sse2)->Apply(ByteStreamSplitApply); BENCHMARK(BM_ByteStreamSplitEncode_Double_Sse2)->Apply(ByteStreamSplitApply); #endif #if defined(ARROW_HAVE_AVX2) +static void BM_ByteStreamSplitDecode_Int16_Avx2(benchmark::State& state) { + BM_ByteStreamSplitDecode( + state, ::arrow::util::internal::ByteStreamSplitDecodeAvx2); +} + static void BM_ByteStreamSplitDecode_Float_Avx2(benchmark::State& state) { BM_ByteStreamSplitDecode( state, 
::arrow::util::internal::ByteStreamSplitDecodeAvx2); @@ -545,6 +563,11 @@ static void BM_ByteStreamSplitDecode_Double_Avx2(benchmark::State& state) { state, ::arrow::util::internal::ByteStreamSplitDecodeAvx2); } +static void BM_ByteStreamSplitEncode_Int16_Avx2(benchmark::State& state) { + BM_ByteStreamSplitEncode( + state, ::arrow::util::internal::ByteStreamSplitEncodeAvx2); +} + static void BM_ByteStreamSplitEncode_Float_Avx2(benchmark::State& state) { BM_ByteStreamSplitEncode( state, ::arrow::util::internal::ByteStreamSplitEncodeAvx2); @@ -555,13 +578,20 @@ static void BM_ByteStreamSplitEncode_Double_Avx2(benchmark::State& state) { state, ::arrow::util::internal::ByteStreamSplitEncodeAvx2); } +BENCHMARK(BM_ByteStreamSplitDecode_Int16_Avx2)->Apply(ByteStreamSplitApply); BENCHMARK(BM_ByteStreamSplitDecode_Float_Avx2)->Apply(ByteStreamSplitApply); BENCHMARK(BM_ByteStreamSplitDecode_Double_Avx2)->Apply(ByteStreamSplitApply); +BENCHMARK(BM_ByteStreamSplitEncode_Int16_Avx2)->Apply(ByteStreamSplitApply); BENCHMARK(BM_ByteStreamSplitEncode_Float_Avx2)->Apply(ByteStreamSplitApply); BENCHMARK(BM_ByteStreamSplitEncode_Double_Avx2)->Apply(ByteStreamSplitApply); #endif #if defined(ARROW_HAVE_NEON) +static void BM_ByteStreamSplitDecode_Int16_Neon(benchmark::State& state) { + BM_ByteStreamSplitDecode( + state, ::arrow::util::internal::ByteStreamSplitDecodeSimd128); +} + static void BM_ByteStreamSplitDecode_Float_Neon(benchmark::State& state) { BM_ByteStreamSplitDecode( state, ::arrow::util::internal::ByteStreamSplitDecodeSimd128); @@ -572,6 +602,11 @@ static void BM_ByteStreamSplitDecode_Double_Neon(benchmark::State& state) { state, ::arrow::util::internal::ByteStreamSplitDecodeSimd128); } +static void BM_ByteStreamSplitEncode_Int16_Neon(benchmark::State& state) { + BM_ByteStreamSplitEncode( + state, ::arrow::util::internal::ByteStreamSplitEncodeSimd128); +} + static void BM_ByteStreamSplitEncode_Float_Neon(benchmark::State& state) { BM_ByteStreamSplitEncode( state, 
::arrow::util::internal::ByteStreamSplitEncodeSimd128); @@ -582,8 +617,10 @@ static void BM_ByteStreamSplitEncode_Double_Neon(benchmark::State& state) { state, ::arrow::util::internal::ByteStreamSplitEncodeSimd128); } +BENCHMARK(BM_ByteStreamSplitDecode_Int16_Neon)->Range(MIN_RANGE, MAX_RANGE); BENCHMARK(BM_ByteStreamSplitDecode_Float_Neon)->Range(MIN_RANGE, MAX_RANGE); BENCHMARK(BM_ByteStreamSplitDecode_Double_Neon)->Range(MIN_RANGE, MAX_RANGE); +BENCHMARK(BM_ByteStreamSplitEncode_Int16_Neon)->Range(MIN_RANGE, MAX_RANGE); BENCHMARK(BM_ByteStreamSplitEncode_Float_Neon)->Range(MIN_RANGE, MAX_RANGE); BENCHMARK(BM_ByteStreamSplitEncode_Double_Neon)->Range(MIN_RANGE, MAX_RANGE); #endif From c8765deb010f288418900c5ffb5aea51740e080c Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Thu, 19 Jun 2025 10:28:27 +0200 Subject: [PATCH 14/19] Fix int16_t byte stream split benchmarks --- cpp/src/parquet/encoding_benchmark.cc | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/cpp/src/parquet/encoding_benchmark.cc b/cpp/src/parquet/encoding_benchmark.cc index 6392f533785..1ac8ce05701 100644 --- a/cpp/src/parquet/encoding_benchmark.cc +++ b/cpp/src/parquet/encoding_benchmark.cc @@ -510,7 +510,7 @@ BENCHMARK(BM_ByteStreamSplitEncode_Double_Scalar)->Apply(ByteStreamSplitApply); #if defined(ARROW_HAVE_SSE4_2) static void BM_ByteStreamSplitDecode_Int16_Sse2(benchmark::State& state) { - BM_ByteStreamSplitDecode( + BM_ByteStreamSplitDecode( state, ::arrow::util::internal::ByteStreamSplitDecodeSimd128); } @@ -525,7 +525,7 @@ static void BM_ByteStreamSplitDecode_Double_Sse2(benchmark::State& state) { } static void BM_ByteStreamSplitEncode_Int16_Sse2(benchmark::State& state) { - BM_ByteStreamSplitEncode( + BM_ByteStreamSplitEncode( state, ::arrow::util::internal::ByteStreamSplitEncodeSimd128); } @@ -549,7 +549,7 @@ BENCHMARK(BM_ByteStreamSplitEncode_Double_Sse2)->Apply(ByteStreamSplitApply); #if defined(ARROW_HAVE_AVX2) static void 
BM_ByteStreamSplitDecode_Int16_Avx2(benchmark::State& state) { - BM_ByteStreamSplitDecode( + BM_ByteStreamSplitDecode( state, ::arrow::util::internal::ByteStreamSplitDecodeAvx2); } @@ -564,7 +564,7 @@ static void BM_ByteStreamSplitDecode_Double_Avx2(benchmark::State& state) { } static void BM_ByteStreamSplitEncode_Int16_Avx2(benchmark::State& state) { - BM_ByteStreamSplitEncode( + BM_ByteStreamSplitEncode( state, ::arrow::util::internal::ByteStreamSplitEncodeAvx2); } From 71f0dc0d4b3ca816af7a36e70aa0e96ac4a866ed Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Thu, 19 Jun 2025 14:02:55 +0200 Subject: [PATCH 15/19] Remove misleading benchmarks --- .../arrow/util/byte_stream_split_internal.h | 8 ++------ cpp/src/parquet/encoding_benchmark.cc | 18 ------------------ 2 files changed, 2 insertions(+), 24 deletions(-) diff --git a/cpp/src/arrow/util/byte_stream_split_internal.h b/cpp/src/arrow/util/byte_stream_split_internal.h index 5223c9f2a95..39f44276887 100644 --- a/cpp/src/arrow/util/byte_stream_split_internal.h +++ b/cpp/src/arrow/util/byte_stream_split_internal.h @@ -355,13 +355,9 @@ template void ByteStreamSplitEncodeAvx2(const uint8_t* raw_values, int width, const int64_t num_values, uint8_t* output_buffer_raw) { assert(width == kNumStreams); - static_assert(kNumStreams == 4 || kNumStreams == 8, "Invalid number of streams."); + static_assert(kNumStreams == 4, "Invalid number of streams."); constexpr int kBlockSize = sizeof(__m256i) * kNumStreams; - if constexpr (kNumStreams == 8) // Back to SSE, currently no path for double. 
- return ByteStreamSplitEncodeSimd128(raw_values, width, num_values, - output_buffer_raw); - const int64_t size = num_values * kNumStreams; if (size < kBlockSize) // Back to SSE for small size return ByteStreamSplitEncodeSimd128(raw_values, width, num_values, @@ -449,7 +445,7 @@ void inline ByteStreamSplitEncodeSimd(const uint8_t* raw_values, int width, uint8_t* output_buffer_raw) { # if defined(ARROW_HAVE_AVX2) // Not implemented - if constexpr (kNumStreams == 2) { + if constexpr (kNumStreams == 2 || kNumStreams == 8) { return ByteStreamSplitEncodeSimd128(raw_values, width, num_values, output_buffer_raw); } else { diff --git a/cpp/src/parquet/encoding_benchmark.cc b/cpp/src/parquet/encoding_benchmark.cc index 1ac8ce05701..aee30bf7623 100644 --- a/cpp/src/parquet/encoding_benchmark.cc +++ b/cpp/src/parquet/encoding_benchmark.cc @@ -548,11 +548,6 @@ BENCHMARK(BM_ByteStreamSplitEncode_Double_Sse2)->Apply(ByteStreamSplitApply); #endif #if defined(ARROW_HAVE_AVX2) -static void BM_ByteStreamSplitDecode_Int16_Avx2(benchmark::State& state) { - BM_ByteStreamSplitDecode( - state, ::arrow::util::internal::ByteStreamSplitDecodeAvx2); -} - static void BM_ByteStreamSplitDecode_Float_Avx2(benchmark::State& state) { BM_ByteStreamSplitDecode( state, ::arrow::util::internal::ByteStreamSplitDecodeAvx2); @@ -563,27 +558,14 @@ static void BM_ByteStreamSplitDecode_Double_Avx2(benchmark::State& state) { state, ::arrow::util::internal::ByteStreamSplitDecodeAvx2); } -static void BM_ByteStreamSplitEncode_Int16_Avx2(benchmark::State& state) { - BM_ByteStreamSplitEncode( - state, ::arrow::util::internal::ByteStreamSplitEncodeAvx2); -} - static void BM_ByteStreamSplitEncode_Float_Avx2(benchmark::State& state) { BM_ByteStreamSplitEncode( state, ::arrow::util::internal::ByteStreamSplitEncodeAvx2); } -static void BM_ByteStreamSplitEncode_Double_Avx2(benchmark::State& state) { - BM_ByteStreamSplitEncode( - state, ::arrow::util::internal::ByteStreamSplitEncodeAvx2); -} - 
-BENCHMARK(BM_ByteStreamSplitDecode_Int16_Avx2)->Apply(ByteStreamSplitApply); BENCHMARK(BM_ByteStreamSplitDecode_Float_Avx2)->Apply(ByteStreamSplitApply); BENCHMARK(BM_ByteStreamSplitDecode_Double_Avx2)->Apply(ByteStreamSplitApply); -BENCHMARK(BM_ByteStreamSplitEncode_Int16_Avx2)->Apply(ByteStreamSplitApply); BENCHMARK(BM_ByteStreamSplitEncode_Float_Avx2)->Apply(ByteStreamSplitApply); -BENCHMARK(BM_ByteStreamSplitEncode_Double_Avx2)->Apply(ByteStreamSplitApply); #endif #if defined(ARROW_HAVE_NEON) From a40523313d515461357f0aec43190770a27e82c7 Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Thu, 19 Jun 2025 17:15:55 +0200 Subject: [PATCH 16/19] Fix casing --- cpp/src/arrow/util/byte_stream_split_internal.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/cpp/src/arrow/util/byte_stream_split_internal.h b/cpp/src/arrow/util/byte_stream_split_internal.h index 39f44276887..63729501cdd 100644 --- a/cpp/src/arrow/util/byte_stream_split_internal.h +++ b/cpp/src/arrow/util/byte_stream_split_internal.h @@ -199,13 +199,13 @@ void ByteStreamSplitEncodeSimd128(const uint8_t* raw_values, int width, constexpr int kNumStepsLarge = ReversePow2(static_cast(simd_batch::size) / kNumBytes); // Total number of steps - constexpr int NumSteps = kNumStepsByte + kNumStepsLarge; + constexpr int kNumSteps = kNumStepsByte + kNumStepsLarge; // Two step shuffling algorithm that starts with bytes and ends with a larger data type. // An algorithm similar to the decoding one with log2(simd_batch::size) + 1 stages is // also valid but not as performant. for (int64_t block_index = 0; block_index < num_blocks; ++block_index) { - simd_batch stage[NumSteps + 1][kNumStreams]; + simd_batch stage[kNumSteps + 1][kNumStreams]; // First copy the data to stage 0. for (int i = 0; i < kNumStreams; ++i) { @@ -245,7 +245,7 @@ void ByteStreamSplitEncodeSimd128(const uint8_t* raw_values, int width, // // 4: A0A1A2A3 A4A5A6A7 A8A9AAAB ACADAEAF | B0B1B2B3 B4B5B6B7 B8B9BABB BCBDBEBF | ... 
constexpr int kNumStreamsHalf = kNumStreams / 2; - for (int step = kNumStepsByte; step < NumSteps; ++step) { + for (int step = kNumStepsByte; step < kNumSteps; ++step) { for (int i = 0; i < kNumStreamsHalf; ++i) { stage[step + 1][i * 2] = zip_lo_n(stage[step][i], stage[step][i + kNumStreamsHalf]); @@ -257,7 +257,7 @@ void ByteStreamSplitEncodeSimd128(const uint8_t* raw_values, int width, // Save the encoded data to the output buffer for (int i = 0; i < kNumStreams; ++i) { xsimd::store_unaligned(&output_buffer_streams[i][block_index * simd_batch::size], - stage[NumSteps][i]); + stage[kNumSteps][i]); } } } From 9a87321835ff7df54dbfd4b10a6cc8797999be6f Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Mon, 23 Jun 2025 17:38:12 +0200 Subject: [PATCH 17/19] Review comments --- .../arrow/util/byte_stream_split_internal.h | 20 ++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/cpp/src/arrow/util/byte_stream_split_internal.h b/cpp/src/arrow/util/byte_stream_split_internal.h index 63729501cdd..d7f05e44a16 100644 --- a/cpp/src/arrow/util/byte_stream_split_internal.h +++ b/cpp/src/arrow/util/byte_stream_split_internal.h @@ -59,9 +59,13 @@ void ByteStreamSplitDecodeSimd128(const uint8_t* data, int width, int64_t num_va int64_t stride, uint8_t* out) { using simd_batch = xsimd::make_sized_batch_t; + static_assert(kNumStreams <= simd_batch::size, + "The algorithm works when the number of streams is smaller than the SIMD " + "batch size."); assert(width == kNumStreams); constexpr int kNumStreamsLog2 = ReversePow2(kNumStreams); - static_assert(kNumStreamsLog2 != 0); + static_assert(kNumStreamsLog2 != 0, + "The algorithm works for a number of streams being a power of two."); constexpr int64_t kBlockSize = simd_batch::size * kNumStreams; const int64_t size = num_values * kNumStreams; @@ -135,7 +139,7 @@ struct grouped_bytes_impl<8> { template using grouped_bytes_t = typename grouped_bytes_impl::type; -// Like xsimd::zlip_lo, but zip groups of NBytes at 
once +// Like xsimd::zip_lo, but zip groups of kNumBytes at once. template > auto zip_lo_n(Batch const& a, Batch const& b) -> Batch { @@ -148,7 +152,7 @@ auto zip_lo_n(Batch const& a, Batch const& b) -> Batch { } } -// Like xsimd::zlip_hi, but zip groups of NBytes at once +// Like xsimd::zip_hi, but zip groups of kNumBytes at once. template > auto zip_hi_n(Batch const& a, Batch const& b) -> Batch { @@ -167,7 +171,12 @@ void ByteStreamSplitEncodeSimd128(const uint8_t* raw_values, int width, using simd_batch = xsimd::make_sized_batch_t; assert(width == kNumStreams); + static_assert(kNumStreams <= simd_batch::size, + "The algorithm works when the number of streams is smaller than the SIMD " + "batch size."); constexpr int kBlockSize = simd_batch::size * kNumStreams; + static_assert(ReversePow2(kNumStreams) != 0, + "The algorithm works for a number of streams being a power of two."); const int64_t size = num_values * kNumStreams; const int64_t num_blocks = size / kBlockSize; @@ -200,6 +209,7 @@ void ByteStreamSplitEncodeSimd128(const uint8_t* raw_values, int width, ReversePow2(static_cast(simd_batch::size) / kNumBytes); // Total number of steps constexpr int kNumSteps = kNumStepsByte + kNumStepsLarge; + static_assert(kNumSteps == ReversePow2(simd_batch::size)); // Two step shuffling algorithm that starts with bytes and ends with a larger data type. // An algorithm similar to the decoding one with log2(simd_batch::size) + 1 stages is @@ -228,7 +238,8 @@ void ByteStreamSplitEncodeSimd128(const uint8_t* raw_values, int width, // which uses the shuffle intrinsics. 
// // Loop order does not matter so we prefer higher locality - for (int i = 0; i < kNumStreams / 2; ++i) { + constexpr int kNumStreamsHalf = kNumStreams / 2; + for (int i = 0; i < kNumStreamsHalf; ++i) { for (int step = 0; step < kNumStepsByte; ++step) { stage[step + 1][i * 2] = xsimd::zip_lo(stage[step][i * 2], stage[step][i * 2 + 1]); @@ -244,7 +255,6 @@ void ByteStreamSplitEncodeSimd128(const uint8_t* raw_values, int width, // The large data type is int64_t with NumBytes=8 bytes: // // 4: A0A1A2A3 A4A5A6A7 A8A9AAAB ACADAEAF | B0B1B2B3 B4B5B6B7 B8B9BABB BCBDBEBF | ... - constexpr int kNumStreamsHalf = kNumStreams / 2; for (int step = kNumStepsByte; step < kNumSteps; ++step) { for (int i = 0; i < kNumStreamsHalf; ++i) { stage[step + 1][i * 2] = From 754f3ab246f62aea6fcf74c92b00b4dff3d7b516 Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Mon, 23 Jun 2025 17:48:22 +0200 Subject: [PATCH 18/19] Fix and extend tests --- cpp/src/arrow/util/byte_stream_split_test.cc | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/cpp/src/arrow/util/byte_stream_split_test.cc b/cpp/src/arrow/util/byte_stream_split_test.cc index 9755cd8b8d0..ec995a0a9aa 100644 --- a/cpp/src/arrow/util/byte_stream_split_test.cc +++ b/cpp/src/arrow/util/byte_stream_split_test.cc @@ -136,7 +136,7 @@ class TestByteStreamSplitSpecialized : public ::testing::Test { return input; } - template + template static std::vector MakeDecodeFuncs() { std::vector funcs; funcs.push_back({"scalar_dynamic", &ByteStreamSplitDecodeScalarDynamic}); @@ -146,7 +146,10 @@ class TestByteStreamSplitSpecialized : public ::testing::Test { funcs.push_back({"simd", &ByteStreamSplitDecodeSimd}); funcs.push_back({"simd128", &ByteStreamSplitDecodeSimd128}); # if defined(ARROW_HAVE_AVX2) - funcs.push_back({"avx2", &ByteStreamSplitDecodeAvx2}); + // The only available implementations + if constexpr (kWidth == 4 || kWidth == 8) { + funcs.push_back({"avx2", &ByteStreamSplitDecodeAvx2}); + } # endif } #endif // 
defined(ARROW_HAVE_SIMD_SPLIT) @@ -164,7 +167,10 @@ class TestByteStreamSplitSpecialized : public ::testing::Test { funcs.push_back({"simd", &ByteStreamSplitEncodeSimd}); funcs.push_back({"simd128", &ByteStreamSplitEncodeSimd128}); # if defined(ARROW_HAVE_AVX2) - funcs.push_back({"avx2", &ByteStreamSplitEncodeAvx2}); + // The only available implementation + if constexpr (kWidth == 4) { + funcs.push_back({"avx2", &ByteStreamSplitEncodeAvx2}); + } # endif } #endif // defined(ARROW_HAVE_SIMD_SPLIT) From 3ea4dd3de5bd4618af53e73fb37139118101c69b Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Tue, 24 Jun 2025 09:18:11 +0200 Subject: [PATCH 19/19] Move and rename SizedInt to type_traits --- .../arrow/util/byte_stream_split_internal.h | 40 +++++-------------- cpp/src/arrow/util/type_traits.h | 27 +++++++++++++ 2 files changed, 36 insertions(+), 31 deletions(-) diff --git a/cpp/src/arrow/util/byte_stream_split_internal.h b/cpp/src/arrow/util/byte_stream_split_internal.h index d7f05e44a16..2eb678cbfbe 100644 --- a/cpp/src/arrow/util/byte_stream_split_internal.h +++ b/cpp/src/arrow/util/byte_stream_split_internal.h @@ -20,6 +20,7 @@ #include "arrow/util/endian.h" #include "arrow/util/simd.h" #include "arrow/util/small_vector.h" +#include "arrow/util/type_traits.h" #include "arrow/util/ubsan.h" #include @@ -112,43 +113,18 @@ void ByteStreamSplitDecodeSimd128(const uint8_t* data, int width, int64_t num_va } } -template -struct grouped_bytes_impl; - -template <> -struct grouped_bytes_impl<1> { - using type = int8_t; -}; - -template <> -struct grouped_bytes_impl<2> { - using type = int16_t; -}; - -template <> -struct grouped_bytes_impl<4> { - using type = int32_t; -}; - -template <> -struct grouped_bytes_impl<8> { - using type = int64_t; -}; - -// Map a number of bytes to a type -template -using grouped_bytes_t = typename grouped_bytes_impl::type; - // Like xsimd::zip_lo, but zip groups of kNumBytes at once. 
template > auto zip_lo_n(Batch const& a, Batch const& b) -> Batch { + using arrow::internal::SizedInt; + if constexpr (kNumBytes == kBatchSize) { return a; } else { return xsimd::bitwise_cast( - xsimd::zip_lo(xsimd::bitwise_cast>(a), - xsimd::bitwise_cast>(b))); + xsimd::zip_lo(xsimd::bitwise_cast>(a), + xsimd::bitwise_cast>(b))); } } @@ -156,12 +132,14 @@ auto zip_lo_n(Batch const& a, Batch const& b) -> Batch { template > auto zip_hi_n(Batch const& a, Batch const& b) -> Batch { + using arrow::internal::SizedInt; + if constexpr (kNumBytes == kBatchSize) { return b; } else { return xsimd::bitwise_cast( - xsimd::zip_hi(xsimd::bitwise_cast>(a), - xsimd::bitwise_cast>(b))); + xsimd::zip_hi(xsimd::bitwise_cast>(a), + xsimd::bitwise_cast>(b))); } } diff --git a/cpp/src/arrow/util/type_traits.h b/cpp/src/arrow/util/type_traits.h index c1906152423..9c3b388dab2 100644 --- a/cpp/src/arrow/util/type_traits.h +++ b/cpp/src/arrow/util/type_traits.h @@ -42,5 +42,32 @@ template struct is_null_pointer : std::is_same::type> { }; +template +struct SizedIntImpl; + +template <> +struct SizedIntImpl<1> { + using type = int8_t; +}; + +template <> +struct SizedIntImpl<2> { + using type = int16_t; +}; + +template <> +struct SizedIntImpl<4> { + using type = int32_t; +}; + +template <> +struct SizedIntImpl<8> { + using type = int64_t; +}; + +// Map a number of bytes to a type +template +using SizedInt = typename SizedIntImpl::type; + } // namespace internal } // namespace arrow