From 5816eaf34853f768353ae7f43be9267a8d7a221d Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Tue, 10 Jun 2025 16:09:41 +0200 Subject: [PATCH 01/19] Enable ByteStreamSplitDecodeSimd128<2> --- .../arrow/util/byte_stream_split_internal.h | 20 ++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/cpp/src/arrow/util/byte_stream_split_internal.h b/cpp/src/arrow/util/byte_stream_split_internal.h index d3214239ff9..7d091e38f83 100644 --- a/cpp/src/arrow/util/byte_stream_split_internal.h +++ b/cpp/src/arrow/util/byte_stream_split_internal.h @@ -39,6 +39,20 @@ namespace arrow::util::internal { // SIMD implementations // +template +constexpr T ReversePow2(T x) { + for (T n = 0, y = 1; n <= (8 * static_cast(sizeof(T))); ++n, y = y * 2) { + if (y == x) { + return n; + } + } + return 0; +} + +static_assert(ReversePow2(8) == 3); +static_assert(ReversePow2(4) == 2); +static_assert(ReversePow2(2) == 1); + #if defined(ARROW_HAVE_NEON) || defined(ARROW_HAVE_SSE4_2) template void ByteStreamSplitDecodeSimd128(const uint8_t* data, int width, int64_t num_values, @@ -46,8 +60,8 @@ void ByteStreamSplitDecodeSimd128(const uint8_t* data, int width, int64_t num_va using simd_batch = xsimd::make_sized_batch_t; assert(width == kNumStreams); - static_assert(kNumStreams == 4 || kNumStreams == 8, "Invalid number of streams."); - constexpr int kNumStreamsLog2 = (kNumStreams == 8 ? 
3 : 2); + constexpr int kNumStreamsLog2 = ReversePow2(kNumStreams); + static_assert(kNumStreamsLog2 != 0); constexpr int64_t kBlockSize = sizeof(simd_batch) * kNumStreams; const int64_t size = num_values * kNumStreams; @@ -579,7 +593,7 @@ inline void ByteStreamSplitDecode(const uint8_t* data, int width, int64_t num_va memcpy(out, data, num_values); return; case 2: - return ByteStreamSplitDecodeScalar<2>(data, width, num_values, stride, out); + return ByteStreamSplitDecodePerhapsSimd<2>(data, width, num_values, stride, out); case 4: return ByteStreamSplitDecodePerhapsSimd<4>(data, width, num_values, stride, out); case 8: From 0b68f4432e9210ac403c58270cdc40a481725f0e Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Wed, 11 Jun 2025 18:03:10 +0200 Subject: [PATCH 02/19] Refactor ByteStreamSplitEncodeSimd128 to be generic over num_streams --- .../arrow/util/byte_stream_split_internal.h | 170 +++++++++++------- 1 file changed, 102 insertions(+), 68 deletions(-) diff --git a/cpp/src/arrow/util/byte_stream_split_internal.h b/cpp/src/arrow/util/byte_stream_split_internal.h index 7d091e38f83..cd17e16bc46 100644 --- a/cpp/src/arrow/util/byte_stream_split_internal.h +++ b/cpp/src/arrow/util/byte_stream_split_internal.h @@ -109,6 +109,51 @@ void ByteStreamSplitDecodeSimd128(const uint8_t* data, int width, int64_t num_va } } +template +struct grouped_bytes_impl; + +template <> +struct grouped_bytes_impl<1> { + using type = int8_t; +}; + +template <> +struct grouped_bytes_impl<2> { + using type = int16_t; +}; + +template <> +struct grouped_bytes_impl<4> { + using type = int32_t; +}; + +template <> +struct grouped_bytes_impl<8> { + using type = int64_t; +}; + +// Map a number of bytes to a type +template +using grouped_bytes_t = typename grouped_bytes_impl::type; + +// Like xsimd::zlip_lo, but zip groups of NBytes at once +template > +auto zip_lo_n(Batch const& a, Batch const& b) -> Batch { + return xsimd::bitwise_cast( + xsimd::zip_lo(xsimd::bitwise_cast>(a), + 
xsimd::bitwise_cast>(b))); +} + +// Like xsimd::zlip_hi, but zip groups of NBytes at once +template > +auto zip_hi_n(Batch const& a, Batch const& b) -> Batch { + return xsimd::bitwise_cast( + xsimd::zip_hi(xsimd::bitwise_cast>(a), + xsimd::bitwise_cast>(b))); +} + template void ByteStreamSplitEncodeSimd128(const uint8_t* raw_values, int width, const int64_t num_values, uint8_t* output_buffer_raw) { @@ -118,9 +163,6 @@ void ByteStreamSplitEncodeSimd128(const uint8_t* raw_values, int width, static_assert(kNumStreams == 4 || kNumStreams == 8, "Invalid number of streams."); constexpr int kBlockSize = sizeof(simd_batch) * kNumStreams; - simd_batch stage[3][kNumStreams]; - simd_batch final_result[kNumStreams]; - const int64_t size = num_values * kNumStreams; const int64_t num_blocks = size / kBlockSize; int8_t* output_buffer_streams[kNumStreams]; @@ -137,92 +179,84 @@ void ByteStreamSplitEncodeSimd128(const uint8_t* raw_values, int width, output_buffer_raw[j * num_values + i] = byte_in_value; } } - // The current shuffling algorithm diverges for float and double types but the compiler - // should be able to remove the branch since only one path is taken for each template - // instantiation. - // Example run for 32-bit variables: - // Step 0: copy from unaligned input bytes: - // 0: ABCD ABCD ABCD ABCD 1: ABCD ABCD ABCD ABCD ... - // Step 1: simd_batch::zip_lo and simd_batch::zip_hi: - // 0: AABB CCDD AABB CCDD 1: AABB CCDD AABB CCDD ... - // Step 2: apply simd_batch::zip_lo and simd_batch::zip_hi again: - // 0: AAAA BBBB CCCC DDDD 1: AAAA BBBB CCCC DDDD ... - // Step 3: simd_batch::zip_lo and simd_batch::zip_hi: - // 0: AAAA AAAA BBBB BBBB 1: CCCC CCCC DDDD DDDD ... - // Step 4: simd_batch::zip_lo and simd_batch::zip_hi: - // 0: AAAA AAAA AAAA AAAA 1: BBBB BBBB BBBB BBBB ... 
+ + // Number of input values we can fit in a simd register + constexpr int NumValuesInBatch = sizeof(simd_batch) / kNumStreams; + static_assert(NumValuesInBatch > 0); + // Number of bytes we'll bring together in the first byte-level part of the algorithm. + // Since we zip with the next batch, the number of values in a batch determines how many + // bytes end up together before we can use a larger type + constexpr int NumBytes = 2 * NumValuesInBatch; + // Number of steps in the first part of the algorithm with byte-level zipping + constexpr int NumStepsByte = ReversePow2(NumValuesInBatch) + 1; + + simd_batch stage[NumStepsByte + 1][kNumStreams]; + + // Two step shuffling algorithm that starts with bytes and ends with a larger data type. + // An algorithm similar to the decoding one with log2(sizeof(simd_batch)) + 1 stages is + // also valid but not as performant. for (int64_t block_index = 0; block_index < num_blocks; ++block_index) { // First copy the data to stage 0. for (int i = 0; i < kNumStreams; ++i) { stage[0][i] = simd_batch::load_unaligned( - reinterpret_cast(raw_values) + - (block_index * kNumStreams + i) * sizeof(simd_batch)); + &raw_values[(block_index * kNumStreams + i) * sizeof(simd_batch)]); } + // We first make byte-level shuffling, until we have gather enough bytes together + // and in the correct order to use a bigger data type. + // + // clang-format off + // Stage 0: A0B0C0D0 A1B1C1D1 A2B2C2D2 A3B3C3D3 | A4B4C4D4 A5B5C5D5 A6B6C6D6 A7B7C7D7 | ... + // Stage 1: A0A4B0B4 C0C4D0D4 A1A5B1B5 C1C5D1D5 | A2A6B2B6 C2C6D2D6 A3A7B3B7 C3C7D3D7 | ... + // Stage 2: A0A2A4A6 B0B2B4B6 C0C2C4C6 D0D2D4D6 | A1A3A5A7 B1B3B5B7 C1C3C5C7 D1D3D5D7 | ... + // Stage 3: A0A1A2A3 A4A5A6A7 B0B1B2B3 B4B5B6B7 | C0C1C2C3 C4C5C6C7 D0D1D2D3 D4D5D6D7 | ... + // clang-format on + // // The shuffling of bytes is performed through the unpack intrinsics. // In my measurements this gives better performance then an implementation // which uses the shuffle intrinsics. 
- for (int stage_lvl = 0; stage_lvl < 2; ++stage_lvl) { - for (int i = 0; i < kNumStreams / 2; ++i) { - stage[stage_lvl + 1][i * 2] = - xsimd::zip_lo(stage[stage_lvl][i * 2], stage[stage_lvl][i * 2 + 1]); - stage[stage_lvl + 1][i * 2 + 1] = - xsimd::zip_hi(stage[stage_lvl][i * 2], stage[stage_lvl][i * 2 + 1]); + // + // Loop order does not matter so we prefer higher locality + for (int i = 0; i < kNumStreams / 2; ++i) { + for (int step = 0; step < NumStepsByte; ++step) { + stage[step + 1][i * 2] = + xsimd::zip_lo(stage[step][i * 2], stage[step][i * 2 + 1]); + stage[step + 1][i * 2 + 1] = + xsimd::zip_hi(stage[step][i * 2], stage[step][i * 2 + 1]); } } + + // We know have the bytes packed in a larger data type and in the correct order to + // start using a bigger data type + // + // Example run for 32-bit variables it's int64_t with NumBytes=8 bytes: + // + // clang-format off + // Stage 4: A0A1A2A3 A4A5A6A7 A8A9AAAB ACADAEAF | B0B1B2B3 B4B5B6B7 B8B9BABB BCBDBEBF | ... + // clang-format on + simd_batch final_result[kNumStreams]; if constexpr (kNumStreams == 8) { - // This is the path for 64bits data. 
simd_batch tmp[8]; - using int32_batch = xsimd::make_sized_batch_t; - // This is a workaround, see: https://github.com/xtensor-stack/xsimd/issues/735 - auto from_int32_batch = [](int32_batch from) -> simd_batch { - simd_batch dest; - memcpy(&dest, &from, sizeof(simd_batch)); - return dest; - }; - auto to_int32_batch = [](simd_batch from) -> int32_batch { - int32_batch dest; - memcpy(&dest, &from, sizeof(simd_batch)); - return dest; - }; for (int i = 0; i < 4; ++i) { - tmp[i * 2] = from_int32_batch( - xsimd::zip_lo(to_int32_batch(stage[2][i]), to_int32_batch(stage[2][i + 4]))); - tmp[i * 2 + 1] = from_int32_batch( - xsimd::zip_hi(to_int32_batch(stage[2][i]), to_int32_batch(stage[2][i + 4]))); + tmp[i * 2] = + zip_lo_n(stage[NumStepsByte][i], stage[NumStepsByte][i + 4]); + tmp[i * 2 + 1] = + zip_hi_n(stage[NumStepsByte][i], stage[NumStepsByte][i + 4]); } for (int i = 0; i < 4; ++i) { - final_result[i * 2] = from_int32_batch( - xsimd::zip_lo(to_int32_batch(tmp[i]), to_int32_batch(tmp[i + 4]))); - final_result[i * 2 + 1] = from_int32_batch( - xsimd::zip_hi(to_int32_batch(tmp[i]), to_int32_batch(tmp[i + 4]))); + final_result[i * 2] = zip_lo_n(tmp[i], tmp[i + 4]); + final_result[i * 2 + 1] = zip_hi_n(tmp[i], tmp[i + 4]); } } else { - // This is the path for 32bits data. 
- using int64_batch = xsimd::make_sized_batch_t; - // This is a workaround, see: https://github.com/xtensor-stack/xsimd/issues/735 - auto from_int64_batch = [](int64_batch from) -> simd_batch { - simd_batch dest; - memcpy(&dest, &from, sizeof(simd_batch)); - return dest; - }; - auto to_int64_batch = [](simd_batch from) -> int64_batch { - int64_batch dest; - memcpy(&dest, &from, sizeof(simd_batch)); - return dest; - }; - simd_batch tmp[4]; for (int i = 0; i < 2; ++i) { - tmp[i * 2] = xsimd::zip_lo(stage[2][i * 2], stage[2][i * 2 + 1]); - tmp[i * 2 + 1] = xsimd::zip_hi(stage[2][i * 2], stage[2][i * 2 + 1]); - } - for (int i = 0; i < 2; ++i) { - final_result[i * 2] = from_int64_batch( - xsimd::zip_lo(to_int64_batch(tmp[i]), to_int64_batch(tmp[i + 2]))); - final_result[i * 2 + 1] = from_int64_batch( - xsimd::zip_hi(to_int64_batch(tmp[i]), to_int64_batch(tmp[i + 2]))); + final_result[i * 2] = + zip_lo_n(stage[NumStepsByte][i], stage[NumStepsByte][i + 2]); + final_result[i * 2 + 1] = + zip_hi_n(stage[NumStepsByte][i], stage[NumStepsByte][i + 2]); } } + + // Save the encoded data to the output buffer for (int i = 0; i < kNumStreams; ++i) { xsimd::store_unaligned(&output_buffer_streams[i][block_index * sizeof(simd_batch)], final_result[i]); From 82ac6dcbb222ddc77a245f16ac85251ee5a64bba Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Thu, 12 Jun 2025 09:51:34 +0200 Subject: [PATCH 03/19] Remove disjunction in ByteStreamSplitEncodeSimd128 --- .../arrow/util/byte_stream_split_internal.h | 36 ++++++++----------- 1 file changed, 14 insertions(+), 22 deletions(-) diff --git a/cpp/src/arrow/util/byte_stream_split_internal.h b/cpp/src/arrow/util/byte_stream_split_internal.h index cd17e16bc46..4b5cdcaf035 100644 --- a/cpp/src/arrow/util/byte_stream_split_internal.h +++ b/cpp/src/arrow/util/byte_stream_split_internal.h @@ -189,13 +189,17 @@ void ByteStreamSplitEncodeSimd128(const uint8_t* raw_values, int width, constexpr int NumBytes = 2 * NumValuesInBatch; // Number of steps in the 
first part of the algorithm with byte-level zipping constexpr int NumStepsByte = ReversePow2(NumValuesInBatch) + 1; - - simd_batch stage[NumStepsByte + 1][kNumStreams]; + // Number of steps in the first part of the algorithm with large data type zipping + constexpr int NumStepsLarge = ReversePow2(sizeof(simd_batch) / NumBytes); + // Total number of steps + constexpr int NumSteps = NumStepsByte + NumStepsLarge; // Two step shuffling algorithm that starts with bytes and ends with a larger data type. // An algorithm similar to the decoding one with log2(sizeof(simd_batch)) + 1 stages is // also valid but not as performant. for (int64_t block_index = 0; block_index < num_blocks; ++block_index) { + simd_batch stage[NumSteps + 1][kNumStreams]; + // First copy the data to stage 0. for (int i = 0; i < kNumStreams; ++i) { stage[0][i] = simd_batch::load_unaligned( @@ -234,32 +238,20 @@ void ByteStreamSplitEncodeSimd128(const uint8_t* raw_values, int width, // clang-format off // Stage 4: A0A1A2A3 A4A5A6A7 A8A9AAAB ACADAEAF | B0B1B2B3 B4B5B6B7 B8B9BABB BCBDBEBF | ... 
// clang-format on - simd_batch final_result[kNumStreams]; - if constexpr (kNumStreams == 8) { - simd_batch tmp[8]; - for (int i = 0; i < 4; ++i) { - tmp[i * 2] = - zip_lo_n(stage[NumStepsByte][i], stage[NumStepsByte][i + 4]); - tmp[i * 2 + 1] = - zip_hi_n(stage[NumStepsByte][i], stage[NumStepsByte][i + 4]); - } - for (int i = 0; i < 4; ++i) { - final_result[i * 2] = zip_lo_n(tmp[i], tmp[i + 4]); - final_result[i * 2 + 1] = zip_hi_n(tmp[i], tmp[i + 4]); - } - } else { - for (int i = 0; i < 2; ++i) { - final_result[i * 2] = - zip_lo_n(stage[NumStepsByte][i], stage[NumStepsByte][i + 2]); - final_result[i * 2 + 1] = - zip_hi_n(stage[NumStepsByte][i], stage[NumStepsByte][i + 2]); + constexpr int kNumStreamsHalf = kNumStreams / 2; + for (int step = NumStepsByte; step < NumSteps; ++step) { + for (int i = 0; i < kNumStreamsHalf; ++i) { + stage[step + 1][i * 2] = + zip_lo_n(stage[step][i], stage[step][i + kNumStreamsHalf]); + stage[step + 1][i * 2 + 1] = + zip_hi_n(stage[step][i], stage[step][i + kNumStreamsHalf]); } } // Save the encoded data to the output buffer for (int i = 0; i < kNumStreams; ++i) { xsimd::store_unaligned(&output_buffer_streams[i][block_index * sizeof(simd_batch)], - final_result[i]); + stage[NumSteps][i]); } } } From 2ebe8a5a0bdd1587978327e231383bc0ee14883e Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Thu, 12 Jun 2025 10:10:03 +0200 Subject: [PATCH 04/19] Enable ByteStreamSplitEncodeSimd128<2> --- .../arrow/util/byte_stream_split_internal.h | 25 ++++++++++++------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/cpp/src/arrow/util/byte_stream_split_internal.h b/cpp/src/arrow/util/byte_stream_split_internal.h index 4b5cdcaf035..0af5e850600 100644 --- a/cpp/src/arrow/util/byte_stream_split_internal.h +++ b/cpp/src/arrow/util/byte_stream_split_internal.h @@ -140,18 +140,26 @@ using grouped_bytes_t = typename grouped_bytes_impl::type; template > auto zip_lo_n(Batch const& a, Batch const& b) -> Batch { - return xsimd::bitwise_cast( - 
xsimd::zip_lo(xsimd::bitwise_cast>(a), - xsimd::bitwise_cast>(b))); + if constexpr (NBytes == BatchSize) { + return a; + } else { + return xsimd::bitwise_cast( + xsimd::zip_lo(xsimd::bitwise_cast>(a), + xsimd::bitwise_cast>(b))); + } } // Like xsimd::zlip_hi, but zip groups of NBytes at once template > auto zip_hi_n(Batch const& a, Batch const& b) -> Batch { - return xsimd::bitwise_cast( - xsimd::zip_hi(xsimd::bitwise_cast>(a), - xsimd::bitwise_cast>(b))); + if constexpr (NBytes == BatchSize) { + return b; + } else { + return xsimd::bitwise_cast( + xsimd::zip_hi(xsimd::bitwise_cast>(a), + xsimd::bitwise_cast>(b))); + } } template @@ -160,7 +168,6 @@ void ByteStreamSplitEncodeSimd128(const uint8_t* raw_values, int width, using simd_batch = xsimd::make_sized_batch_t; assert(width == kNumStreams); - static_assert(kNumStreams == 4 || kNumStreams == 8, "Invalid number of streams."); constexpr int kBlockSize = sizeof(simd_batch) * kNumStreams; const int64_t size = num_values * kNumStreams; @@ -595,7 +602,7 @@ inline void ByteStreamSplitEncode(const uint8_t* raw_values, int width, memcpy(out, raw_values, num_values); return; case 2: - return ByteStreamSplitEncodeScalar<2>(raw_values, width, num_values, out); + return ByteStreamSplitEncodeSimd128<2>(raw_values, width, num_values, out); case 4: return ByteStreamSplitEncodePerhapsSimd<4>(raw_values, width, num_values, out); case 8: @@ -619,7 +626,7 @@ inline void ByteStreamSplitDecode(const uint8_t* data, int width, int64_t num_va memcpy(out, data, num_values); return; case 2: - return ByteStreamSplitDecodePerhapsSimd<2>(data, width, num_values, stride, out); + return ByteStreamSplitDecodeSimd128<2>(data, width, num_values, stride, out); case 4: return ByteStreamSplitDecodePerhapsSimd<4>(data, width, num_values, stride, out); case 8: From ca35643e5f217f3ad0451bcaec364fc2ddbcfd5b Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Thu, 12 Jun 2025 10:49:48 +0200 Subject: [PATCH 05/19] Fix conversion warning --- 
cpp/src/arrow/util/byte_stream_split_internal.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/arrow/util/byte_stream_split_internal.h b/cpp/src/arrow/util/byte_stream_split_internal.h index 0af5e850600..40a4632434e 100644 --- a/cpp/src/arrow/util/byte_stream_split_internal.h +++ b/cpp/src/arrow/util/byte_stream_split_internal.h @@ -197,7 +197,7 @@ void ByteStreamSplitEncodeSimd128(const uint8_t* raw_values, int width, // Number of steps in the first part of the algorithm with byte-level zipping constexpr int NumStepsByte = ReversePow2(NumValuesInBatch) + 1; // Number of steps in the first part of the algorithm with large data type zipping - constexpr int NumStepsLarge = ReversePow2(sizeof(simd_batch) / NumBytes); + constexpr int NumStepsLarge = ReversePow2(static_cast(sizeof(simd_batch)) / NumBytes); // Total number of steps constexpr int NumSteps = NumStepsByte + NumStepsLarge; From 17fe80b5ef6d9c8a9cffecac9df3303f1d129e5f Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Thu, 12 Jun 2025 11:49:13 +0200 Subject: [PATCH 06/19] Fmt --- cpp/src/arrow/util/byte_stream_split_internal.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cpp/src/arrow/util/byte_stream_split_internal.h b/cpp/src/arrow/util/byte_stream_split_internal.h index 40a4632434e..66927b60806 100644 --- a/cpp/src/arrow/util/byte_stream_split_internal.h +++ b/cpp/src/arrow/util/byte_stream_split_internal.h @@ -197,7 +197,8 @@ void ByteStreamSplitEncodeSimd128(const uint8_t* raw_values, int width, // Number of steps in the first part of the algorithm with byte-level zipping constexpr int NumStepsByte = ReversePow2(NumValuesInBatch) + 1; // Number of steps in the first part of the algorithm with large data type zipping - constexpr int NumStepsLarge = ReversePow2(static_cast(sizeof(simd_batch)) / NumBytes); + constexpr int NumStepsLarge = + ReversePow2(static_cast(sizeof(simd_batch)) / NumBytes); // Total number of steps constexpr int NumSteps = NumStepsByte + 
NumStepsLarge; From dae5965b5537f2efac41a48f9083ca18a9ff611d Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Thu, 12 Jun 2025 12:29:36 +0200 Subject: [PATCH 07/19] Shorten comments --- .../arrow/util/byte_stream_split_internal.h | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/cpp/src/arrow/util/byte_stream_split_internal.h b/cpp/src/arrow/util/byte_stream_split_internal.h index 66927b60806..151566bd932 100644 --- a/cpp/src/arrow/util/byte_stream_split_internal.h +++ b/cpp/src/arrow/util/byte_stream_split_internal.h @@ -217,12 +217,12 @@ void ByteStreamSplitEncodeSimd128(const uint8_t* raw_values, int width, // We first make byte-level shuffling, until we have gather enough bytes together // and in the correct order to use a bigger data type. // - // clang-format off - // Stage 0: A0B0C0D0 A1B1C1D1 A2B2C2D2 A3B3C3D3 | A4B4C4D4 A5B5C5D5 A6B6C6D6 A7B7C7D7 | ... - // Stage 1: A0A4B0B4 C0C4D0D4 A1A5B1B5 C1C5D1D5 | A2A6B2B6 C2C6D2D6 A3A7B3B7 C3C7D3D7 | ... - // Stage 2: A0A2A4A6 B0B2B4B6 C0C2C4C6 D0D2D4D6 | A1A3A5A7 B1B3B5B7 C1C3C5C7 D1D3D5D7 | ... - // Stage 3: A0A1A2A3 A4A5A6A7 B0B1B2B3 B4B5B6B7 | C0C1C2C3 C4C5C6C7 D0D1D2D3 D4D5D6D7 | ... - // clang-format on + // Example with 32bit data on 128 bit register: + // + // 0: A0B0C0D0 A1B1C1D1 A2B2C2D2 A3B3C3D3 | A4B4C4D4 A5B5C5D5 A6B6C6D6 A7B7C7D7 | ... + // 1: A0A4B0B4 C0C4D0D4 A1A5B1B5 C1C5D1D5 | A2A6B2B6 C2C6D2D6 A3A7B3B7 C3C7D3D7 | ... + // 2: A0A2A4A6 B0B2B4B6 C0C2C4C6 D0D2D4D6 | A1A3A5A7 B1B3B5B7 C1C3C5C7 D1D3D5D7 | ... + // 3: A0A1A2A3 A4A5A6A7 B0B1B2B3 B4B5B6B7 | C0C1C2C3 C4C5C6C7 D0D1D2D3 D4D5D6D7 | ... // // The shuffling of bytes is performed through the unpack intrinsics. 
// In my measurements this gives better performance then an implementation @@ -241,11 +241,10 @@ void ByteStreamSplitEncodeSimd128(const uint8_t* raw_values, int width, // We know have the bytes packed in a larger data type and in the correct order to // start using a bigger data type // - // Example run for 32-bit variables it's int64_t with NumBytes=8 bytes: + // Example with 32bit data on 128 bit register. + // The large data type is int64_t with NumBytes=8 bytes: // - // clang-format off - // Stage 4: A0A1A2A3 A4A5A6A7 A8A9AAAB ACADAEAF | B0B1B2B3 B4B5B6B7 B8B9BABB BCBDBEBF | ... - // clang-format on + // 4: A0A1A2A3 A4A5A6A7 A8A9AAAB ACADAEAF | B0B1B2B3 B4B5B6B7 B8B9BABB BCBDBEBF | ... constexpr int kNumStreamsHalf = kNumStreams / 2; for (int step = NumStepsByte; step < NumSteps; ++step) { for (int i = 0; i < kNumStreamsHalf; ++i) { From df3cc7fdcd85abfb1f1c767c3bc8d1fd7134f6b4 Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Thu, 12 Jun 2025 14:00:07 +0200 Subject: [PATCH 08/19] Use kPascalCase for constants --- .../arrow/util/byte_stream_split_internal.h | 48 +++++++++---------- 1 file changed, 24 insertions(+), 24 deletions(-) diff --git a/cpp/src/arrow/util/byte_stream_split_internal.h b/cpp/src/arrow/util/byte_stream_split_internal.h index 151566bd932..c433e869783 100644 --- a/cpp/src/arrow/util/byte_stream_split_internal.h +++ b/cpp/src/arrow/util/byte_stream_split_internal.h @@ -109,7 +109,7 @@ void ByteStreamSplitDecodeSimd128(const uint8_t* data, int width, int64_t num_va } } -template +template struct grouped_bytes_impl; template <> @@ -133,32 +133,32 @@ struct grouped_bytes_impl<8> { }; // Map a number of bytes to a type -template -using grouped_bytes_t = typename grouped_bytes_impl::type; +template +using grouped_bytes_t = typename grouped_bytes_impl::type; // Like xsimd::zlip_lo, but zip groups of NBytes at once -template > +template > auto zip_lo_n(Batch const& a, Batch const& b) -> Batch { - if constexpr (NBytes == BatchSize) { + if constexpr 
(kNumBytes == kBatchSize) { return a; } else { return xsimd::bitwise_cast( - xsimd::zip_lo(xsimd::bitwise_cast>(a), - xsimd::bitwise_cast>(b))); + xsimd::zip_lo(xsimd::bitwise_cast>(a), + xsimd::bitwise_cast>(b))); } } // Like xsimd::zlip_hi, but zip groups of NBytes at once -template > +template > auto zip_hi_n(Batch const& a, Batch const& b) -> Batch { - if constexpr (NBytes == BatchSize) { + if constexpr (kNumBytes == kBatchSize) { return b; } else { return xsimd::bitwise_cast( - xsimd::zip_hi(xsimd::bitwise_cast>(a), - xsimd::bitwise_cast>(b))); + xsimd::zip_hi(xsimd::bitwise_cast>(a), + xsimd::bitwise_cast>(b))); } } @@ -188,19 +188,19 @@ void ByteStreamSplitEncodeSimd128(const uint8_t* raw_values, int width, } // Number of input values we can fit in a simd register - constexpr int NumValuesInBatch = sizeof(simd_batch) / kNumStreams; - static_assert(NumValuesInBatch > 0); + constexpr int kNumValuesInBatch = sizeof(simd_batch) / kNumStreams; + static_assert(kNumValuesInBatch > 0); // Number of bytes we'll bring together in the first byte-level part of the algorithm. 
// Since we zip with the next batch, the number of values in a batch determines how many // bytes end up together before we can use a larger type - constexpr int NumBytes = 2 * NumValuesInBatch; + constexpr int kNumBytes = 2 * kNumValuesInBatch; // Number of steps in the first part of the algorithm with byte-level zipping - constexpr int NumStepsByte = ReversePow2(NumValuesInBatch) + 1; + constexpr int kNumStepsByte = ReversePow2(kNumValuesInBatch) + 1; // Number of steps in the first part of the algorithm with large data type zipping - constexpr int NumStepsLarge = - ReversePow2(static_cast(sizeof(simd_batch)) / NumBytes); + constexpr int kNumStepsLarge = + ReversePow2(static_cast(sizeof(simd_batch)) / kNumBytes); // Total number of steps - constexpr int NumSteps = NumStepsByte + NumStepsLarge; + constexpr int NumSteps = kNumStepsByte + kNumStepsLarge; // Two step shuffling algorithm that starts with bytes and ends with a larger data type. // An algorithm similar to the decoding one with log2(sizeof(simd_batch)) + 1 stages is @@ -230,7 +230,7 @@ void ByteStreamSplitEncodeSimd128(const uint8_t* raw_values, int width, // // Loop order does not matter so we prefer higher locality for (int i = 0; i < kNumStreams / 2; ++i) { - for (int step = 0; step < NumStepsByte; ++step) { + for (int step = 0; step < kNumStepsByte; ++step) { stage[step + 1][i * 2] = xsimd::zip_lo(stage[step][i * 2], stage[step][i * 2 + 1]); stage[step + 1][i * 2 + 1] = @@ -246,12 +246,12 @@ void ByteStreamSplitEncodeSimd128(const uint8_t* raw_values, int width, // // 4: A0A1A2A3 A4A5A6A7 A8A9AAAB ACADAEAF | B0B1B2B3 B4B5B6B7 B8B9BABB BCBDBEBF | ... 
constexpr int kNumStreamsHalf = kNumStreams / 2; - for (int step = NumStepsByte; step < NumSteps; ++step) { + for (int step = kNumStepsByte; step < NumSteps; ++step) { for (int i = 0; i < kNumStreamsHalf; ++i) { stage[step + 1][i * 2] = - zip_lo_n(stage[step][i], stage[step][i + kNumStreamsHalf]); + zip_lo_n(stage[step][i], stage[step][i + kNumStreamsHalf]); stage[step + 1][i * 2 + 1] = - zip_hi_n(stage[step][i], stage[step][i + kNumStreamsHalf]); + zip_hi_n(stage[step][i], stage[step][i + kNumStreamsHalf]); } } From d4fb898947cc0d52884701cec8768be58f42f6ab Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Thu, 12 Jun 2025 14:08:14 +0200 Subject: [PATCH 09/19] Use static enabling of ByteStreamSplitEncodeSimd128<2> --- cpp/src/arrow/util/byte_stream_split_internal.h | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/util/byte_stream_split_internal.h b/cpp/src/arrow/util/byte_stream_split_internal.h index c433e869783..f4c35a6f616 100644 --- a/cpp/src/arrow/util/byte_stream_split_internal.h +++ b/cpp/src/arrow/util/byte_stream_split_internal.h @@ -431,6 +431,10 @@ template void inline ByteStreamSplitDecodeSimd(const uint8_t* data, int width, int64_t num_values, int64_t stride, uint8_t* out) { # if defined(ARROW_HAVE_AVX2) + // Not implemented + if constexpr (kNumStreams == 2) { + return ByteStreamSplitDecodeSimd128<2>(data, width, num_values, stride, out); + } return ByteStreamSplitDecodeAvx2(data, width, num_values, stride, out); # elif defined(ARROW_HAVE_SSE4_2) || defined(ARROW_HAVE_NEON) return ByteStreamSplitDecodeSimd128(data, width, num_values, stride, out); @@ -444,6 +448,11 @@ void inline ByteStreamSplitEncodeSimd(const uint8_t* raw_values, int width, const int64_t num_values, uint8_t* output_buffer_raw) { # if defined(ARROW_HAVE_AVX2) + // Not implemented + if constexpr (kNumStreams == 2) { + return ByteStreamSplitEncodeSimd128(raw_values, width, num_values, + output_buffer_raw); + } return 
ByteStreamSplitEncodeAvx2(raw_values, width, num_values, output_buffer_raw); # elif defined(ARROW_HAVE_SSE4_2) || defined(ARROW_HAVE_NEON) @@ -602,7 +611,7 @@ inline void ByteStreamSplitEncode(const uint8_t* raw_values, int width, memcpy(out, raw_values, num_values); return; case 2: - return ByteStreamSplitEncodeSimd128<2>(raw_values, width, num_values, out); + return ByteStreamSplitEncodePerhapsSimd<2>(raw_values, width, num_values, out); case 4: return ByteStreamSplitEncodePerhapsSimd<4>(raw_values, width, num_values, out); case 8: @@ -626,7 +635,7 @@ inline void ByteStreamSplitDecode(const uint8_t* data, int width, int64_t num_va memcpy(out, data, num_values); return; case 2: - return ByteStreamSplitDecodeSimd128<2>(data, width, num_values, stride, out); + return ByteStreamSplitDecodePerhapsSimd<2>(data, width, num_values, stride, out); case 4: return ByteStreamSplitDecodePerhapsSimd<4>(data, width, num_values, stride, out); case 8: From 2f37deb8f9e13d9bdb3c0f6409a3535e3f344752 Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Thu, 12 Jun 2025 14:16:39 +0200 Subject: [PATCH 10/19] Safer computation of simd batch size --- .../arrow/util/byte_stream_split_internal.h | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/cpp/src/arrow/util/byte_stream_split_internal.h b/cpp/src/arrow/util/byte_stream_split_internal.h index f4c35a6f616..71f51c5dabc 100644 --- a/cpp/src/arrow/util/byte_stream_split_internal.h +++ b/cpp/src/arrow/util/byte_stream_split_internal.h @@ -62,7 +62,7 @@ void ByteStreamSplitDecodeSimd128(const uint8_t* data, int width, int64_t num_va assert(width == kNumStreams); constexpr int kNumStreamsLog2 = ReversePow2(kNumStreams); static_assert(kNumStreamsLog2 != 0); - constexpr int64_t kBlockSize = sizeof(simd_batch) * kNumStreams; + constexpr int64_t kBlockSize = simd_batch::size * kNumStreams; const int64_t size = num_values * kNumStreams; const int64_t num_blocks = size / kBlockSize; @@ -90,8 +90,7 @@ void 
ByteStreamSplitDecodeSimd128(const uint8_t* data, int width, int64_t num_va for (int64_t i = 0; i < num_blocks; ++i) { for (int j = 0; j < kNumStreams; ++j) { - stage[0][j] = - simd_batch::load_unaligned(&data[i * sizeof(simd_batch) + j * stride]); + stage[0][j] = simd_batch::load_unaligned(&data[i * simd_batch::size + j * stride]); } for (int step = 0; step < kNumStreamsLog2; ++step) { for (int j = 0; j < kNumStreamsHalf; ++j) { @@ -103,7 +102,7 @@ void ByteStreamSplitDecodeSimd128(const uint8_t* data, int width, int64_t num_va } for (int j = 0; j < kNumStreams; ++j) { xsimd::store_unaligned( - reinterpret_cast(out + (i * kNumStreams + j) * sizeof(simd_batch)), + reinterpret_cast(out + (i * kNumStreams + j) * simd_batch::size), stage[kNumStreamsLog2][j]); } } @@ -168,7 +167,7 @@ void ByteStreamSplitEncodeSimd128(const uint8_t* raw_values, int width, using simd_batch = xsimd::make_sized_batch_t; assert(width == kNumStreams); - constexpr int kBlockSize = sizeof(simd_batch) * kNumStreams; + constexpr int kBlockSize = simd_batch::size * kNumStreams; const int64_t size = num_values * kNumStreams; const int64_t num_blocks = size / kBlockSize; @@ -188,7 +187,7 @@ void ByteStreamSplitEncodeSimd128(const uint8_t* raw_values, int width, } // Number of input values we can fit in a simd register - constexpr int kNumValuesInBatch = sizeof(simd_batch) / kNumStreams; + constexpr int kNumValuesInBatch = simd_batch::size / kNumStreams; static_assert(kNumValuesInBatch > 0); // Number of bytes we'll bring together in the first byte-level part of the algorithm. 
// Since we zip with the next batch, the number of values in a batch determines how many @@ -198,12 +197,12 @@ void ByteStreamSplitEncodeSimd128(const uint8_t* raw_values, int width, constexpr int kNumStepsByte = ReversePow2(kNumValuesInBatch) + 1; // Number of steps in the first part of the algorithm with large data type zipping constexpr int kNumStepsLarge = - ReversePow2(static_cast(sizeof(simd_batch)) / kNumBytes); + ReversePow2(static_cast(simd_batch::size) / kNumBytes); // Total number of steps constexpr int NumSteps = kNumStepsByte + kNumStepsLarge; // Two step shuffling algorithm that starts with bytes and ends with a larger data type. - // An algorithm similar to the decoding one with log2(sizeof(simd_batch)) + 1 stages is + // An algorithm similar to the decoding one with log2(simd_batch::size) + 1 stages is // also valid but not as performant. for (int64_t block_index = 0; block_index < num_blocks; ++block_index) { simd_batch stage[NumSteps + 1][kNumStreams]; @@ -211,7 +210,7 @@ void ByteStreamSplitEncodeSimd128(const uint8_t* raw_values, int width, // First copy the data to stage 0. 
for (int i = 0; i < kNumStreams; ++i) { stage[0][i] = simd_batch::load_unaligned( - &raw_values[(block_index * kNumStreams + i) * sizeof(simd_batch)]); + &raw_values[(block_index * kNumStreams + i) * simd_batch::size]); } // We first make byte-level shuffling, until we have gather enough bytes together @@ -257,7 +256,7 @@ void ByteStreamSplitEncodeSimd128(const uint8_t* raw_values, int width, // Save the encoded data to the output buffer for (int i = 0; i < kNumStreams; ++i) { - xsimd::store_unaligned(&output_buffer_streams[i][block_index * sizeof(simd_batch)], + xsimd::store_unaligned(&output_buffer_streams[i][block_index * simd_batch::size], stage[NumSteps][i]); } } From a7ff16a2c85df3d24b82cc36ab52a97315334a72 Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Thu, 12 Jun 2025 14:18:57 +0200 Subject: [PATCH 11/19] Fix failed compilation --- cpp/src/arrow/util/byte_stream_split_internal.h | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/cpp/src/arrow/util/byte_stream_split_internal.h b/cpp/src/arrow/util/byte_stream_split_internal.h index 71f51c5dabc..023f1e1c7cf 100644 --- a/cpp/src/arrow/util/byte_stream_split_internal.h +++ b/cpp/src/arrow/util/byte_stream_split_internal.h @@ -433,8 +433,9 @@ void inline ByteStreamSplitDecodeSimd(const uint8_t* data, int width, int64_t nu // Not implemented if constexpr (kNumStreams == 2) { return ByteStreamSplitDecodeSimd128<2>(data, width, num_values, stride, out); + } else { + return ByteStreamSplitDecodeAvx2(data, width, num_values, stride, out); } - return ByteStreamSplitDecodeAvx2(data, width, num_values, stride, out); # elif defined(ARROW_HAVE_SSE4_2) || defined(ARROW_HAVE_NEON) return ByteStreamSplitDecodeSimd128(data, width, num_values, stride, out); # else @@ -451,9 +452,10 @@ void inline ByteStreamSplitEncodeSimd(const uint8_t* raw_values, int width, if constexpr (kNumStreams == 2) { return ByteStreamSplitEncodeSimd128(raw_values, width, num_values, output_buffer_raw); + } else { + return 
ByteStreamSplitEncodeAvx2(raw_values, width, num_values, + output_buffer_raw); } - return ByteStreamSplitEncodeAvx2(raw_values, width, num_values, - output_buffer_raw); # elif defined(ARROW_HAVE_SSE4_2) || defined(ARROW_HAVE_NEON) return ByteStreamSplitEncodeSimd128(raw_values, width, num_values, output_buffer_raw); From 83ebd5caf439d08a572c6b235ca92a8717e57c66 Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Fri, 13 Jun 2025 11:11:58 +0200 Subject: [PATCH 12/19] Small Encode improvement --- cpp/src/arrow/util/byte_stream_split_internal.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/arrow/util/byte_stream_split_internal.h b/cpp/src/arrow/util/byte_stream_split_internal.h index 023f1e1c7cf..5223c9f2a95 100644 --- a/cpp/src/arrow/util/byte_stream_split_internal.h +++ b/cpp/src/arrow/util/byte_stream_split_internal.h @@ -85,10 +85,10 @@ void ByteStreamSplitDecodeSimd128(const uint8_t* data, int width, int64_t num_va // Stage 1: AAAA BBBB CCCC DDDD // Stage 2: ACAC ACAC BDBD BDBD // Stage 3: ABCD ABCD ABCD ABCD - simd_batch stage[kNumStreamsLog2 + 1][kNumStreams]; constexpr int kNumStreamsHalf = kNumStreams / 2U; for (int64_t i = 0; i < num_blocks; ++i) { + simd_batch stage[kNumStreamsLog2 + 1][kNumStreams]; for (int j = 0; j < kNumStreams; ++j) { stage[0][j] = simd_batch::load_unaligned(&data[i * simd_batch::size + j * stride]); } From 59ddd69f6e776e8aed3dad7a5f1f189b8be1fae5 Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Mon, 16 Jun 2025 11:58:23 +0200 Subject: [PATCH 13/19] Add int16_t benchmarks --- cpp/src/parquet/encoding_benchmark.cc | 37 +++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/cpp/src/parquet/encoding_benchmark.cc b/cpp/src/parquet/encoding_benchmark.cc index 28cfcd98a3c..6392f533785 100644 --- a/cpp/src/parquet/encoding_benchmark.cc +++ b/cpp/src/parquet/encoding_benchmark.cc @@ -19,6 +19,7 @@ #include #include +#include #include #include @@ -508,6 +509,11 @@ 
BENCHMARK(BM_ByteStreamSplitEncode_Float_Scalar)->Apply(ByteStreamSplitApply); BENCHMARK(BM_ByteStreamSplitEncode_Double_Scalar)->Apply(ByteStreamSplitApply); #if defined(ARROW_HAVE_SSE4_2) +static void BM_ByteStreamSplitDecode_Int16_Sse2(benchmark::State& state) { + BM_ByteStreamSplitDecode( + state, ::arrow::util::internal::ByteStreamSplitDecodeSimd128); +} + static void BM_ByteStreamSplitDecode_Float_Sse2(benchmark::State& state) { BM_ByteStreamSplitDecode( state, ::arrow::util::internal::ByteStreamSplitDecodeSimd128); @@ -518,6 +524,11 @@ static void BM_ByteStreamSplitDecode_Double_Sse2(benchmark::State& state) { state, ::arrow::util::internal::ByteStreamSplitDecodeSimd128); } +static void BM_ByteStreamSplitEncode_Int16_Sse2(benchmark::State& state) { + BM_ByteStreamSplitEncode( + state, ::arrow::util::internal::ByteStreamSplitEncodeSimd128); +} + static void BM_ByteStreamSplitEncode_Float_Sse2(benchmark::State& state) { BM_ByteStreamSplitEncode( state, ::arrow::util::internal::ByteStreamSplitEncodeSimd128); @@ -528,13 +539,20 @@ static void BM_ByteStreamSplitEncode_Double_Sse2(benchmark::State& state) { state, ::arrow::util::internal::ByteStreamSplitEncodeSimd128); } +BENCHMARK(BM_ByteStreamSplitDecode_Int16_Sse2)->Apply(ByteStreamSplitApply); BENCHMARK(BM_ByteStreamSplitDecode_Float_Sse2)->Apply(ByteStreamSplitApply); BENCHMARK(BM_ByteStreamSplitDecode_Double_Sse2)->Apply(ByteStreamSplitApply); +BENCHMARK(BM_ByteStreamSplitEncode_Int16_Sse2)->Apply(ByteStreamSplitApply); BENCHMARK(BM_ByteStreamSplitEncode_Float_Sse2)->Apply(ByteStreamSplitApply); BENCHMARK(BM_ByteStreamSplitEncode_Double_Sse2)->Apply(ByteStreamSplitApply); #endif #if defined(ARROW_HAVE_AVX2) +static void BM_ByteStreamSplitDecode_Int16_Avx2(benchmark::State& state) { + BM_ByteStreamSplitDecode( + state, ::arrow::util::internal::ByteStreamSplitDecodeAvx2); +} + static void BM_ByteStreamSplitDecode_Float_Avx2(benchmark::State& state) { BM_ByteStreamSplitDecode( state, 
::arrow::util::internal::ByteStreamSplitDecodeAvx2); @@ -545,6 +563,11 @@ static void BM_ByteStreamSplitDecode_Double_Avx2(benchmark::State& state) { state, ::arrow::util::internal::ByteStreamSplitDecodeAvx2); } +static void BM_ByteStreamSplitEncode_Int16_Avx2(benchmark::State& state) { + BM_ByteStreamSplitEncode( + state, ::arrow::util::internal::ByteStreamSplitEncodeAvx2); +} + static void BM_ByteStreamSplitEncode_Float_Avx2(benchmark::State& state) { BM_ByteStreamSplitEncode( state, ::arrow::util::internal::ByteStreamSplitEncodeAvx2); @@ -555,13 +578,20 @@ static void BM_ByteStreamSplitEncode_Double_Avx2(benchmark::State& state) { state, ::arrow::util::internal::ByteStreamSplitEncodeAvx2); } +BENCHMARK(BM_ByteStreamSplitDecode_Int16_Avx2)->Apply(ByteStreamSplitApply); BENCHMARK(BM_ByteStreamSplitDecode_Float_Avx2)->Apply(ByteStreamSplitApply); BENCHMARK(BM_ByteStreamSplitDecode_Double_Avx2)->Apply(ByteStreamSplitApply); +BENCHMARK(BM_ByteStreamSplitEncode_Int16_Avx2)->Apply(ByteStreamSplitApply); BENCHMARK(BM_ByteStreamSplitEncode_Float_Avx2)->Apply(ByteStreamSplitApply); BENCHMARK(BM_ByteStreamSplitEncode_Double_Avx2)->Apply(ByteStreamSplitApply); #endif #if defined(ARROW_HAVE_NEON) +static void BM_ByteStreamSplitDecode_Int16_Neon(benchmark::State& state) { + BM_ByteStreamSplitDecode( + state, ::arrow::util::internal::ByteStreamSplitDecodeSimd128); +} + static void BM_ByteStreamSplitDecode_Float_Neon(benchmark::State& state) { BM_ByteStreamSplitDecode( state, ::arrow::util::internal::ByteStreamSplitDecodeSimd128); @@ -572,6 +602,11 @@ static void BM_ByteStreamSplitDecode_Double_Neon(benchmark::State& state) { state, ::arrow::util::internal::ByteStreamSplitDecodeSimd128); } +static void BM_ByteStreamSplitEncode_Int16_Neon(benchmark::State& state) { + BM_ByteStreamSplitEncode( + state, ::arrow::util::internal::ByteStreamSplitEncodeSimd128); +} + static void BM_ByteStreamSplitEncode_Float_Neon(benchmark::State& state) { BM_ByteStreamSplitEncode( state, 
::arrow::util::internal::ByteStreamSplitEncodeSimd128); @@ -582,8 +617,10 @@ static void BM_ByteStreamSplitEncode_Double_Neon(benchmark::State& state) { state, ::arrow::util::internal::ByteStreamSplitEncodeSimd128); } +BENCHMARK(BM_ByteStreamSplitDecode_Int16_Neon)->Range(MIN_RANGE, MAX_RANGE); BENCHMARK(BM_ByteStreamSplitDecode_Float_Neon)->Range(MIN_RANGE, MAX_RANGE); BENCHMARK(BM_ByteStreamSplitDecode_Double_Neon)->Range(MIN_RANGE, MAX_RANGE); +BENCHMARK(BM_ByteStreamSplitEncode_Int16_Neon)->Range(MIN_RANGE, MAX_RANGE); BENCHMARK(BM_ByteStreamSplitEncode_Float_Neon)->Range(MIN_RANGE, MAX_RANGE); BENCHMARK(BM_ByteStreamSplitEncode_Double_Neon)->Range(MIN_RANGE, MAX_RANGE); #endif From c8765deb010f288418900c5ffb5aea51740e080c Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Thu, 19 Jun 2025 10:28:27 +0200 Subject: [PATCH 14/19] Fix int16_t byte stream split benchmarks --- cpp/src/parquet/encoding_benchmark.cc | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/cpp/src/parquet/encoding_benchmark.cc b/cpp/src/parquet/encoding_benchmark.cc index 6392f533785..1ac8ce05701 100644 --- a/cpp/src/parquet/encoding_benchmark.cc +++ b/cpp/src/parquet/encoding_benchmark.cc @@ -510,7 +510,7 @@ BENCHMARK(BM_ByteStreamSplitEncode_Double_Scalar)->Apply(ByteStreamSplitApply); #if defined(ARROW_HAVE_SSE4_2) static void BM_ByteStreamSplitDecode_Int16_Sse2(benchmark::State& state) { - BM_ByteStreamSplitDecode( + BM_ByteStreamSplitDecode( state, ::arrow::util::internal::ByteStreamSplitDecodeSimd128); } @@ -525,7 +525,7 @@ static void BM_ByteStreamSplitDecode_Double_Sse2(benchmark::State& state) { } static void BM_ByteStreamSplitEncode_Int16_Sse2(benchmark::State& state) { - BM_ByteStreamSplitEncode( + BM_ByteStreamSplitEncode( state, ::arrow::util::internal::ByteStreamSplitEncodeSimd128); } @@ -549,7 +549,7 @@ BENCHMARK(BM_ByteStreamSplitEncode_Double_Sse2)->Apply(ByteStreamSplitApply); #if defined(ARROW_HAVE_AVX2) static void 
BM_ByteStreamSplitDecode_Int16_Avx2(benchmark::State& state) { - BM_ByteStreamSplitDecode( + BM_ByteStreamSplitDecode( state, ::arrow::util::internal::ByteStreamSplitDecodeAvx2); } @@ -564,7 +564,7 @@ static void BM_ByteStreamSplitDecode_Double_Avx2(benchmark::State& state) { } static void BM_ByteStreamSplitEncode_Int16_Avx2(benchmark::State& state) { - BM_ByteStreamSplitEncode( + BM_ByteStreamSplitEncode( state, ::arrow::util::internal::ByteStreamSplitEncodeAvx2); } From 71f0dc0d4b3ca816af7a36e70aa0e96ac4a866ed Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Thu, 19 Jun 2025 14:02:55 +0200 Subject: [PATCH 15/19] Remove misleading benchmarks --- .../arrow/util/byte_stream_split_internal.h | 8 ++------ cpp/src/parquet/encoding_benchmark.cc | 18 ------------------ 2 files changed, 2 insertions(+), 24 deletions(-) diff --git a/cpp/src/arrow/util/byte_stream_split_internal.h b/cpp/src/arrow/util/byte_stream_split_internal.h index 5223c9f2a95..39f44276887 100644 --- a/cpp/src/arrow/util/byte_stream_split_internal.h +++ b/cpp/src/arrow/util/byte_stream_split_internal.h @@ -355,13 +355,9 @@ template void ByteStreamSplitEncodeAvx2(const uint8_t* raw_values, int width, const int64_t num_values, uint8_t* output_buffer_raw) { assert(width == kNumStreams); - static_assert(kNumStreams == 4 || kNumStreams == 8, "Invalid number of streams."); + static_assert(kNumStreams == 4, "Invalid number of streams."); constexpr int kBlockSize = sizeof(__m256i) * kNumStreams; - if constexpr (kNumStreams == 8) // Back to SSE, currently no path for double. 
- return ByteStreamSplitEncodeSimd128(raw_values, width, num_values, - output_buffer_raw); - const int64_t size = num_values * kNumStreams; if (size < kBlockSize) // Back to SSE for small size return ByteStreamSplitEncodeSimd128(raw_values, width, num_values, @@ -449,7 +445,7 @@ void inline ByteStreamSplitEncodeSimd(const uint8_t* raw_values, int width, uint8_t* output_buffer_raw) { # if defined(ARROW_HAVE_AVX2) // Not implemented - if constexpr (kNumStreams == 2) { + if constexpr (kNumStreams == 2 || kNumStreams == 8) { return ByteStreamSplitEncodeSimd128(raw_values, width, num_values, output_buffer_raw); } else { diff --git a/cpp/src/parquet/encoding_benchmark.cc b/cpp/src/parquet/encoding_benchmark.cc index 1ac8ce05701..aee30bf7623 100644 --- a/cpp/src/parquet/encoding_benchmark.cc +++ b/cpp/src/parquet/encoding_benchmark.cc @@ -548,11 +548,6 @@ BENCHMARK(BM_ByteStreamSplitEncode_Double_Sse2)->Apply(ByteStreamSplitApply); #endif #if defined(ARROW_HAVE_AVX2) -static void BM_ByteStreamSplitDecode_Int16_Avx2(benchmark::State& state) { - BM_ByteStreamSplitDecode( - state, ::arrow::util::internal::ByteStreamSplitDecodeAvx2); -} - static void BM_ByteStreamSplitDecode_Float_Avx2(benchmark::State& state) { BM_ByteStreamSplitDecode( state, ::arrow::util::internal::ByteStreamSplitDecodeAvx2); @@ -563,27 +558,14 @@ static void BM_ByteStreamSplitDecode_Double_Avx2(benchmark::State& state) { state, ::arrow::util::internal::ByteStreamSplitDecodeAvx2); } -static void BM_ByteStreamSplitEncode_Int16_Avx2(benchmark::State& state) { - BM_ByteStreamSplitEncode( - state, ::arrow::util::internal::ByteStreamSplitEncodeAvx2); -} - static void BM_ByteStreamSplitEncode_Float_Avx2(benchmark::State& state) { BM_ByteStreamSplitEncode( state, ::arrow::util::internal::ByteStreamSplitEncodeAvx2); } -static void BM_ByteStreamSplitEncode_Double_Avx2(benchmark::State& state) { - BM_ByteStreamSplitEncode( - state, ::arrow::util::internal::ByteStreamSplitEncodeAvx2); -} - 
-BENCHMARK(BM_ByteStreamSplitDecode_Int16_Avx2)->Apply(ByteStreamSplitApply); BENCHMARK(BM_ByteStreamSplitDecode_Float_Avx2)->Apply(ByteStreamSplitApply); BENCHMARK(BM_ByteStreamSplitDecode_Double_Avx2)->Apply(ByteStreamSplitApply); -BENCHMARK(BM_ByteStreamSplitEncode_Int16_Avx2)->Apply(ByteStreamSplitApply); BENCHMARK(BM_ByteStreamSplitEncode_Float_Avx2)->Apply(ByteStreamSplitApply); -BENCHMARK(BM_ByteStreamSplitEncode_Double_Avx2)->Apply(ByteStreamSplitApply); #endif #if defined(ARROW_HAVE_NEON) From a40523313d515461357f0aec43190770a27e82c7 Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Thu, 19 Jun 2025 17:15:55 +0200 Subject: [PATCH 16/19] Fix casing --- cpp/src/arrow/util/byte_stream_split_internal.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/cpp/src/arrow/util/byte_stream_split_internal.h b/cpp/src/arrow/util/byte_stream_split_internal.h index 39f44276887..63729501cdd 100644 --- a/cpp/src/arrow/util/byte_stream_split_internal.h +++ b/cpp/src/arrow/util/byte_stream_split_internal.h @@ -199,13 +199,13 @@ void ByteStreamSplitEncodeSimd128(const uint8_t* raw_values, int width, constexpr int kNumStepsLarge = ReversePow2(static_cast(simd_batch::size) / kNumBytes); // Total number of steps - constexpr int NumSteps = kNumStepsByte + kNumStepsLarge; + constexpr int kNumSteps = kNumStepsByte + kNumStepsLarge; // Two step shuffling algorithm that starts with bytes and ends with a larger data type. // An algorithm similar to the decoding one with log2(simd_batch::size) + 1 stages is // also valid but not as performant. for (int64_t block_index = 0; block_index < num_blocks; ++block_index) { - simd_batch stage[NumSteps + 1][kNumStreams]; + simd_batch stage[kNumSteps + 1][kNumStreams]; // First copy the data to stage 0. for (int i = 0; i < kNumStreams; ++i) { @@ -245,7 +245,7 @@ void ByteStreamSplitEncodeSimd128(const uint8_t* raw_values, int width, // // 4: A0A1A2A3 A4A5A6A7 A8A9AAAB ACADAEAF | B0B1B2B3 B4B5B6B7 B8B9BABB BCBDBEBF | ... 
constexpr int kNumStreamsHalf = kNumStreams / 2; - for (int step = kNumStepsByte; step < NumSteps; ++step) { + for (int step = kNumStepsByte; step < kNumSteps; ++step) { for (int i = 0; i < kNumStreamsHalf; ++i) { stage[step + 1][i * 2] = zip_lo_n(stage[step][i], stage[step][i + kNumStreamsHalf]); @@ -257,7 +257,7 @@ void ByteStreamSplitEncodeSimd128(const uint8_t* raw_values, int width, // Save the encoded data to the output buffer for (int i = 0; i < kNumStreams; ++i) { xsimd::store_unaligned(&output_buffer_streams[i][block_index * simd_batch::size], - stage[NumSteps][i]); + stage[kNumSteps][i]); } } } From 9a87321835ff7df54dbfd4b10a6cc8797999be6f Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Mon, 23 Jun 2025 17:38:12 +0200 Subject: [PATCH 17/19] Review comments --- .../arrow/util/byte_stream_split_internal.h | 20 ++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/cpp/src/arrow/util/byte_stream_split_internal.h b/cpp/src/arrow/util/byte_stream_split_internal.h index 63729501cdd..d7f05e44a16 100644 --- a/cpp/src/arrow/util/byte_stream_split_internal.h +++ b/cpp/src/arrow/util/byte_stream_split_internal.h @@ -59,9 +59,13 @@ void ByteStreamSplitDecodeSimd128(const uint8_t* data, int width, int64_t num_va int64_t stride, uint8_t* out) { using simd_batch = xsimd::make_sized_batch_t; + static_assert(kNumStreams <= simd_batch::size, + "The algorithm works when the number of streams is smaller than the SIMD " + "batch size."); assert(width == kNumStreams); constexpr int kNumStreamsLog2 = ReversePow2(kNumStreams); - static_assert(kNumStreamsLog2 != 0); + static_assert(kNumStreamsLog2 != 0, + "The algorithm works for a number of streams being a power of two."); constexpr int64_t kBlockSize = simd_batch::size * kNumStreams; const int64_t size = num_values * kNumStreams; @@ -135,7 +139,7 @@ struct grouped_bytes_impl<8> { template using grouped_bytes_t = typename grouped_bytes_impl::type; -// Like xsimd::zlip_lo, but zip groups of NBytes at 
once +// Like xsimd::zip_lo, but zip groups of kNumBytes at once. template > auto zip_lo_n(Batch const& a, Batch const& b) -> Batch { @@ -148,7 +152,7 @@ auto zip_lo_n(Batch const& a, Batch const& b) -> Batch { } } -// Like xsimd::zlip_hi, but zip groups of NBytes at once +// Like xsimd::zip_hi, but zip groups of kNumBytes at once. template > auto zip_hi_n(Batch const& a, Batch const& b) -> Batch { @@ -167,7 +171,12 @@ void ByteStreamSplitEncodeSimd128(const uint8_t* raw_values, int width, using simd_batch = xsimd::make_sized_batch_t; assert(width == kNumStreams); + static_assert(kNumStreams <= simd_batch::size, + "The algorithm works when the number of streams is smaller than the SIMD " + "batch size."); constexpr int kBlockSize = simd_batch::size * kNumStreams; + static_assert(ReversePow2(kNumStreams) != 0, + "The algorithm works for a number of streams being a power of two."); const int64_t size = num_values * kNumStreams; const int64_t num_blocks = size / kBlockSize; @@ -200,6 +209,7 @@ void ByteStreamSplitEncodeSimd128(const uint8_t* raw_values, int width, ReversePow2(static_cast(simd_batch::size) / kNumBytes); // Total number of steps constexpr int kNumSteps = kNumStepsByte + kNumStepsLarge; + static_assert(kNumSteps == ReversePow2(simd_batch::size)); // Two step shuffling algorithm that starts with bytes and ends with a larger data type. // An algorithm similar to the decoding one with log2(simd_batch::size) + 1 stages is @@ -228,7 +238,8 @@ void ByteStreamSplitEncodeSimd128(const uint8_t* raw_values, int width, // which uses the shuffle intrinsics. 
// // Loop order does not matter so we prefer higher locality - for (int i = 0; i < kNumStreams / 2; ++i) { + constexpr int kNumStreamsHalf = kNumStreams / 2; + for (int i = 0; i < kNumStreamsHalf; ++i) { for (int step = 0; step < kNumStepsByte; ++step) { stage[step + 1][i * 2] = xsimd::zip_lo(stage[step][i * 2], stage[step][i * 2 + 1]); @@ -244,7 +255,6 @@ void ByteStreamSplitEncodeSimd128(const uint8_t* raw_values, int width, // The large data type is int64_t with NumBytes=8 bytes: // // 4: A0A1A2A3 A4A5A6A7 A8A9AAAB ACADAEAF | B0B1B2B3 B4B5B6B7 B8B9BABB BCBDBEBF | ... - constexpr int kNumStreamsHalf = kNumStreams / 2; for (int step = kNumStepsByte; step < kNumSteps; ++step) { for (int i = 0; i < kNumStreamsHalf; ++i) { stage[step + 1][i * 2] = From 754f3ab246f62aea6fcf74c92b00b4dff3d7b516 Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Mon, 23 Jun 2025 17:48:22 +0200 Subject: [PATCH 18/19] Fix and extend tests --- cpp/src/arrow/util/byte_stream_split_test.cc | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/cpp/src/arrow/util/byte_stream_split_test.cc b/cpp/src/arrow/util/byte_stream_split_test.cc index 9755cd8b8d0..ec995a0a9aa 100644 --- a/cpp/src/arrow/util/byte_stream_split_test.cc +++ b/cpp/src/arrow/util/byte_stream_split_test.cc @@ -136,7 +136,7 @@ class TestByteStreamSplitSpecialized : public ::testing::Test { return input; } - template + template static std::vector MakeDecodeFuncs() { std::vector funcs; funcs.push_back({"scalar_dynamic", &ByteStreamSplitDecodeScalarDynamic}); @@ -146,7 +146,10 @@ class TestByteStreamSplitSpecialized : public ::testing::Test { funcs.push_back({"simd", &ByteStreamSplitDecodeSimd}); funcs.push_back({"simd128", &ByteStreamSplitDecodeSimd128}); # if defined(ARROW_HAVE_AVX2) - funcs.push_back({"avx2", &ByteStreamSplitDecodeAvx2}); + // The only available implementations + if constexpr (kWidth == 4 || kWidth == 8) { + funcs.push_back({"avx2", &ByteStreamSplitDecodeAvx2}); + } # endif } #endif // 
defined(ARROW_HAVE_SIMD_SPLIT) @@ -164,7 +167,10 @@ class TestByteStreamSplitSpecialized : public ::testing::Test { funcs.push_back({"simd", &ByteStreamSplitEncodeSimd}); funcs.push_back({"simd128", &ByteStreamSplitEncodeSimd128}); # if defined(ARROW_HAVE_AVX2) - funcs.push_back({"avx2", &ByteStreamSplitEncodeAvx2}); + // The only available implementation + if constexpr (kWidth == 4) { + funcs.push_back({"avx2", &ByteStreamSplitEncodeAvx2}); + } # endif } #endif // defined(ARROW_HAVE_SIMD_SPLIT) From 3ea4dd3de5bd4618af53e73fb37139118101c69b Mon Sep 17 00:00:00 2001 From: AntoinePrv Date: Tue, 24 Jun 2025 09:18:11 +0200 Subject: [PATCH 19/19] Move and rename SizedInt to type_traits --- .../arrow/util/byte_stream_split_internal.h | 40 +++++-------------- cpp/src/arrow/util/type_traits.h | 27 +++++++++++++ 2 files changed, 36 insertions(+), 31 deletions(-) diff --git a/cpp/src/arrow/util/byte_stream_split_internal.h b/cpp/src/arrow/util/byte_stream_split_internal.h index d7f05e44a16..2eb678cbfbe 100644 --- a/cpp/src/arrow/util/byte_stream_split_internal.h +++ b/cpp/src/arrow/util/byte_stream_split_internal.h @@ -20,6 +20,7 @@ #include "arrow/util/endian.h" #include "arrow/util/simd.h" #include "arrow/util/small_vector.h" +#include "arrow/util/type_traits.h" #include "arrow/util/ubsan.h" #include @@ -112,43 +113,18 @@ void ByteStreamSplitDecodeSimd128(const uint8_t* data, int width, int64_t num_va } } -template -struct grouped_bytes_impl; - -template <> -struct grouped_bytes_impl<1> { - using type = int8_t; -}; - -template <> -struct grouped_bytes_impl<2> { - using type = int16_t; -}; - -template <> -struct grouped_bytes_impl<4> { - using type = int32_t; -}; - -template <> -struct grouped_bytes_impl<8> { - using type = int64_t; -}; - -// Map a number of bytes to a type -template -using grouped_bytes_t = typename grouped_bytes_impl::type; - // Like xsimd::zip_lo, but zip groups of kNumBytes at once. 
template > auto zip_lo_n(Batch const& a, Batch const& b) -> Batch { + using arrow::internal::SizedInt; + if constexpr (kNumBytes == kBatchSize) { return a; } else { return xsimd::bitwise_cast( - xsimd::zip_lo(xsimd::bitwise_cast>(a), - xsimd::bitwise_cast>(b))); + xsimd::zip_lo(xsimd::bitwise_cast>(a), + xsimd::bitwise_cast>(b))); } } @@ -156,12 +132,14 @@ auto zip_lo_n(Batch const& a, Batch const& b) -> Batch { template > auto zip_hi_n(Batch const& a, Batch const& b) -> Batch { + using arrow::internal::SizedInt; + if constexpr (kNumBytes == kBatchSize) { return b; } else { return xsimd::bitwise_cast( - xsimd::zip_hi(xsimd::bitwise_cast>(a), - xsimd::bitwise_cast>(b))); + xsimd::zip_hi(xsimd::bitwise_cast>(a), + xsimd::bitwise_cast>(b))); } } diff --git a/cpp/src/arrow/util/type_traits.h b/cpp/src/arrow/util/type_traits.h index c1906152423..9c3b388dab2 100644 --- a/cpp/src/arrow/util/type_traits.h +++ b/cpp/src/arrow/util/type_traits.h @@ -42,5 +42,32 @@ template struct is_null_pointer : std::is_same::type> { }; +template +struct SizedIntImpl; + +template <> +struct SizedIntImpl<1> { + using type = int8_t; +}; + +template <> +struct SizedIntImpl<2> { + using type = int16_t; +}; + +template <> +struct SizedIntImpl<4> { + using type = int32_t; +}; + +template <> +struct SizedIntImpl<8> { + using type = int64_t; +}; + +// Map a number of bytes to a type +template +using SizedInt = typename SizedIntImpl::type; + } // namespace internal } // namespace arrow