From 6ed85e631b67591ed87ba6421dd8328530190f38 Mon Sep 17 00:00:00 2001
From: Davide Pasetto <dpasetto69@gmail.com>
Date: Tue, 7 Mar 2023 14:57:31 -0500
Subject: [PATCH 01/11] Moved code from compute/exec to compute to reduce
 entanglement with acero

---
 cpp/src/arrow/CMakeLists.txt                  |   4 +-
 cpp/src/arrow/compute/CMakeLists.txt          |   3 +-
 cpp/src/arrow/compute/exec/CMakeLists.txt     |   3 +-
 cpp/src/arrow/compute/exec/asof_join_node.cc  |   2 +-
 .../arrow/compute/exec/bloom_filter_test.cc   |   2 +-
 cpp/src/arrow/compute/exec/hash_join_node.cc  |   2 +-
 cpp/src/arrow/compute/exec/swiss_join.cc      |   2 +-
 cpp/src/arrow/compute/exec/util.h             | 172 +++++------
 .../arrow/compute/kernels/hash_aggregate.cc   |   2 +-
 cpp/src/arrow/compute/{exec => }/key_hash.cc  |   2 +-
 cpp/src/arrow/compute/{exec => }/key_hash.h   |   0
 .../arrow/compute/{exec => }/key_hash_avx2.cc |   2 +-
 .../arrow/compute/{exec => }/key_hash_test.cc |   2 +-
 cpp/src/arrow/compute/light_array.h           |   3 +-
 cpp/src/arrow/compute/light_array_test.cc     | 284 +++++++++---------
 cpp/src/arrow/compute/row/grouper.cc          |   2 +-
 cpp/src/arrow/compute/util.h                  | 131 ++++++++
 17 files changed, 378 insertions(+), 240 deletions(-)
 rename cpp/src/arrow/compute/{exec => }/key_hash.cc (99%)
 rename cpp/src/arrow/compute/{exec => }/key_hash.h (100%)
 rename cpp/src/arrow/compute/{exec => }/key_hash_avx2.cc (99%)
 rename cpp/src/arrow/compute/{exec => }/key_hash_test.cc (99%)
 create mode 100644 cpp/src/arrow/compute/util.h

diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt
index 2a4748d0a40..bf406ca29fb 100644
--- a/cpp/src/arrow/CMakeLists.txt
+++ b/cpp/src/arrow/CMakeLists.txt
@@ -401,7 +401,7 @@ list(APPEND
      compute/exec/hash_join.cc
      compute/exec/hash_join_dict.cc
      compute/exec/hash_join_node.cc
-     compute/exec/key_hash.cc
+        compute/key_hash.cc
      compute/exec/key_map.cc
      compute/exec/map_node.cc
      compute/exec/options.cc
@@ -442,7 +442,7 @@ list(APPEND
      compute/row/row_internal.cc)
 
 append_avx2_src(compute/exec/bloom_filter_avx2.cc)
-append_avx2_src(compute/exec/key_hash_avx2.cc)
+append_avx2_src(compute/key_hash_avx2.cc)
 append_avx2_src(compute/exec/key_map_avx2.cc)
 append_avx2_src(compute/exec/swiss_join_avx2.cc)
 append_avx2_src(compute/exec/util_avx2.cc)
diff --git a/cpp/src/arrow/compute/CMakeLists.txt b/cpp/src/arrow/compute/CMakeLists.txt
index cdf019b798b..4accec15d9c 100644
--- a/cpp/src/arrow/compute/CMakeLists.txt
+++ b/cpp/src/arrow/compute/CMakeLists.txt
@@ -82,7 +82,8 @@ add_arrow_compute_test(internals_test
                        exec_test.cc
                        kernel_test.cc
                        light_array_test.cc
-                       registry_test.cc)
+                       registry_test.cc
+                       key_hash_test.cc)
 
 add_arrow_benchmark(function_benchmark PREFIX "arrow-compute")
 
diff --git a/cpp/src/arrow/compute/exec/CMakeLists.txt b/cpp/src/arrow/compute/exec/CMakeLists.txt
index 9f3eedb63de..cc8e7175a2a 100644
--- a/cpp/src/arrow/compute/exec/CMakeLists.txt
+++ b/cpp/src/arrow/compute/exec/CMakeLists.txt
@@ -46,8 +46,7 @@ add_arrow_compute_test(hash_join_node_test
                        "arrow-compute"
                        SOURCES
                        hash_join_node_test.cc
-                       bloom_filter_test.cc
-                       key_hash_test.cc)
+                       bloom_filter_test.cc)
 add_arrow_compute_test(pivot_longer_node_test
                        PREFIX
                        "arrow-compute"
diff --git a/cpp/src/arrow/compute/exec/asof_join_node.cc b/cpp/src/arrow/compute/exec/asof_join_node.cc
index 00a733be25e..4aa0fb72f05 100644
--- a/cpp/src/arrow/compute/exec/asof_join_node.cc
+++ b/cpp/src/arrow/compute/exec/asof_join_node.cc
@@ -30,7 +30,7 @@
 #include "arrow/array/builder_binary.h"
 #include "arrow/array/builder_primitive.h"
 #include "arrow/compute/exec/exec_plan.h"
-#include "arrow/compute/exec/key_hash.h"
+#include "arrow/compute/key_hash.h"
 #include "arrow/compute/exec/options.h"
 #include "arrow/compute/exec/query_context.h"
 #include "arrow/compute/exec/schema_util.h"
diff --git a/cpp/src/arrow/compute/exec/bloom_filter_test.cc b/cpp/src/arrow/compute/exec/bloom_filter_test.cc
index 50993b4cb10..5dc35ed42ab 100644
--- a/cpp/src/arrow/compute/exec/bloom_filter_test.cc
+++ b/cpp/src/arrow/compute/exec/bloom_filter_test.cc
@@ -23,7 +23,7 @@
 #include <thread>
 #include <unordered_set>
 #include "arrow/compute/exec/bloom_filter.h"
-#include "arrow/compute/exec/key_hash.h"
+#include "arrow/compute/key_hash.h"
 #include "arrow/compute/exec/task_util.h"
 #include "arrow/compute/exec/test_util.h"
 #include "arrow/compute/exec/util.h"
diff --git a/cpp/src/arrow/compute/exec/hash_join_node.cc b/cpp/src/arrow/compute/exec/hash_join_node.cc
index 6155ebd603f..c270b868ecc 100644
--- a/cpp/src/arrow/compute/exec/hash_join_node.cc
+++ b/cpp/src/arrow/compute/exec/hash_join_node.cc
@@ -24,7 +24,7 @@
 #include "arrow/compute/exec/hash_join.h"
 #include "arrow/compute/exec/hash_join_dict.h"
 #include "arrow/compute/exec/hash_join_node.h"
-#include "arrow/compute/exec/key_hash.h"
+#include "arrow/compute/key_hash.h"
 #include "arrow/compute/exec/options.h"
 #include "arrow/compute/exec/schema_util.h"
 #include "arrow/compute/exec/util.h"
diff --git a/cpp/src/arrow/compute/exec/swiss_join.cc b/cpp/src/arrow/compute/exec/swiss_join.cc
index de9b720c480..69479325bbb 100644
--- a/cpp/src/arrow/compute/exec/swiss_join.cc
+++ b/cpp/src/arrow/compute/exec/swiss_join.cc
@@ -22,7 +22,7 @@
 #include <mutex>
 #include "arrow/array/util.h"  // MakeArrayFromScalar
 #include "arrow/compute/exec/hash_join.h"
-#include "arrow/compute/exec/key_hash.h"
+#include "arrow/compute/key_hash.h"
 #include "arrow/compute/exec/swiss_join_internal.h"
 #include "arrow/compute/exec/util.h"
 #include "arrow/compute/kernels/row_encoder_internal.h"
diff --git a/cpp/src/arrow/compute/exec/util.h b/cpp/src/arrow/compute/exec/util.h
index a2018277cdc..def9e0c714b 100644
--- a/cpp/src/arrow/compute/exec/util.h
+++ b/cpp/src/arrow/compute/exec/util.h
@@ -28,6 +28,7 @@
 #include "arrow/compute/exec/expression.h"
 #include "arrow/compute/exec/options.h"
 #include "arrow/compute/type_fwd.h"
+#include "arrow/compute/util.h"
 #include "arrow/memory_pool.h"
 #include "arrow/result.h"
 #include "arrow/status.h"
@@ -38,31 +39,33 @@
 #include "arrow/util/thread_pool.h"
 #include "arrow/util/type_fwd.h"
 
-#if defined(__clang__) || defined(__GNUC__)
-#define BYTESWAP(x) __builtin_bswap64(x)
-#define ROTL(x, n) (((x) << (n)) | ((x) >> ((-n) & 31)))
-#define ROTL64(x, n) (((x) << (n)) | ((x) >> ((-n) & 63)))
-#define PREFETCH(ptr) __builtin_prefetch((ptr), 0 /* rw==read */, 3 /* locality */)
-#elif defined(_MSC_VER)
-#include <intrin.h>
-#define BYTESWAP(x) _byteswap_uint64(x)
-#define ROTL(x, n) _rotl((x), (n))
-#define ROTL64(x, n) _rotl64((x), (n))
-#if defined(_M_X64) || defined(_M_I86)
-#include <mmintrin.h>  // https://msdn.microsoft.com/fr-fr/library/84szxsww(v=vs.90).aspx
-#define PREFETCH(ptr) _mm_prefetch((const char*)(ptr), _MM_HINT_T0)
-#else
-#define PREFETCH(ptr) (void)(ptr) /* disabled */
-#endif
-#endif
+// DIPO
+//#if defined(__clang__) || defined(__GNUC__)
+//#define BYTESWAP(x) __builtin_bswap64(x)
+//#define ROTL(x, n) (((x) << (n)) | ((x) >> ((-n) & 31)))
+//#define ROTL64(x, n) (((x) << (n)) | ((x) >> ((-n) & 63)))
+//#define PREFETCH(ptr) __builtin_prefetch((ptr), 0 /* rw==read */, 3 /* locality */)
+//#elif defined(_MSC_VER)
+//#include <intrin.h>
+//#define BYTESWAP(x) _byteswap_uint64(x)
+//#define ROTL(x, n) _rotl((x), (n))
+//#define ROTL64(x, n) _rotl64((x), (n))
+//#if defined(_M_X64) || defined(_M_I86)
+//#include <mmintrin.h>  // https://msdn.microsoft.com/fr-fr/library/84szxsww(v=vs.90).aspx
+//#define PREFETCH(ptr) _mm_prefetch((const char*)(ptr), _MM_HINT_T0)
+//#else
+//#define PREFETCH(ptr) (void)(ptr) /* disabled */
+//#endif
+//#endif
 
 namespace arrow {
 namespace util {
 
-template <typename T>
-inline void CheckAlignment(const void* ptr) {
-  ARROW_DCHECK(reinterpret_cast<uint64_t>(ptr) % sizeof(T) == 0);
-}
+// DIPO
+//template <typename T>
+//inline void CheckAlignment(const void* ptr) {
+//  ARROW_DCHECK(reinterpret_cast<uint64_t>(ptr) % sizeof(T) == 0);
+//}
 
 // Some platforms typedef int64_t as long int instead of long long int,
 // which breaks the _mm256_i64gather_epi64 and _mm256_i32gather_epi64 intrinsics
@@ -84,69 +87,70 @@ class MiniBatch {
   static constexpr int kMiniBatchLength = 1 << kLogMiniBatchLength;
 };
 
-/// Storage used to allocate temporary vectors of a batch size.
-/// Temporary vectors should resemble allocating temporary variables on the stack
-/// but in the context of vectorized processing where we need to store a vector of
-/// temporaries instead of a single value.
-class TempVectorStack {
-  template <typename>
-  friend class TempVectorHolder;
-
- public:
-  Status Init(MemoryPool* pool, int64_t size) {
-    num_vectors_ = 0;
-    top_ = 0;
-    buffer_size_ = PaddedAllocationSize(size) + kPadding + 2 * sizeof(uint64_t);
-    ARROW_ASSIGN_OR_RAISE(auto buffer, AllocateResizableBuffer(size, pool));
-    // Ensure later operations don't accidentally read uninitialized memory.
-    std::memset(buffer->mutable_data(), 0xFF, size);
-    buffer_ = std::move(buffer);
-    return Status::OK();
-  }
-
- private:
-  int64_t PaddedAllocationSize(int64_t num_bytes) {
-    // Round up allocation size to multiple of 8 bytes
-    // to avoid returning temp vectors with unaligned address.
-    //
-    // Also add padding at the end to facilitate loads and stores
-    // using SIMD when number of vector elements is not divisible
-    // by the number of SIMD lanes.
-    //
-    return ::arrow::bit_util::RoundUp(num_bytes, sizeof(int64_t)) + kPadding;
-  }
-  void alloc(uint32_t num_bytes, uint8_t** data, int* id) {
-    int64_t old_top = top_;
-    top_ += PaddedAllocationSize(num_bytes) + 2 * sizeof(uint64_t);
-    // Stack overflow check
-    ARROW_DCHECK(top_ <= buffer_size_);
-    *data = buffer_->mutable_data() + old_top + sizeof(uint64_t);
-    // We set 8 bytes before the beginning of the allocated range and
-    // 8 bytes after the end to check for stack overflow (which would
-    // result in those known bytes being corrupted).
-    reinterpret_cast<uint64_t*>(buffer_->mutable_data() + old_top)[0] = kGuard1;
-    reinterpret_cast<uint64_t*>(buffer_->mutable_data() + top_)[-1] = kGuard2;
-    *id = num_vectors_++;
-  }
-  void release(int id, uint32_t num_bytes) {
-    ARROW_DCHECK(num_vectors_ == id + 1);
-    int64_t size = PaddedAllocationSize(num_bytes) + 2 * sizeof(uint64_t);
-    ARROW_DCHECK(reinterpret_cast<const uint64_t*>(buffer_->mutable_data() + top_)[-1] ==
-                 kGuard2);
-    ARROW_DCHECK(top_ >= size);
-    top_ -= size;
-    ARROW_DCHECK(reinterpret_cast<const uint64_t*>(buffer_->mutable_data() + top_)[0] ==
-                 kGuard1);
-    --num_vectors_;
-  }
-  static constexpr uint64_t kGuard1 = 0x3141592653589793ULL;
-  static constexpr uint64_t kGuard2 = 0x0577215664901532ULL;
-  static constexpr int64_t kPadding = 64;
-  int num_vectors_;
-  int64_t top_;
-  std::unique_ptr<Buffer> buffer_;
-  int64_t buffer_size_;
-};
+// DIPO
+///// Storage used to allocate temporary vectors of a batch size.
+///// Temporary vectors should resemble allocating temporary variables on the stack
+///// but in the context of vectorized processing where we need to store a vector of
+///// temporaries instead of a single value.
+//class TempVectorStack {
+//  template <typename>
+//  friend class TempVectorHolder;
+//
+// public:
+//  Status Init(MemoryPool* pool, int64_t size) {
+//    num_vectors_ = 0;
+//    top_ = 0;
+//    buffer_size_ = PaddedAllocationSize(size) + kPadding + 2 * sizeof(uint64_t);
+//    ARROW_ASSIGN_OR_RAISE(auto buffer, AllocateResizableBuffer(size, pool));
+//    // Ensure later operations don't accidentally read uninitialized memory.
+//    std::memset(buffer->mutable_data(), 0xFF, size);
+//    buffer_ = std::move(buffer);
+//    return Status::OK();
+//  }
+//
+// private:
+//  int64_t PaddedAllocationSize(int64_t num_bytes) {
+//    // Round up allocation size to multiple of 8 bytes
+//    // to avoid returning temp vectors with unaligned address.
+//    //
+//    // Also add padding at the end to facilitate loads and stores
+//    // using SIMD when number of vector elements is not divisible
+//    // by the number of SIMD lanes.
+//    //
+//    return ::arrow::bit_util::RoundUp(num_bytes, sizeof(int64_t)) + kPadding;
+//  }
+//  void alloc(uint32_t num_bytes, uint8_t** data, int* id) {
+//    int64_t old_top = top_;
+//    top_ += PaddedAllocationSize(num_bytes) + 2 * sizeof(uint64_t);
+//    // Stack overflow check
+//    ARROW_DCHECK(top_ <= buffer_size_);
+//    *data = buffer_->mutable_data() + old_top + sizeof(uint64_t);
+//    // We set 8 bytes before the beginning of the allocated range and
+//    // 8 bytes after the end to check for stack overflow (which would
+//    // result in those known bytes being corrupted).
+//    reinterpret_cast<uint64_t*>(buffer_->mutable_data() + old_top)[0] = kGuard1;
+//    reinterpret_cast<uint64_t*>(buffer_->mutable_data() + top_)[-1] = kGuard2;
+//    *id = num_vectors_++;
+//  }
+//  void release(int id, uint32_t num_bytes) {
+//    ARROW_DCHECK(num_vectors_ == id + 1);
+//    int64_t size = PaddedAllocationSize(num_bytes) + 2 * sizeof(uint64_t);
+//    ARROW_DCHECK(reinterpret_cast<const uint64_t*>(buffer_->mutable_data() + top_)[-1] ==
+//                 kGuard2);
+//    ARROW_DCHECK(top_ >= size);
+//    top_ -= size;
+//    ARROW_DCHECK(reinterpret_cast<const uint64_t*>(buffer_->mutable_data() + top_)[0] ==
+//                 kGuard1);
+//    --num_vectors_;
+//  }
+//  static constexpr uint64_t kGuard1 = 0x3141592653589793ULL;
+//  static constexpr uint64_t kGuard2 = 0x0577215664901532ULL;
+//  static constexpr int64_t kPadding = 64;
+//  int num_vectors_;
+//  int64_t top_;
+//  std::unique_ptr<Buffer> buffer_;
+//  int64_t buffer_size_;
+//};
 
 template <typename T>
 class TempVectorHolder {
diff --git a/cpp/src/arrow/compute/kernels/hash_aggregate.cc b/cpp/src/arrow/compute/kernels/hash_aggregate.cc
index c0459a14859..eecfb054321 100644
--- a/cpp/src/arrow/compute/kernels/hash_aggregate.cc
+++ b/cpp/src/arrow/compute/kernels/hash_aggregate.cc
@@ -29,7 +29,7 @@
 #include "arrow/buffer_builder.h"
 #include "arrow/compute/api_aggregate.h"
 #include "arrow/compute/api_vector.h"
-#include "arrow/compute/exec/key_hash.h"
+#include "arrow/compute/key_hash.h"
 #include "arrow/compute/exec/key_map.h"
 #include "arrow/compute/exec/util.h"
 #include "arrow/compute/exec_internal.h"
diff --git a/cpp/src/arrow/compute/exec/key_hash.cc b/cpp/src/arrow/compute/key_hash.cc
similarity index 99%
rename from cpp/src/arrow/compute/exec/key_hash.cc
rename to cpp/src/arrow/compute/key_hash.cc
index 5ff0d4cf1e5..993d68b1fa3 100644
--- a/cpp/src/arrow/compute/exec/key_hash.cc
+++ b/cpp/src/arrow/compute/key_hash.cc
@@ -15,7 +15,7 @@
 // specific language governing permissions and limitations
 // under the License.
 
-#include "arrow/compute/exec/key_hash.h"
+#include "key_hash.h"
 
 #include <memory.h>
 
diff --git a/cpp/src/arrow/compute/exec/key_hash.h b/cpp/src/arrow/compute/key_hash.h
similarity index 100%
rename from cpp/src/arrow/compute/exec/key_hash.h
rename to cpp/src/arrow/compute/key_hash.h
diff --git a/cpp/src/arrow/compute/exec/key_hash_avx2.cc b/cpp/src/arrow/compute/key_hash_avx2.cc
similarity index 99%
rename from cpp/src/arrow/compute/exec/key_hash_avx2.cc
rename to cpp/src/arrow/compute/key_hash_avx2.cc
index d36df9fc9f3..f30c3460bda 100644
--- a/cpp/src/arrow/compute/exec/key_hash_avx2.cc
+++ b/cpp/src/arrow/compute/key_hash_avx2.cc
@@ -17,7 +17,7 @@
 
 #include <immintrin.h>
 
-#include "arrow/compute/exec/key_hash.h"
+#include "arrow/compute/key_hash.h"
 #include "arrow/util/bit_util.h"
 
 namespace arrow {
diff --git a/cpp/src/arrow/compute/exec/key_hash_test.cc b/cpp/src/arrow/compute/key_hash_test.cc
similarity index 99%
rename from cpp/src/arrow/compute/exec/key_hash_test.cc
rename to cpp/src/arrow/compute/key_hash_test.cc
index 47f0f34560e..1ee9eb25312 100644
--- a/cpp/src/arrow/compute/exec/key_hash_test.cc
+++ b/cpp/src/arrow/compute/key_hash_test.cc
@@ -21,7 +21,7 @@
 #include <map>
 #include <unordered_set>
 #include "arrow/array/builder_binary.h"
-#include "arrow/compute/exec/key_hash.h"
+#include "arrow/compute/key_hash.h"
 #include "arrow/compute/exec/test_util.h"
 #include "arrow/compute/exec/util.h"
 #include "arrow/util/cpu_info.h"
diff --git a/cpp/src/arrow/compute/light_array.h b/cpp/src/arrow/compute/light_array.h
index 389b63cca41..33b48161733 100644
--- a/cpp/src/arrow/compute/light_array.h
+++ b/cpp/src/arrow/compute/light_array.h
@@ -21,7 +21,8 @@
 
 #include "arrow/array.h"
 #include "arrow/compute/exec.h"
-#include "arrow/compute/exec/util.h"
+// DIPO #include "arrow/compute/exec/util.h"
+#include "arrow/compute/util.h"
 #include "arrow/type.h"
 #include "arrow/util/cpu_info.h"
 #include "arrow/util/logging.h"
diff --git a/cpp/src/arrow/compute/light_array_test.cc b/cpp/src/arrow/compute/light_array_test.cc
index dcc7841a091..015d407e810 100644
--- a/cpp/src/arrow/compute/light_array_test.cc
+++ b/cpp/src/arrow/compute/light_array_test.cc
@@ -20,7 +20,7 @@
 #include <gtest/gtest.h>
 #include <numeric>
 
-#include "arrow/compute/exec/test_util.h"
+// DIPO #include "arrow/compute/exec/test_util.h"
 #include "arrow/testing/generator.h"
 #include "arrow/testing/gtest_util.h"
 #include "arrow/type.h"
@@ -216,26 +216,27 @@ TEST(KeyColumnArray, SliceBool) {
   }
 }
 
-TEST(KeyColumnArray, FromExecBatch) {
-  ExecBatch batch =
-      ExecBatchFromJSON({int64(), boolean()}, "[[1, true], [2, false], [null, null]]");
-  std::vector<KeyColumnArray> arrays;
-  ASSERT_OK(ColumnArraysFromExecBatch(batch, &arrays));
-
-  ASSERT_EQ(2, arrays.size());
-  ASSERT_EQ(8, arrays[0].metadata().fixed_length);
-  ASSERT_EQ(0, arrays[1].metadata().fixed_length);
-  ASSERT_EQ(3, arrays[0].length());
-  ASSERT_EQ(3, arrays[1].length());
-
-  ASSERT_OK(ColumnArraysFromExecBatch(batch, 1, 1, &arrays));
-
-  ASSERT_EQ(2, arrays.size());
-  ASSERT_EQ(8, arrays[0].metadata().fixed_length);
-  ASSERT_EQ(0, arrays[1].metadata().fixed_length);
-  ASSERT_EQ(1, arrays[0].length());
-  ASSERT_EQ(1, arrays[1].length());
-}
+// DIPO
+//TEST(KeyColumnArray, FromExecBatch) {
+//  ExecBatch batch =
+//      ExecBatchFromJSON({int64(), boolean()}, "[[1, true], [2, false], [null, null]]");
+//  std::vector<KeyColumnArray> arrays;
+//  ASSERT_OK(ColumnArraysFromExecBatch(batch, &arrays));
+//
+//  ASSERT_EQ(2, arrays.size());
+//  ASSERT_EQ(8, arrays[0].metadata().fixed_length);
+//  ASSERT_EQ(0, arrays[1].metadata().fixed_length);
+//  ASSERT_EQ(3, arrays[0].length());
+//  ASSERT_EQ(3, arrays[1].length());
+//
+//  ASSERT_OK(ColumnArraysFromExecBatch(batch, 1, 1, &arrays));
+//
+//  ASSERT_EQ(2, arrays.size());
+//  ASSERT_EQ(8, arrays[0].metadata().fixed_length);
+//  ASSERT_EQ(0, arrays[1].metadata().fixed_length);
+//  ASSERT_EQ(1, arrays[0].length());
+//  ASSERT_EQ(1, arrays[1].length());
+//}
 
 TEST(ResizableArrayData, Basic) {
   std::unique_ptr<MemoryPool> pool = MemoryPool::CreateDefault();
@@ -314,126 +315,127 @@ TEST(ResizableArrayData, Binary) {
     ASSERT_EQ(0, pool->bytes_allocated());
   }
 }
-
-TEST(ExecBatchBuilder, AppendBatches) {
-  std::unique_ptr<MemoryPool> owned_pool = MemoryPool::CreateDefault();
-  MemoryPool* pool = owned_pool.get();
-  ExecBatch batch_one =
-      ExecBatchFromJSON({int64(), boolean()}, "[[1, true], [2, false], [null, null]]");
-  ExecBatch batch_two =
-      ExecBatchFromJSON({int64(), boolean()}, "[[null, true], [5, true], [6, false]]");
-  ExecBatch combined = ExecBatchFromJSON(
-      {int64(), boolean()},
-      "[[1, true], [2, false], [null, null], [null, true], [5, true], [6, false]]");
-  {
-    ExecBatchBuilder builder;
-    uint16_t row_ids[3] = {0, 1, 2};
-    ASSERT_OK(builder.AppendSelected(pool, batch_one, 3, row_ids, /*num_cols=*/2));
-    ASSERT_OK(builder.AppendSelected(pool, batch_two, 3, row_ids, /*num_cols=*/2));
-    ExecBatch built = builder.Flush();
-    ASSERT_EQ(combined, built);
-    ASSERT_NE(0, pool->bytes_allocated());
-  }
-  ASSERT_EQ(0, pool->bytes_allocated());
-}
-
-TEST(ExecBatchBuilder, AppendBatchesSomeRows) {
-  std::unique_ptr<MemoryPool> owned_pool = MemoryPool::CreateDefault();
-  MemoryPool* pool = owned_pool.get();
-  ExecBatch batch_one =
-      ExecBatchFromJSON({int64(), boolean()}, "[[1, true], [2, false], [null, null]]");
-  ExecBatch batch_two =
-      ExecBatchFromJSON({int64(), boolean()}, "[[null, true], [5, true], [6, false]]");
-  ExecBatch combined = ExecBatchFromJSON(
-      {int64(), boolean()}, "[[1, true], [2, false], [null, true], [5, true]]");
-  {
-    ExecBatchBuilder builder;
-    uint16_t row_ids[2] = {0, 1};
-    ASSERT_OK(builder.AppendSelected(pool, batch_one, 2, row_ids, /*num_cols=*/2));
-    ASSERT_OK(builder.AppendSelected(pool, batch_two, 2, row_ids, /*num_cols=*/2));
-    ExecBatch built = builder.Flush();
-    ASSERT_EQ(combined, built);
-    ASSERT_NE(0, pool->bytes_allocated());
-  }
-  ASSERT_EQ(0, pool->bytes_allocated());
-}
-
-TEST(ExecBatchBuilder, AppendBatchesSomeCols) {
-  std::unique_ptr<MemoryPool> owned_pool = MemoryPool::CreateDefault();
-  MemoryPool* pool = owned_pool.get();
-  ExecBatch batch_one =
-      ExecBatchFromJSON({int64(), boolean()}, "[[1, true], [2, false], [null, null]]");
-  ExecBatch batch_two =
-      ExecBatchFromJSON({int64(), boolean()}, "[[null, true], [5, true], [6, false]]");
-  ExecBatch first_col_only =
-      ExecBatchFromJSON({int64()}, "[[1], [2], [null], [null], [5], [6]]");
-  ExecBatch last_col_only = ExecBatchFromJSON(
-      {boolean()}, "[[true], [false], [null], [true], [true], [false]]");
-  {
-    ExecBatchBuilder builder;
-    uint16_t row_ids[3] = {0, 1, 2};
-    int first_col_ids[1] = {0};
-    ASSERT_OK(builder.AppendSelected(pool, batch_one, 3, row_ids, /*num_cols=*/1,
-                                     first_col_ids));
-    ASSERT_OK(builder.AppendSelected(pool, batch_two, 3, row_ids, /*num_cols=*/1,
-                                     first_col_ids));
-    ExecBatch built = builder.Flush();
-    ASSERT_EQ(first_col_only, built);
-    ASSERT_NE(0, pool->bytes_allocated());
-  }
-  {
-    ExecBatchBuilder builder;
-    uint16_t row_ids[3] = {0, 1, 2};
-    // If we don't specify col_ids and num_cols is 1 it is implicitly the first col
-    ASSERT_OK(builder.AppendSelected(pool, batch_one, 3, row_ids, /*num_cols=*/1));
-    ASSERT_OK(builder.AppendSelected(pool, batch_two, 3, row_ids, /*num_cols=*/1));
-    ExecBatch built = builder.Flush();
-    ASSERT_EQ(first_col_only, built);
-    ASSERT_NE(0, pool->bytes_allocated());
-  }
-  {
-    ExecBatchBuilder builder;
-    uint16_t row_ids[3] = {0, 1, 2};
-    int last_col_ids[1] = {1};
-    ASSERT_OK(builder.AppendSelected(pool, batch_one, 3, row_ids, /*num_cols=*/1,
-                                     last_col_ids));
-    ASSERT_OK(builder.AppendSelected(pool, batch_two, 3, row_ids, /*num_cols=*/1,
-                                     last_col_ids));
-    ExecBatch built = builder.Flush();
-    ASSERT_EQ(last_col_only, built);
-    ASSERT_NE(0, pool->bytes_allocated());
-  }
-  ASSERT_EQ(0, pool->bytes_allocated());
-}
-
-TEST(ExecBatchBuilder, AppendNulls) {
-  std::unique_ptr<MemoryPool> owned_pool = MemoryPool::CreateDefault();
-  MemoryPool* pool = owned_pool.get();
-  ExecBatch batch_one =
-      ExecBatchFromJSON({int64(), boolean()}, "[[1, true], [2, false], [null, null]]");
-  ExecBatch combined = ExecBatchFromJSON(
-      {int64(), boolean()},
-      "[[1, true], [2, false], [null, null], [null, null], [null, null]]");
-  ExecBatch just_nulls =
-      ExecBatchFromJSON({int64(), boolean()}, "[[null, null], [null, null]]");
-  {
-    ExecBatchBuilder builder;
-    uint16_t row_ids[3] = {0, 1, 2};
-    ASSERT_OK(builder.AppendSelected(pool, batch_one, 3, row_ids, /*num_cols=*/2));
-    ASSERT_OK(builder.AppendNulls(pool, {int64(), boolean()}, 2));
-    ExecBatch built = builder.Flush();
-    ASSERT_EQ(combined, built);
-    ASSERT_NE(0, pool->bytes_allocated());
-  }
-  {
-    ExecBatchBuilder builder;
-    ASSERT_OK(builder.AppendNulls(pool, {int64(), boolean()}, 2));
-    ExecBatch built = builder.Flush();
-    ASSERT_EQ(just_nulls, built);
-    ASSERT_NE(0, pool->bytes_allocated());
-  }
-  ASSERT_EQ(0, pool->bytes_allocated());
-}
+// DIPO
+//TEST(ExecBatchBuilder, AppendBatches) {
+//  std::unique_ptr<MemoryPool> owned_pool = MemoryPool::CreateDefault();
+//  MemoryPool* pool = owned_pool.get();
+//  ExecBatch batch_one =
+//      ExecBatchFromJSON({int64(), boolean()}, "[[1, true], [2, false], [null, null]]");
+//  ExecBatch batch_two =
+//      ExecBatchFromJSON({int64(), boolean()}, "[[null, true], [5, true], [6, false]]");
+//  ExecBatch combined = ExecBatchFromJSON(
+//      {int64(), boolean()},
+//      "[[1, true], [2, false], [null, null], [null, true], [5, true], [6, false]]");
+//  {
+//    ExecBatchBuilder builder;
+//    uint16_t row_ids[3] = {0, 1, 2};
+//    ASSERT_OK(builder.AppendSelected(pool, batch_one, 3, row_ids, /*num_cols=*/2));
+//    ASSERT_OK(builder.AppendSelected(pool, batch_two, 3, row_ids, /*num_cols=*/2));
+//    ExecBatch built = builder.Flush();
+//    ASSERT_EQ(combined, built);
+//    ASSERT_NE(0, pool->bytes_allocated());
+//  }
+//  ASSERT_EQ(0, pool->bytes_allocated());
+//}
+
+// DIPO
+//TEST(ExecBatchBuilder, AppendBatchesSomeRows) {
+//  std::unique_ptr<MemoryPool> owned_pool = MemoryPool::CreateDefault();
+//  MemoryPool* pool = owned_pool.get();
+//  ExecBatch batch_one =
+//      ExecBatchFromJSON({int64(), boolean()}, "[[1, true], [2, false], [null, null]]");
+//  ExecBatch batch_two =
+//      ExecBatchFromJSON({int64(), boolean()}, "[[null, true], [5, true], [6, false]]");
+//  ExecBatch combined = ExecBatchFromJSON(
+//      {int64(), boolean()}, "[[1, true], [2, false], [null, true], [5, true]]");
+//  {
+//    ExecBatchBuilder builder;
+//    uint16_t row_ids[2] = {0, 1};
+//    ASSERT_OK(builder.AppendSelected(pool, batch_one, 2, row_ids, /*num_cols=*/2));
+//    ASSERT_OK(builder.AppendSelected(pool, batch_two, 2, row_ids, /*num_cols=*/2));
+//    ExecBatch built = builder.Flush();
+//    ASSERT_EQ(combined, built);
+//    ASSERT_NE(0, pool->bytes_allocated());
+//  }
+//  ASSERT_EQ(0, pool->bytes_allocated());
+//}
+// DIPO
+//TEST(ExecBatchBuilder, AppendBatchesSomeCols) {
+//  std::unique_ptr<MemoryPool> owned_pool = MemoryPool::CreateDefault();
+//  MemoryPool* pool = owned_pool.get();
+//  ExecBatch batch_one =
+//      ExecBatchFromJSON({int64(), boolean()}, "[[1, true], [2, false], [null, null]]");
+//  ExecBatch batch_two =
+//      ExecBatchFromJSON({int64(), boolean()}, "[[null, true], [5, true], [6, false]]");
+//  ExecBatch first_col_only =
+//      ExecBatchFromJSON({int64()}, "[[1], [2], [null], [null], [5], [6]]");
+//  ExecBatch last_col_only = ExecBatchFromJSON(
+//      {boolean()}, "[[true], [false], [null], [true], [true], [false]]");
+//  {
+//    ExecBatchBuilder builder;
+//    uint16_t row_ids[3] = {0, 1, 2};
+//    int first_col_ids[1] = {0};
+//    ASSERT_OK(builder.AppendSelected(pool, batch_one, 3, row_ids, /*num_cols=*/1,
+//                                     first_col_ids));
+//    ASSERT_OK(builder.AppendSelected(pool, batch_two, 3, row_ids, /*num_cols=*/1,
+//                                     first_col_ids));
+//    ExecBatch built = builder.Flush();
+//    ASSERT_EQ(first_col_only, built);
+//    ASSERT_NE(0, pool->bytes_allocated());
+//  }
+//  {
+//    ExecBatchBuilder builder;
+//    uint16_t row_ids[3] = {0, 1, 2};
+//    // If we don't specify col_ids and num_cols is 1 it is implicitly the first col
+//    ASSERT_OK(builder.AppendSelected(pool, batch_one, 3, row_ids, /*num_cols=*/1));
+//    ASSERT_OK(builder.AppendSelected(pool, batch_two, 3, row_ids, /*num_cols=*/1));
+//    ExecBatch built = builder.Flush();
+//    ASSERT_EQ(first_col_only, built);
+//    ASSERT_NE(0, pool->bytes_allocated());
+//  }
+//  {
+//    ExecBatchBuilder builder;
+//    uint16_t row_ids[3] = {0, 1, 2};
+//    int last_col_ids[1] = {1};
+//    ASSERT_OK(builder.AppendSelected(pool, batch_one, 3, row_ids, /*num_cols=*/1,
+//                                     last_col_ids));
+//    ASSERT_OK(builder.AppendSelected(pool, batch_two, 3, row_ids, /*num_cols=*/1,
+//                                     last_col_ids));
+//    ExecBatch built = builder.Flush();
+//    ASSERT_EQ(last_col_only, built);
+//    ASSERT_NE(0, pool->bytes_allocated());
+//  }
+//  ASSERT_EQ(0, pool->bytes_allocated());
+//}
+//
+//TEST(ExecBatchBuilder, AppendNulls) {
+//  std::unique_ptr<MemoryPool> owned_pool = MemoryPool::CreateDefault();
+//  MemoryPool* pool = owned_pool.get();
+//  ExecBatch batch_one =
+//      ExecBatchFromJSON({int64(), boolean()}, "[[1, true], [2, false], [null, null]]");
+//  ExecBatch combined = ExecBatchFromJSON(
+//      {int64(), boolean()},
+//      "[[1, true], [2, false], [null, null], [null, null], [null, null]]");
+//  ExecBatch just_nulls =
+//      ExecBatchFromJSON({int64(), boolean()}, "[[null, null], [null, null]]");
+//  {
+//    ExecBatchBuilder builder;
+//    uint16_t row_ids[3] = {0, 1, 2};
+//    ASSERT_OK(builder.AppendSelected(pool, batch_one, 3, row_ids, /*num_cols=*/2));
+//    ASSERT_OK(builder.AppendNulls(pool, {int64(), boolean()}, 2));
+//    ExecBatch built = builder.Flush();
+//    ASSERT_EQ(combined, built);
+//    ASSERT_NE(0, pool->bytes_allocated());
+//  }
+//  {
+//    ExecBatchBuilder builder;
+//    ASSERT_OK(builder.AppendNulls(pool, {int64(), boolean()}, 2));
+//    ExecBatch built = builder.Flush();
+//    ASSERT_EQ(just_nulls, built);
+//    ASSERT_NE(0, pool->bytes_allocated());
+//  }
+//  ASSERT_EQ(0, pool->bytes_allocated());
+//}
 
 TEST(ExecBatchBuilder, AppendNullsBeyondLimit) {
   std::unique_ptr<MemoryPool> owned_pool = MemoryPool::CreateDefault();
diff --git a/cpp/src/arrow/compute/row/grouper.cc b/cpp/src/arrow/compute/row/grouper.cc
index d003137d3e5..579c448319a 100644
--- a/cpp/src/arrow/compute/row/grouper.cc
+++ b/cpp/src/arrow/compute/row/grouper.cc
@@ -20,7 +20,7 @@
 #include <memory>
 #include <mutex>
 
-#include "arrow/compute/exec/key_hash.h"
+#include "arrow/compute/key_hash.h"
 #include "arrow/compute/exec/key_map.h"
 #include "arrow/compute/exec/options.h"
 #include "arrow/compute/exec_internal.h"
diff --git a/cpp/src/arrow/compute/util.h b/cpp/src/arrow/compute/util.h
new file mode 100644
index 00000000000..42477857a18
--- /dev/null
+++ b/cpp/src/arrow/compute/util.h
@@ -0,0 +1,131 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <atomic>
+#include <cstdint>
+#include <optional>
+#include <thread>
+#include <unordered_map>
+#include <vector>
+
+#include "arrow/buffer.h"
+#include "arrow/compute/type_fwd.h"
+#include "arrow/memory_pool.h"
+#include "arrow/result.h"
+#include "arrow/status.h"
+#include "arrow/util/bit_util.h"
+#include "arrow/util/cpu_info.h"
+#include "arrow/util/logging.h"
+#include "arrow/util/mutex.h"
+#include "arrow/util/thread_pool.h"
+#include "arrow/util/type_fwd.h"
+
+#if defined(__clang__) || defined(__GNUC__)
+#define BYTESWAP(x) __builtin_bswap64(x)
+#define ROTL(x, n) (((x) << (n)) | ((x) >> ((-n) & 31)))
+#define ROTL64(x, n) (((x) << (n)) | ((x) >> ((-n) & 63)))
+#define PREFETCH(ptr) __builtin_prefetch((ptr), 0 /* rw==read */, 3 /* locality */)
+#elif defined(_MSC_VER)
+#include <intrin.h>
+#define BYTESWAP(x) _byteswap_uint64(x)
+#define ROTL(x, n) _rotl((x), (n))
+#define ROTL64(x, n) _rotl64((x), (n))
+#if defined(_M_X64) || defined(_M_I86)
+#include <mmintrin.h>  // https://msdn.microsoft.com/fr-fr/library/84szxsww(v=vs.90).aspx
+#define PREFETCH(ptr) _mm_prefetch((const char*)(ptr), _MM_HINT_T0)
+#else
+#define PREFETCH(ptr) (void)(ptr) /* disabled */
+#endif
+#endif
+
+namespace arrow {
+namespace util {
+
+template<typename T>
+inline void CheckAlignment(const void *ptr) {
+      ARROW_DCHECK(reinterpret_cast<uint64_t>(ptr) % sizeof(T) == 0);
+}
+
+/// Storage used to allocate temporary vectors of a batch size.
+/// Temporary vectors should resemble allocating temporary variables on the stack
+/// but in the context of vectorized processing where we need to store a vector of
+/// temporaries instead of a single value.
+class TempVectorStack {
+  template<typename>
+  friend
+  class TempVectorHolder;
+
+ public:
+  Status Init(MemoryPool *pool, int64_t size) {
+    num_vectors_ = 0;
+    top_ = 0;
+    buffer_size_ = PaddedAllocationSize(size) + kPadding + 2 * sizeof(uint64_t);
+    ARROW_ASSIGN_OR_RAISE(auto buffer, AllocateResizableBuffer(size, pool));
+    // Ensure later operations don't accidentally read uninitialized memory.
+    std::memset(buffer->mutable_data(), 0xFF, size);
+    buffer_ = std::move(buffer);
+    return Status::OK();
+  }
+
+ private:
+  int64_t PaddedAllocationSize(int64_t num_bytes) {
+    // Round up allocation size to multiple of 8 bytes
+    // to avoid returning temp vectors with unaligned address.
+    //
+    // Also add padding at the end to facilitate loads and stores
+    // using SIMD when number of vector elements is not divisible
+    // by the number of SIMD lanes.
+    //
+    return ::arrow::bit_util::RoundUp(num_bytes, sizeof(int64_t)) + kPadding;
+  }
+  void alloc(uint32_t num_bytes, uint8_t **data, int *id) {
+    int64_t old_top = top_;
+    top_ += PaddedAllocationSize(num_bytes) + 2 * sizeof(uint64_t);
+    // Stack overflow check
+        ARROW_DCHECK(top_ <= buffer_size_);
+    *data = buffer_->mutable_data() + old_top + sizeof(uint64_t);
+    // We set 8 bytes before the beginning of the allocated range and
+    // 8 bytes after the end to check for stack overflow (which would
+    // result in those known bytes being corrupted).
+    reinterpret_cast<uint64_t *>(buffer_->mutable_data() + old_top)[0] = kGuard1;
+    reinterpret_cast<uint64_t *>(buffer_->mutable_data() + top_)[-1] = kGuard2;
+    *id = num_vectors_++;
+  }
+  void release(int id, uint32_t num_bytes) {
+        ARROW_DCHECK(num_vectors_ == id + 1);
+    int64_t size = PaddedAllocationSize(num_bytes) + 2 * sizeof(uint64_t);
+        ARROW_DCHECK(reinterpret_cast<const uint64_t *>(buffer_->mutable_data() + top_)[-1] ==
+        kGuard2);
+        ARROW_DCHECK(top_ >= size);
+    top_ -= size;
+        ARROW_DCHECK(reinterpret_cast<const uint64_t *>(buffer_->mutable_data() + top_)[0] ==
+        kGuard1);
+    --num_vectors_;
+  }
+  static constexpr uint64_t kGuard1 = 0x3141592653589793ULL;
+  static constexpr uint64_t kGuard2 = 0x0577215664901532ULL;
+  static constexpr int64_t kPadding = 64;
+  int num_vectors_;
+  int64_t top_;
+  std::unique_ptr<Buffer> buffer_;
+  int64_t buffer_size_;
+};
+
+}
+}
\ No newline at end of file

From c985f97ebfc82f5d92131a1acc1d9e87efe7f4c9 Mon Sep 17 00:00:00 2001
From: Davide Pasetto <dpasetto69@gmail.com>
Date: Wed, 8 Mar 2023 07:53:16 -0500
Subject: [PATCH 02/11] Moved code from compute/exec to compute to reduce
 entanglement with acero

---
 cpp/src/arrow/CMakeLists.txt                  |   9 +-
 .../arrow/compute/exec/swiss_join_internal.h  |   2 +-
 cpp/src/arrow/compute/exec/util.cc            | 606 +++++++++---------
 cpp/src/arrow/compute/exec/util.h             | 202 +++---
 .../arrow/compute/kernels/hash_aggregate.cc   |   2 +-
 cpp/src/arrow/compute/{exec => }/key_map.cc   |  50 +-
 cpp/src/arrow/compute/{exec => }/key_map.h    |  50 +-
 .../arrow/compute/{exec => }/key_map_avx2.cc  |   2 +-
 cpp/src/arrow/compute/row/compare_internal.cc |   4 +-
 cpp/src/arrow/compute/row/compare_internal.h  |   3 +-
 cpp/src/arrow/compute/row/encode_internal.cc  |   3 +-
 cpp/src/arrow/compute/row/encode_internal.h   |   6 +-
 cpp/src/arrow/compute/row/grouper.cc          |   7 +-
 cpp/src/arrow/compute/row/grouper.h           |   5 +-
 cpp/src/arrow/compute/row/row_internal.cc     |   3 +-
 cpp/src/arrow/compute/util.cc                 | 363 +++++++++++
 cpp/src/arrow/compute/util.h                  | 133 +++-
 cpp/src/arrow/compute/util_internal.h         |  30 +
 18 files changed, 981 insertions(+), 499 deletions(-)
 rename cpp/src/arrow/compute/{exec => }/key_map.cc (94%)
 rename cpp/src/arrow/compute/{exec => }/key_map.h (80%)
 rename cpp/src/arrow/compute/{exec => }/key_map_avx2.cc (99%)
 create mode 100644 cpp/src/arrow/compute/util.cc
 create mode 100644 cpp/src/arrow/compute/util_internal.h

diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt
index bf406ca29fb..dc41f238d2c 100644
--- a/cpp/src/arrow/CMakeLists.txt
+++ b/cpp/src/arrow/CMakeLists.txt
@@ -401,8 +401,6 @@ list(APPEND
      compute/exec/hash_join.cc
      compute/exec/hash_join_dict.cc
      compute/exec/hash_join_node.cc
-        compute/key_hash.cc
-     compute/exec/key_map.cc
      compute/exec/map_node.cc
      compute/exec/options.cc
      compute/exec/order_by_impl.cc
@@ -420,6 +418,8 @@ list(APPEND
      compute/function.cc
      compute/function_internal.cc
      compute/kernel.cc
+     compute/key_hash.cc
+     compute/key_map.cc
      compute/light_array.cc
      compute/ordering.cc
      compute/registry.cc
@@ -439,11 +439,12 @@ list(APPEND
      compute/row/encode_internal.cc
      compute/row/compare_internal.cc
      compute/row/grouper.cc
-     compute/row/row_internal.cc)
+     compute/row/row_internal.cc
+     compute/util.cc)
 
 append_avx2_src(compute/exec/bloom_filter_avx2.cc)
 append_avx2_src(compute/key_hash_avx2.cc)
-append_avx2_src(compute/exec/key_map_avx2.cc)
+append_avx2_src(compute/key_map_avx2.cc)
 append_avx2_src(compute/exec/swiss_join_avx2.cc)
 append_avx2_src(compute/exec/util_avx2.cc)
 append_avx2_src(compute/row/compare_internal_avx2.cc)
diff --git a/cpp/src/arrow/compute/exec/swiss_join_internal.h b/cpp/src/arrow/compute/exec/swiss_join_internal.h
index 355aff70944..4c765874bea 100644
--- a/cpp/src/arrow/compute/exec/swiss_join_internal.h
+++ b/cpp/src/arrow/compute/exec/swiss_join_internal.h
@@ -18,7 +18,7 @@
 #pragma once
 
 #include <cstdint>
-#include "arrow/compute/exec/key_map.h"
+#include "arrow/compute/key_map.h"
 #include "arrow/compute/exec/options.h"
 #include "arrow/compute/exec/partition_util.h"
 #include "arrow/compute/exec/schema_util.h"
diff --git a/cpp/src/arrow/compute/exec/util.cc b/cpp/src/arrow/compute/exec/util.cc
index 752f8cac764..98c193a8920 100644
--- a/cpp/src/arrow/compute/exec/util.cc
+++ b/cpp/src/arrow/compute/exec/util.cc
@@ -26,311 +26,311 @@
 
 namespace arrow {
 
-using bit_util::CountTrailingZeros;
+//using bit_util::CountTrailingZeros;
 
 namespace util {
-
-inline uint64_t bit_util::SafeLoadUpTo8Bytes(const uint8_t* bytes, int num_bytes) {
-  // This will not be correct on big-endian architectures.
-#if !ARROW_LITTLE_ENDIAN
-  ARROW_DCHECK(false);
-#endif
-  ARROW_DCHECK(num_bytes >= 0 && num_bytes <= 8);
-  if (num_bytes == 8) {
-    return util::SafeLoad(reinterpret_cast<const uint64_t*>(bytes));
-  } else {
-    uint64_t word = 0;
-    for (int i = 0; i < num_bytes; ++i) {
-      word |= static_cast<uint64_t>(bytes[i]) << (8 * i);
-    }
-    return word;
-  }
-}
-
-inline void bit_util::SafeStoreUpTo8Bytes(uint8_t* bytes, int num_bytes, uint64_t value) {
-  // This will not be correct on big-endian architectures.
-#if !ARROW_LITTLE_ENDIAN
-  ARROW_DCHECK(false);
-#endif
-  ARROW_DCHECK(num_bytes >= 0 && num_bytes <= 8);
-  if (num_bytes == 8) {
-    util::SafeStore(reinterpret_cast<uint64_t*>(bytes), value);
-  } else {
-    for (int i = 0; i < num_bytes; ++i) {
-      bytes[i] = static_cast<uint8_t>(value >> (8 * i));
-    }
-  }
-}
-
-inline void bit_util::bits_to_indexes_helper(uint64_t word, uint16_t base_index,
-                                             int* num_indexes, uint16_t* indexes) {
-  int n = *num_indexes;
-  while (word) {
-    indexes[n++] = base_index + static_cast<uint16_t>(CountTrailingZeros(word));
-    word &= word - 1;
-  }
-  *num_indexes = n;
-}
-
-inline void bit_util::bits_filter_indexes_helper(uint64_t word,
-                                                 const uint16_t* input_indexes,
-                                                 int* num_indexes, uint16_t* indexes) {
-  int n = *num_indexes;
-  while (word) {
-    indexes[n++] = input_indexes[CountTrailingZeros(word)];
-    word &= word - 1;
-  }
-  *num_indexes = n;
-}
-
-template <int bit_to_search, bool filter_input_indexes>
-void bit_util::bits_to_indexes_internal(int64_t hardware_flags, const int num_bits,
-                                        const uint8_t* bits,
-                                        const uint16_t* input_indexes, int* num_indexes,
-                                        uint16_t* indexes, uint16_t base_index) {
-  // 64 bits at a time
-  constexpr int unroll = 64;
-  int tail = num_bits % unroll;
-#if defined(ARROW_HAVE_AVX2)
-  if (hardware_flags & arrow::internal::CpuInfo::AVX2) {
-    if (filter_input_indexes) {
-      bits_filter_indexes_avx2(bit_to_search, num_bits - tail, bits, input_indexes,
-                               num_indexes, indexes);
-    } else {
-      bits_to_indexes_avx2(bit_to_search, num_bits - tail, bits, num_indexes, indexes,
-                           base_index);
-    }
-  } else {
-#endif
-    *num_indexes = 0;
-    for (int i = 0; i < num_bits / unroll; ++i) {
-      uint64_t word = util::SafeLoad(&reinterpret_cast<const uint64_t*>(bits)[i]);
-      if (bit_to_search == 0) {
-        word = ~word;
-      }
-      if (filter_input_indexes) {
-        bits_filter_indexes_helper(word, input_indexes + i * 64, num_indexes, indexes);
-      } else {
-        bits_to_indexes_helper(word, i * 64 + base_index, num_indexes, indexes);
-      }
-    }
-#if defined(ARROW_HAVE_AVX2)
-  }
-#endif
-  // Optionally process the last partial word with masking out bits outside range
-  if (tail) {
-    const uint8_t* bits_tail = bits + (num_bits - tail) / 8;
-    uint64_t word = SafeLoadUpTo8Bytes(bits_tail, (tail + 7) / 8);
-    if (bit_to_search == 0) {
-      word = ~word;
-    }
-    word &= ~0ULL >> (64 - tail);
-    if (filter_input_indexes) {
-      bits_filter_indexes_helper(word, input_indexes + num_bits - tail, num_indexes,
-                                 indexes);
-    } else {
-      bits_to_indexes_helper(word, num_bits - tail + base_index, num_indexes, indexes);
-    }
-  }
-}
-
-void bit_util::bits_to_indexes(int bit_to_search, int64_t hardware_flags, int num_bits,
-                               const uint8_t* bits, int* num_indexes, uint16_t* indexes,
-                               int bit_offset) {
-  bits += bit_offset / 8;
-  bit_offset %= 8;
-  *num_indexes = 0;
-  uint16_t base_index = 0;
-  if (bit_offset != 0) {
-    uint64_t bits_head = bits[0] >> bit_offset;
-    int bits_in_first_byte = std::min(num_bits, 8 - bit_offset);
-    bits_to_indexes(bit_to_search, hardware_flags, bits_in_first_byte,
-                    reinterpret_cast<const uint8_t*>(&bits_head), num_indexes, indexes);
-    if (num_bits <= bits_in_first_byte) {
-      return;
-    }
-    num_bits -= bits_in_first_byte;
-    indexes += *num_indexes;
-    bits += 1;
-    base_index = bits_in_first_byte;
-  }
-
-  int num_indexes_new = 0;
-  if (bit_to_search == 0) {
-    bits_to_indexes_internal<0, false>(hardware_flags, num_bits, bits, nullptr,
-                                       &num_indexes_new, indexes, base_index);
-  } else {
-    ARROW_DCHECK(bit_to_search == 1);
-    bits_to_indexes_internal<1, false>(hardware_flags, num_bits, bits, nullptr,
-                                       &num_indexes_new, indexes, base_index);
-  }
-  *num_indexes += num_indexes_new;
-}
-
-void bit_util::bits_filter_indexes(int bit_to_search, int64_t hardware_flags,
-                                   const int num_bits, const uint8_t* bits,
-                                   const uint16_t* input_indexes, int* num_indexes,
-                                   uint16_t* indexes, int bit_offset) {
-  bits += bit_offset / 8;
-  bit_offset %= 8;
-  if (bit_offset != 0) {
-    int num_indexes_head = 0;
-    uint64_t bits_head = bits[0] >> bit_offset;
-    int bits_in_first_byte = std::min(num_bits, 8 - bit_offset);
-    bits_filter_indexes(bit_to_search, hardware_flags, bits_in_first_byte,
-                        reinterpret_cast<const uint8_t*>(&bits_head), input_indexes,
-                        &num_indexes_head, indexes);
-    int num_indexes_tail = 0;
-    if (num_bits > bits_in_first_byte) {
-      bits_filter_indexes(bit_to_search, hardware_flags, num_bits - bits_in_first_byte,
-                          bits + 1, input_indexes + bits_in_first_byte, &num_indexes_tail,
-                          indexes + num_indexes_head);
-    }
-    *num_indexes = num_indexes_head + num_indexes_tail;
-    return;
-  }
-
-  if (bit_to_search == 0) {
-    bits_to_indexes_internal<0, true>(hardware_flags, num_bits, bits, input_indexes,
-                                      num_indexes, indexes);
-  } else {
-    ARROW_DCHECK(bit_to_search == 1);
-    bits_to_indexes_internal<1, true>(hardware_flags, num_bits, bits, input_indexes,
-                                      num_indexes, indexes);
-  }
-}
-
-void bit_util::bits_split_indexes(int64_t hardware_flags, const int num_bits,
-                                  const uint8_t* bits, int* num_indexes_bit0,
-                                  uint16_t* indexes_bit0, uint16_t* indexes_bit1,
-                                  int bit_offset) {
-  bits_to_indexes(0, hardware_flags, num_bits, bits, num_indexes_bit0, indexes_bit0,
-                  bit_offset);
-  int num_indexes_bit1;
-  bits_to_indexes(1, hardware_flags, num_bits, bits, &num_indexes_bit1, indexes_bit1,
-                  bit_offset);
-}
-
-void bit_util::bits_to_bytes(int64_t hardware_flags, const int num_bits,
-                             const uint8_t* bits, uint8_t* bytes, int bit_offset) {
-  bits += bit_offset / 8;
-  bit_offset %= 8;
-  if (bit_offset != 0) {
-    uint64_t bits_head = bits[0] >> bit_offset;
-    int bits_in_first_byte = std::min(num_bits, 8 - bit_offset);
-    bits_to_bytes(hardware_flags, bits_in_first_byte,
-                  reinterpret_cast<const uint8_t*>(&bits_head), bytes);
-    if (num_bits > bits_in_first_byte) {
-      bits_to_bytes(hardware_flags, num_bits - bits_in_first_byte, bits + 1,
-                    bytes + bits_in_first_byte);
-    }
-    return;
-  }
-
-  int num_processed = 0;
-#if defined(ARROW_HAVE_AVX2)
-  if (hardware_flags & arrow::internal::CpuInfo::AVX2) {
-    // The function call below processes whole 32 bit chunks together.
-    num_processed = num_bits - (num_bits % 32);
-    bits_to_bytes_avx2(num_processed, bits, bytes);
-  }
-#endif
-  // Processing 8 bits at a time
-  constexpr int unroll = 8;
-  for (int i = num_processed / unroll; i < num_bits / unroll; ++i) {
-    uint8_t bits_next = bits[i];
-    // Clear the lowest bit and then make 8 copies of remaining 7 bits, each 7 bits apart
-    // from the previous.
-    uint64_t unpacked = static_cast<uint64_t>(bits_next & 0xfe) *
-                        ((1ULL << 7) | (1ULL << 14) | (1ULL << 21) | (1ULL << 28) |
-                         (1ULL << 35) | (1ULL << 42) | (1ULL << 49));
-    unpacked |= (bits_next & 1);
-    unpacked &= 0x0101010101010101ULL;
-    unpacked *= 255;
-    util::SafeStore(&reinterpret_cast<uint64_t*>(bytes)[i], unpacked);
-  }
-  int tail = num_bits % unroll;
-  if (tail) {
-    uint8_t bits_next = bits[(num_bits - tail) / unroll];
-    // Clear the lowest bit and then make 8 copies of remaining 7 bits, each 7 bits apart
-    // from the previous.
-    uint64_t unpacked = static_cast<uint64_t>(bits_next & 0xfe) *
-                        ((1ULL << 7) | (1ULL << 14) | (1ULL << 21) | (1ULL << 28) |
-                         (1ULL << 35) | (1ULL << 42) | (1ULL << 49));
-    unpacked |= (bits_next & 1);
-    unpacked &= 0x0101010101010101ULL;
-    unpacked *= 255;
-    SafeStoreUpTo8Bytes(bytes + num_bits - tail, tail, unpacked);
-  }
-}
-
-void bit_util::bytes_to_bits(int64_t hardware_flags, const int num_bits,
-                             const uint8_t* bytes, uint8_t* bits, int bit_offset) {
-  bits += bit_offset / 8;
-  bit_offset %= 8;
-  if (bit_offset != 0) {
-    uint64_t bits_head;
-    int bits_in_first_byte = std::min(num_bits, 8 - bit_offset);
-    bytes_to_bits(hardware_flags, bits_in_first_byte, bytes,
-                  reinterpret_cast<uint8_t*>(&bits_head));
-    uint8_t mask = (1 << bit_offset) - 1;
-    *bits = static_cast<uint8_t>((*bits & mask) | (bits_head << bit_offset));
-
-    if (num_bits > bits_in_first_byte) {
-      bytes_to_bits(hardware_flags, num_bits - bits_in_first_byte,
-                    bytes + bits_in_first_byte, bits + 1);
-    }
-    return;
-  }
-
-  int num_processed = 0;
-#if defined(ARROW_HAVE_AVX2)
-  if (hardware_flags & arrow::internal::CpuInfo::AVX2) {
-    // The function call below processes whole 32 bit chunks together.
-    num_processed = num_bits - (num_bits % 32);
-    bytes_to_bits_avx2(num_processed, bytes, bits);
-  }
-#endif
-  // Process 8 bits at a time
-  constexpr int unroll = 8;
-  for (int i = num_processed / unroll; i < num_bits / unroll; ++i) {
-    uint64_t bytes_next = util::SafeLoad(&reinterpret_cast<const uint64_t*>(bytes)[i]);
-    bytes_next &= 0x0101010101010101ULL;
-    bytes_next |= (bytes_next >> 7);  // Pairs of adjacent output bits in individual bytes
-    bytes_next |= (bytes_next >> 14);  // 4 adjacent output bits in individual bytes
-    bytes_next |= (bytes_next >> 28);  // All 8 output bits in the lowest byte
-    bits[i] = static_cast<uint8_t>(bytes_next & 0xff);
-  }
-  int tail = num_bits % unroll;
-  if (tail) {
-    uint64_t bytes_next = SafeLoadUpTo8Bytes(bytes + num_bits - tail, tail);
-    bytes_next &= 0x0101010101010101ULL;
-    bytes_next |= (bytes_next >> 7);  // Pairs of adjacent output bits in individual bytes
-    bytes_next |= (bytes_next >> 14);  // 4 adjacent output bits in individual bytes
-    bytes_next |= (bytes_next >> 28);  // All 8 output bits in the lowest byte
-    bits[num_bits / 8] = static_cast<uint8_t>(bytes_next & 0xff);
-  }
-}
-
-bool bit_util::are_all_bytes_zero(int64_t hardware_flags, const uint8_t* bytes,
-                                  uint32_t num_bytes) {
-#if defined(ARROW_HAVE_AVX2)
-  if (hardware_flags & arrow::internal::CpuInfo::AVX2) {
-    return are_all_bytes_zero_avx2(bytes, num_bytes);
-  }
-#endif
-  uint64_t result_or = 0;
-  uint32_t i;
-  for (i = 0; i < num_bytes / 8; ++i) {
-    uint64_t x = util::SafeLoad(&reinterpret_cast<const uint64_t*>(bytes)[i]);
-    result_or |= x;
-  }
-  if (num_bytes % 8 > 0) {
-    uint64_t tail = 0;
-    result_or |= memcmp(bytes + i * 8, &tail, num_bytes % 8);
-  }
-  return result_or == 0;
-}
+//
+//inline uint64_t bit_util::SafeLoadUpTo8Bytes(const uint8_t* bytes, int num_bytes) {
+//  // This will not be correct on big-endian architectures.
+//#if !ARROW_LITTLE_ENDIAN
+//  ARROW_DCHECK(false);
+//#endif
+//  ARROW_DCHECK(num_bytes >= 0 && num_bytes <= 8);
+//  if (num_bytes == 8) {
+//    return util::SafeLoad(reinterpret_cast<const uint64_t*>(bytes));
+//  } else {
+//    uint64_t word = 0;
+//    for (int i = 0; i < num_bytes; ++i) {
+//      word |= static_cast<uint64_t>(bytes[i]) << (8 * i);
+//    }
+//    return word;
+//  }
+//}
+//
+//inline void bit_util::SafeStoreUpTo8Bytes(uint8_t* bytes, int num_bytes, uint64_t value) {
+//  // This will not be correct on big-endian architectures.
+//#if !ARROW_LITTLE_ENDIAN
+//  ARROW_DCHECK(false);
+//#endif
+//  ARROW_DCHECK(num_bytes >= 0 && num_bytes <= 8);
+//  if (num_bytes == 8) {
+//    util::SafeStore(reinterpret_cast<uint64_t*>(bytes), value);
+//  } else {
+//    for (int i = 0; i < num_bytes; ++i) {
+//      bytes[i] = static_cast<uint8_t>(value >> (8 * i));
+//    }
+//  }
+//}
+//
+//inline void bit_util::bits_to_indexes_helper(uint64_t word, uint16_t base_index,
+//                                             int* num_indexes, uint16_t* indexes) {
+//  int n = *num_indexes;
+//  while (word) {
+//    indexes[n++] = base_index + static_cast<uint16_t>(CountTrailingZeros(word));
+//    word &= word - 1;
+//  }
+//  *num_indexes = n;
+//}
+//
+//inline void bit_util::bits_filter_indexes_helper(uint64_t word,
+//                                                 const uint16_t* input_indexes,
+//                                                 int* num_indexes, uint16_t* indexes) {
+//  int n = *num_indexes;
+//  while (word) {
+//    indexes[n++] = input_indexes[CountTrailingZeros(word)];
+//    word &= word - 1;
+//  }
+//  *num_indexes = n;
+//}
+//
+//template <int bit_to_search, bool filter_input_indexes>
+//void bit_util::bits_to_indexes_internal(int64_t hardware_flags, const int num_bits,
+//                                        const uint8_t* bits,
+//                                        const uint16_t* input_indexes, int* num_indexes,
+//                                        uint16_t* indexes, uint16_t base_index) {
+//  // 64 bits at a time
+//  constexpr int unroll = 64;
+//  int tail = num_bits % unroll;
+//#if defined(ARROW_HAVE_AVX2)
+//  if (hardware_flags & arrow::internal::CpuInfo::AVX2) {
+//    if (filter_input_indexes) {
+//      bits_filter_indexes_avx2(bit_to_search, num_bits - tail, bits, input_indexes,
+//                               num_indexes, indexes);
+//    } else {
+//      bits_to_indexes_avx2(bit_to_search, num_bits - tail, bits, num_indexes, indexes,
+//                           base_index);
+//    }
+//  } else {
+//#endif
+//    *num_indexes = 0;
+//    for (int i = 0; i < num_bits / unroll; ++i) {
+//      uint64_t word = util::SafeLoad(&reinterpret_cast<const uint64_t*>(bits)[i]);
+//      if (bit_to_search == 0) {
+//        word = ~word;
+//      }
+//      if (filter_input_indexes) {
+//        bits_filter_indexes_helper(word, input_indexes + i * 64, num_indexes, indexes);
+//      } else {
+//        bits_to_indexes_helper(word, i * 64 + base_index, num_indexes, indexes);
+//      }
+//    }
+//#if defined(ARROW_HAVE_AVX2)
+//  }
+//#endif
+//  // Optionally process the last partial word with masking out bits outside range
+//  if (tail) {
+//    const uint8_t* bits_tail = bits + (num_bits - tail) / 8;
+//    uint64_t word = SafeLoadUpTo8Bytes(bits_tail, (tail + 7) / 8);
+//    if (bit_to_search == 0) {
+//      word = ~word;
+//    }
+//    word &= ~0ULL >> (64 - tail);
+//    if (filter_input_indexes) {
+//      bits_filter_indexes_helper(word, input_indexes + num_bits - tail, num_indexes,
+//                                 indexes);
+//    } else {
+//      bits_to_indexes_helper(word, num_bits - tail + base_index, num_indexes, indexes);
+//    }
+//  }
+//}
+//
+//void bit_util::bits_to_indexes(int bit_to_search, int64_t hardware_flags, int num_bits,
+//                               const uint8_t* bits, int* num_indexes, uint16_t* indexes,
+//                               int bit_offset) {
+//  bits += bit_offset / 8;
+//  bit_offset %= 8;
+//  *num_indexes = 0;
+//  uint16_t base_index = 0;
+//  if (bit_offset != 0) {
+//    uint64_t bits_head = bits[0] >> bit_offset;
+//    int bits_in_first_byte = std::min(num_bits, 8 - bit_offset);
+//    bits_to_indexes(bit_to_search, hardware_flags, bits_in_first_byte,
+//                    reinterpret_cast<const uint8_t*>(&bits_head), num_indexes, indexes);
+//    if (num_bits <= bits_in_first_byte) {
+//      return;
+//    }
+//    num_bits -= bits_in_first_byte;
+//    indexes += *num_indexes;
+//    bits += 1;
+//    base_index = bits_in_first_byte;
+//  }
+//
+//  int num_indexes_new = 0;
+//  if (bit_to_search == 0) {
+//    bits_to_indexes_internal<0, false>(hardware_flags, num_bits, bits, nullptr,
+//                                       &num_indexes_new, indexes, base_index);
+//  } else {
+//    ARROW_DCHECK(bit_to_search == 1);
+//    bits_to_indexes_internal<1, false>(hardware_flags, num_bits, bits, nullptr,
+//                                       &num_indexes_new, indexes, base_index);
+//  }
+//  *num_indexes += num_indexes_new;
+//}
+//
+//void bit_util::bits_filter_indexes(int bit_to_search, int64_t hardware_flags,
+//                                   const int num_bits, const uint8_t* bits,
+//                                   const uint16_t* input_indexes, int* num_indexes,
+//                                   uint16_t* indexes, int bit_offset) {
+//  bits += bit_offset / 8;
+//  bit_offset %= 8;
+//  if (bit_offset != 0) {
+//    int num_indexes_head = 0;
+//    uint64_t bits_head = bits[0] >> bit_offset;
+//    int bits_in_first_byte = std::min(num_bits, 8 - bit_offset);
+//    bits_filter_indexes(bit_to_search, hardware_flags, bits_in_first_byte,
+//                        reinterpret_cast<const uint8_t*>(&bits_head), input_indexes,
+//                        &num_indexes_head, indexes);
+//    int num_indexes_tail = 0;
+//    if (num_bits > bits_in_first_byte) {
+//      bits_filter_indexes(bit_to_search, hardware_flags, num_bits - bits_in_first_byte,
+//                          bits + 1, input_indexes + bits_in_first_byte, &num_indexes_tail,
+//                          indexes + num_indexes_head);
+//    }
+//    *num_indexes = num_indexes_head + num_indexes_tail;
+//    return;
+//  }
+//
+//  if (bit_to_search == 0) {
+//    bits_to_indexes_internal<0, true>(hardware_flags, num_bits, bits, input_indexes,
+//                                      num_indexes, indexes);
+//  } else {
+//    ARROW_DCHECK(bit_to_search == 1);
+//    bits_to_indexes_internal<1, true>(hardware_flags, num_bits, bits, input_indexes,
+//                                      num_indexes, indexes);
+//  }
+//}
+//
+//void bit_util::bits_split_indexes(int64_t hardware_flags, const int num_bits,
+//                                  const uint8_t* bits, int* num_indexes_bit0,
+//                                  uint16_t* indexes_bit0, uint16_t* indexes_bit1,
+//                                  int bit_offset) {
+//  bits_to_indexes(0, hardware_flags, num_bits, bits, num_indexes_bit0, indexes_bit0,
+//                  bit_offset);
+//  int num_indexes_bit1;
+//  bits_to_indexes(1, hardware_flags, num_bits, bits, &num_indexes_bit1, indexes_bit1,
+//                  bit_offset);
+//}
+//
+//void bit_util::bits_to_bytes(int64_t hardware_flags, const int num_bits,
+//                             const uint8_t* bits, uint8_t* bytes, int bit_offset) {
+//  bits += bit_offset / 8;
+//  bit_offset %= 8;
+//  if (bit_offset != 0) {
+//    uint64_t bits_head = bits[0] >> bit_offset;
+//    int bits_in_first_byte = std::min(num_bits, 8 - bit_offset);
+//    bits_to_bytes(hardware_flags, bits_in_first_byte,
+//                  reinterpret_cast<const uint8_t*>(&bits_head), bytes);
+//    if (num_bits > bits_in_first_byte) {
+//      bits_to_bytes(hardware_flags, num_bits - bits_in_first_byte, bits + 1,
+//                    bytes + bits_in_first_byte);
+//    }
+//    return;
+//  }
+//
+//  int num_processed = 0;
+//#if defined(ARROW_HAVE_AVX2)
+//  if (hardware_flags & arrow::internal::CpuInfo::AVX2) {
+//    // The function call below processes whole 32 bit chunks together.
+//    num_processed = num_bits - (num_bits % 32);
+//    bits_to_bytes_avx2(num_processed, bits, bytes);
+//  }
+//#endif
+//  // Processing 8 bits at a time
+//  constexpr int unroll = 8;
+//  for (int i = num_processed / unroll; i < num_bits / unroll; ++i) {
+//    uint8_t bits_next = bits[i];
+//    // Clear the lowest bit and then make 8 copies of remaining 7 bits, each 7 bits apart
+//    // from the previous.
+//    uint64_t unpacked = static_cast<uint64_t>(bits_next & 0xfe) *
+//                        ((1ULL << 7) | (1ULL << 14) | (1ULL << 21) | (1ULL << 28) |
+//                         (1ULL << 35) | (1ULL << 42) | (1ULL << 49));
+//    unpacked |= (bits_next & 1);
+//    unpacked &= 0x0101010101010101ULL;
+//    unpacked *= 255;
+//    util::SafeStore(&reinterpret_cast<uint64_t*>(bytes)[i], unpacked);
+//  }
+//  int tail = num_bits % unroll;
+//  if (tail) {
+//    uint8_t bits_next = bits[(num_bits - tail) / unroll];
+//    // Clear the lowest bit and then make 8 copies of remaining 7 bits, each 7 bits apart
+//    // from the previous.
+//    uint64_t unpacked = static_cast<uint64_t>(bits_next & 0xfe) *
+//                        ((1ULL << 7) | (1ULL << 14) | (1ULL << 21) | (1ULL << 28) |
+//                         (1ULL << 35) | (1ULL << 42) | (1ULL << 49));
+//    unpacked |= (bits_next & 1);
+//    unpacked &= 0x0101010101010101ULL;
+//    unpacked *= 255;
+//    SafeStoreUpTo8Bytes(bytes + num_bits - tail, tail, unpacked);
+//  }
+//}
+//
+//void bit_util::bytes_to_bits(int64_t hardware_flags, const int num_bits,
+//                             const uint8_t* bytes, uint8_t* bits, int bit_offset) {
+//  bits += bit_offset / 8;
+//  bit_offset %= 8;
+//  if (bit_offset != 0) {
+//    uint64_t bits_head;
+//    int bits_in_first_byte = std::min(num_bits, 8 - bit_offset);
+//    bytes_to_bits(hardware_flags, bits_in_first_byte, bytes,
+//                  reinterpret_cast<uint8_t*>(&bits_head));
+//    uint8_t mask = (1 << bit_offset) - 1;
+//    *bits = static_cast<uint8_t>((*bits & mask) | (bits_head << bit_offset));
+//
+//    if (num_bits > bits_in_first_byte) {
+//      bytes_to_bits(hardware_flags, num_bits - bits_in_first_byte,
+//                    bytes + bits_in_first_byte, bits + 1);
+//    }
+//    return;
+//  }
+//
+//  int num_processed = 0;
+//#if defined(ARROW_HAVE_AVX2)
+//  if (hardware_flags & arrow::internal::CpuInfo::AVX2) {
+//    // The function call below processes whole 32 bit chunks together.
+//    num_processed = num_bits - (num_bits % 32);
+//    bytes_to_bits_avx2(num_processed, bytes, bits);
+//  }
+//#endif
+//  // Process 8 bits at a time
+//  constexpr int unroll = 8;
+//  for (int i = num_processed / unroll; i < num_bits / unroll; ++i) {
+//    uint64_t bytes_next = util::SafeLoad(&reinterpret_cast<const uint64_t*>(bytes)[i]);
+//    bytes_next &= 0x0101010101010101ULL;
+//    bytes_next |= (bytes_next >> 7);  // Pairs of adjacent output bits in individual bytes
+//    bytes_next |= (bytes_next >> 14);  // 4 adjacent output bits in individual bytes
+//    bytes_next |= (bytes_next >> 28);  // All 8 output bits in the lowest byte
+//    bits[i] = static_cast<uint8_t>(bytes_next & 0xff);
+//  }
+//  int tail = num_bits % unroll;
+//  if (tail) {
+//    uint64_t bytes_next = SafeLoadUpTo8Bytes(bytes + num_bits - tail, tail);
+//    bytes_next &= 0x0101010101010101ULL;
+//    bytes_next |= (bytes_next >> 7);  // Pairs of adjacent output bits in individual bytes
+//    bytes_next |= (bytes_next >> 14);  // 4 adjacent output bits in individual bytes
+//    bytes_next |= (bytes_next >> 28);  // All 8 output bits in the lowest byte
+//    bits[num_bits / 8] = static_cast<uint8_t>(bytes_next & 0xff);
+//  }
+//}
+//
+//bool bit_util::are_all_bytes_zero(int64_t hardware_flags, const uint8_t* bytes,
+//                                  uint32_t num_bytes) {
+//#if defined(ARROW_HAVE_AVX2)
+//  if (hardware_flags & arrow::internal::CpuInfo::AVX2) {
+//    return are_all_bytes_zero_avx2(bytes, num_bytes);
+//  }
+//#endif
+//  uint64_t result_or = 0;
+//  uint32_t i;
+//  for (i = 0; i < num_bytes / 8; ++i) {
+//    uint64_t x = util::SafeLoad(&reinterpret_cast<const uint64_t*>(bytes)[i]);
+//    result_or |= x;
+//  }
+//  if (num_bytes % 8 > 0) {
+//    uint64_t tail = 0;
+//    result_or |= memcmp(bytes + i * 8, &tail, num_bytes % 8);
+//  }
+//  return result_or == 0;
+//}
 
 }  // namespace util
 
diff --git a/cpp/src/arrow/compute/exec/util.h b/cpp/src/arrow/compute/exec/util.h
index def9e0c714b..e0ffca29070 100644
--- a/cpp/src/arrow/compute/exec/util.h
+++ b/cpp/src/arrow/compute/exec/util.h
@@ -67,25 +67,25 @@ namespace util {
 //  ARROW_DCHECK(reinterpret_cast<uint64_t>(ptr) % sizeof(T) == 0);
 //}
 
-// Some platforms typedef int64_t as long int instead of long long int,
-// which breaks the _mm256_i64gather_epi64 and _mm256_i32gather_epi64 intrinsics
-// which need long long.
-// We use the cast to the type below in these intrinsics to make the code
-// compile in all cases.
+//// Some platforms typedef int64_t as long int instead of long long int,
+//// which breaks the _mm256_i64gather_epi64 and _mm256_i32gather_epi64 intrinsics
+//// which need long long.
+//// We use the cast to the type below in these intrinsics to make the code
+//// compile in all cases.
+////
+//using int64_for_gather_t = const long long int;  // NOLINT runtime-int
 //
-using int64_for_gather_t = const long long int;  // NOLINT runtime-int
-
-// All MiniBatch... classes use TempVectorStack for vector allocations and can
-// only work with vectors up to 1024 elements.
-//
-// They should only be allocated on the stack to guarantee the right sequence
-// of allocation and deallocation of vectors from TempVectorStack.
-//
-class MiniBatch {
- public:
-  static constexpr int kLogMiniBatchLength = 10;
-  static constexpr int kMiniBatchLength = 1 << kLogMiniBatchLength;
-};
+//// All MiniBatch... classes use TempVectorStack for vector allocations and can
+//// only work with vectors up to 1024 elements.
+////
+//// They should only be allocated on the stack to guarantee the right sequence
+//// of allocation and deallocation of vectors from TempVectorStack.
+////
+//class MiniBatch {
+// public:
+//  static constexpr int kLogMiniBatchLength = 10;
+//  static constexpr int kMiniBatchLength = 1 << kLogMiniBatchLength;
+//};
 
 // DIPO
 ///// Storage used to allocate temporary vectors of a batch size.
@@ -151,89 +151,89 @@ class MiniBatch {
 //  std::unique_ptr<Buffer> buffer_;
 //  int64_t buffer_size_;
 //};
-
-template <typename T>
-class TempVectorHolder {
-  friend class TempVectorStack;
-
- public:
-  ~TempVectorHolder() { stack_->release(id_, num_elements_ * sizeof(T)); }
-  T* mutable_data() { return reinterpret_cast<T*>(data_); }
-  TempVectorHolder(TempVectorStack* stack, uint32_t num_elements) {
-    stack_ = stack;
-    num_elements_ = num_elements;
-    stack_->alloc(num_elements * sizeof(T), &data_, &id_);
-  }
-
- private:
-  TempVectorStack* stack_;
-  uint8_t* data_;
-  int id_;
-  uint32_t num_elements_;
-};
-
-class bit_util {
- public:
-  static void bits_to_indexes(int bit_to_search, int64_t hardware_flags,
-                              const int num_bits, const uint8_t* bits, int* num_indexes,
-                              uint16_t* indexes, int bit_offset = 0);
-
-  static void bits_filter_indexes(int bit_to_search, int64_t hardware_flags,
-                                  const int num_bits, const uint8_t* bits,
-                                  const uint16_t* input_indexes, int* num_indexes,
-                                  uint16_t* indexes, int bit_offset = 0);
-
-  // Input and output indexes may be pointing to the same data (in-place filtering).
-  static void bits_split_indexes(int64_t hardware_flags, const int num_bits,
-                                 const uint8_t* bits, int* num_indexes_bit0,
-                                 uint16_t* indexes_bit0, uint16_t* indexes_bit1,
-                                 int bit_offset = 0);
-
-  // Bit 1 is replaced with byte 0xFF.
-  static void bits_to_bytes(int64_t hardware_flags, const int num_bits,
-                            const uint8_t* bits, uint8_t* bytes, int bit_offset = 0);
-
-  // Return highest bit of each byte.
-  static void bytes_to_bits(int64_t hardware_flags, const int num_bits,
-                            const uint8_t* bytes, uint8_t* bits, int bit_offset = 0);
-
-  static bool are_all_bytes_zero(int64_t hardware_flags, const uint8_t* bytes,
-                                 uint32_t num_bytes);
-
- private:
-  inline static uint64_t SafeLoadUpTo8Bytes(const uint8_t* bytes, int num_bytes);
-  inline static void SafeStoreUpTo8Bytes(uint8_t* bytes, int num_bytes, uint64_t value);
-  inline static void bits_to_indexes_helper(uint64_t word, uint16_t base_index,
-                                            int* num_indexes, uint16_t* indexes);
-  inline static void bits_filter_indexes_helper(uint64_t word,
-                                                const uint16_t* input_indexes,
-                                                int* num_indexes, uint16_t* indexes);
-  template <int bit_to_search, bool filter_input_indexes>
-  static void bits_to_indexes_internal(int64_t hardware_flags, const int num_bits,
-                                       const uint8_t* bits, const uint16_t* input_indexes,
-                                       int* num_indexes, uint16_t* indexes,
-                                       uint16_t base_index = 0);
-
-#if defined(ARROW_HAVE_AVX2)
-  static void bits_to_indexes_avx2(int bit_to_search, const int num_bits,
-                                   const uint8_t* bits, int* num_indexes,
-                                   uint16_t* indexes, uint16_t base_index = 0);
-  static void bits_filter_indexes_avx2(int bit_to_search, const int num_bits,
-                                       const uint8_t* bits, const uint16_t* input_indexes,
-                                       int* num_indexes, uint16_t* indexes);
-  template <int bit_to_search>
-  static void bits_to_indexes_imp_avx2(const int num_bits, const uint8_t* bits,
-                                       int* num_indexes, uint16_t* indexes,
-                                       uint16_t base_index = 0);
-  template <int bit_to_search>
-  static void bits_filter_indexes_imp_avx2(const int num_bits, const uint8_t* bits,
-                                           const uint16_t* input_indexes,
-                                           int* num_indexes, uint16_t* indexes);
-  static void bits_to_bytes_avx2(const int num_bits, const uint8_t* bits, uint8_t* bytes);
-  static void bytes_to_bits_avx2(const int num_bits, const uint8_t* bytes, uint8_t* bits);
-  static bool are_all_bytes_zero_avx2(const uint8_t* bytes, uint32_t num_bytes);
-#endif
-};
+//
+//template <typename T>
+//class TempVectorHolder {
+//  friend class TempVectorStack;
+//
+// public:
+//  ~TempVectorHolder() { stack_->release(id_, num_elements_ * sizeof(T)); }
+//  T* mutable_data() { return reinterpret_cast<T*>(data_); }
+//  TempVectorHolder(TempVectorStack* stack, uint32_t num_elements) {
+//    stack_ = stack;
+//    num_elements_ = num_elements;
+//    stack_->alloc(num_elements * sizeof(T), &data_, &id_);
+//  }
+//
+// private:
+//  TempVectorStack* stack_;
+//  uint8_t* data_;
+//  int id_;
+//  uint32_t num_elements_;
+//};
+//
+//class bit_util {
+// public:
+//  static void bits_to_indexes(int bit_to_search, int64_t hardware_flags,
+//                              const int num_bits, const uint8_t* bits, int* num_indexes,
+//                              uint16_t* indexes, int bit_offset = 0);
+//
+//  static void bits_filter_indexes(int bit_to_search, int64_t hardware_flags,
+//                                  const int num_bits, const uint8_t* bits,
+//                                  const uint16_t* input_indexes, int* num_indexes,
+//                                  uint16_t* indexes, int bit_offset = 0);
+//
+//  // Input and output indexes may be pointing to the same data (in-place filtering).
+//  static void bits_split_indexes(int64_t hardware_flags, const int num_bits,
+//                                 const uint8_t* bits, int* num_indexes_bit0,
+//                                 uint16_t* indexes_bit0, uint16_t* indexes_bit1,
+//                                 int bit_offset = 0);
+//
+//  // Bit 1 is replaced with byte 0xFF.
+//  static void bits_to_bytes(int64_t hardware_flags, const int num_bits,
+//                            const uint8_t* bits, uint8_t* bytes, int bit_offset = 0);
+//
+//  // Return highest bit of each byte.
+//  static void bytes_to_bits(int64_t hardware_flags, const int num_bits,
+//                            const uint8_t* bytes, uint8_t* bits, int bit_offset = 0);
+//
+//  static bool are_all_bytes_zero(int64_t hardware_flags, const uint8_t* bytes,
+//                                 uint32_t num_bytes);
+//
+// private:
+//  inline static uint64_t SafeLoadUpTo8Bytes(const uint8_t* bytes, int num_bytes);
+//  inline static void SafeStoreUpTo8Bytes(uint8_t* bytes, int num_bytes, uint64_t value);
+//  inline static void bits_to_indexes_helper(uint64_t word, uint16_t base_index,
+//                                            int* num_indexes, uint16_t* indexes);
+//  inline static void bits_filter_indexes_helper(uint64_t word,
+//                                                const uint16_t* input_indexes,
+//                                                int* num_indexes, uint16_t* indexes);
+//  template <int bit_to_search, bool filter_input_indexes>
+//  static void bits_to_indexes_internal(int64_t hardware_flags, const int num_bits,
+//                                       const uint8_t* bits, const uint16_t* input_indexes,
+//                                       int* num_indexes, uint16_t* indexes,
+//                                       uint16_t base_index = 0);
+//
+//#if defined(ARROW_HAVE_AVX2)
+//  static void bits_to_indexes_avx2(int bit_to_search, const int num_bits,
+//                                   const uint8_t* bits, int* num_indexes,
+//                                   uint16_t* indexes, uint16_t base_index = 0);
+//  static void bits_filter_indexes_avx2(int bit_to_search, const int num_bits,
+//                                       const uint8_t* bits, const uint16_t* input_indexes,
+//                                       int* num_indexes, uint16_t* indexes);
+//  template <int bit_to_search>
+//  static void bits_to_indexes_imp_avx2(const int num_bits, const uint8_t* bits,
+//                                       int* num_indexes, uint16_t* indexes,
+//                                       uint16_t base_index = 0);
+//  template <int bit_to_search>
+//  static void bits_filter_indexes_imp_avx2(const int num_bits, const uint8_t* bits,
+//                                           const uint16_t* input_indexes,
+//                                           int* num_indexes, uint16_t* indexes);
+//  static void bits_to_bytes_avx2(const int num_bits, const uint8_t* bits, uint8_t* bytes);
+//  static void bytes_to_bits_avx2(const int num_bits, const uint8_t* bytes, uint8_t* bits);
+//  static bool are_all_bytes_zero_avx2(const uint8_t* bytes, uint32_t num_bytes);
+//#endif
+//};
 
 }  // namespace util
 namespace compute {
diff --git a/cpp/src/arrow/compute/kernels/hash_aggregate.cc b/cpp/src/arrow/compute/kernels/hash_aggregate.cc
index eecfb054321..81fb2e871b5 100644
--- a/cpp/src/arrow/compute/kernels/hash_aggregate.cc
+++ b/cpp/src/arrow/compute/kernels/hash_aggregate.cc
@@ -30,7 +30,7 @@
 #include "arrow/compute/api_aggregate.h"
 #include "arrow/compute/api_vector.h"
 #include "arrow/compute/key_hash.h"
-#include "arrow/compute/exec/key_map.h"
+#include "arrow/compute/key_map.h"
 #include "arrow/compute/exec/util.h"
 #include "arrow/compute/exec_internal.h"
 #include "arrow/compute/kernel.h"
diff --git a/cpp/src/arrow/compute/exec/key_map.cc b/cpp/src/arrow/compute/key_map.cc
similarity index 94%
rename from cpp/src/arrow/compute/exec/key_map.cc
rename to cpp/src/arrow/compute/key_map.cc
index a61184e4ca9..46741602cd6 100644
--- a/cpp/src/arrow/compute/exec/key_map.cc
+++ b/cpp/src/arrow/compute/key_map.cc
@@ -15,7 +15,7 @@
 // specific language governing permissions and limitations
 // under the License.
 
-#include "arrow/compute/exec/key_map.h"
+#include "key_map.h"
 
 #include <memory.h>
 
@@ -25,6 +25,8 @@
 #include "arrow/util/bit_util.h"
 #include "arrow/util/bitmap_ops.h"
 #include "arrow/util/ubsan.h"
+#include "arrow/util/logging.h"
+
 
 namespace arrow {
 
@@ -830,5 +832,51 @@ void SwissTable::cleanup() {
   num_inserted_ = 0;
 }
 
+
+uint64_t SwissTable::extract_group_id(const uint8_t* block_ptr, int slot,
+                                      uint64_t group_id_mask) const {
+  // Group id values for all 8 slots in the block are bit-packed and follow the status
+  // bytes. We assume here that the number of bits is rounded up to 8, 16, 32 or 64. In
+  // that case we can extract group id using aligned 64-bit word access.
+  int num_group_id_bits = static_cast<int>(ARROW_POPCOUNT64(group_id_mask));
+  ARROW_DCHECK(num_group_id_bits == 8 || num_group_id_bits == 16 ||
+      num_group_id_bits == 32 || num_group_id_bits == 64);
+
+  int bit_offset = slot * num_group_id_bits;
+  const uint64_t* group_id_bytes =
+      reinterpret_cast<const uint64_t*>(block_ptr) + 1 + (bit_offset >> 6);
+  uint64_t group_id = (*group_id_bytes >> (bit_offset & 63)) & group_id_mask;
+
+  return group_id;
+}
+
+void SwissTable::insert_into_empty_slot(uint32_t slot_id, uint32_t hash,
+                                        uint32_t group_id) {
+  const uint64_t num_groupid_bits = num_groupid_bits_from_log_blocks(log_blocks_);
+
+  // We assume here that the number of bits is rounded up to 8, 16, 32 or 64.
+  // In that case we can insert group id value using aligned 64-bit word access.
+  ARROW_DCHECK(num_groupid_bits == 8 || num_groupid_bits == 16 ||
+      num_groupid_bits == 32 || num_groupid_bits == 64);
+
+  const uint64_t num_block_bytes = (8 + num_groupid_bits);
+  constexpr uint64_t stamp_mask = 0x7f;
+
+  int start_slot = (slot_id & 7);
+  int stamp =
+      static_cast<int>((hash >> (bits_hash_ - log_blocks_ - bits_stamp_)) & stamp_mask);
+  uint64_t block_id = slot_id >> 3;
+  uint8_t* blockbase = blocks_ + num_block_bytes * block_id;
+
+  blockbase[7 - start_slot] = static_cast<uint8_t>(stamp);
+  int groupid_bit_offset = static_cast<int>(start_slot * num_groupid_bits);
+
+  // Block status bytes should start at an address aligned to 8 bytes
+  ARROW_DCHECK((reinterpret_cast<uint64_t>(blockbase) & 7) == 0);
+  uint64_t* ptr = reinterpret_cast<uint64_t*>(blockbase) + 1 + (groupid_bit_offset >> 6);
+  *ptr |= (static_cast<uint64_t>(group_id) << (groupid_bit_offset & 63));
+}
+
+
 }  // namespace compute
 }  // namespace arrow
diff --git a/cpp/src/arrow/compute/exec/key_map.h b/cpp/src/arrow/compute/key_map.h
similarity index 80%
rename from cpp/src/arrow/compute/exec/key_map.h
rename to cpp/src/arrow/compute/key_map.h
index cc630e0b1c3..c9a3a9dc100 100644
--- a/cpp/src/arrow/compute/exec/key_map.h
+++ b/cpp/src/arrow/compute/key_map.h
@@ -19,7 +19,7 @@
 
 #include <functional>
 
-#include "arrow/compute/exec/util.h"
+#include "arrow/compute/util.h"
 #include "arrow/memory_pool.h"
 #include "arrow/result.h"
 #include "arrow/status.h"
@@ -107,7 +107,7 @@ class SwissTable {
 
   /// \brief Extract group id for a given slot in a given block.
   ///
-  inline uint64_t extract_group_id(const uint8_t* block_ptr, int slot,
+  uint64_t extract_group_id(const uint8_t* block_ptr, int slot,
                                    uint64_t group_id_mask) const;
   void extract_group_ids(const int num_keys, const uint16_t* optional_selection,
                          const uint32_t* hashes, const uint8_t* local_slots,
@@ -159,7 +159,7 @@ class SwissTable {
   inline bool find_next_stamp_match(const uint32_t hash, const uint32_t in_slot_id,
                                     uint32_t* out_slot_id, uint32_t* out_group_id) const;
 
-  inline void insert_into_empty_slot(uint32_t slot_id, uint32_t hash, uint32_t group_id);
+  void insert_into_empty_slot(uint32_t slot_id, uint32_t hash, uint32_t group_id);
 
   // Slow processing of input keys in the most generic case.
   // Handles inserting new keys.
@@ -227,49 +227,5 @@ class SwissTable {
   MemoryPool* pool_;
 };
 
-uint64_t SwissTable::extract_group_id(const uint8_t* block_ptr, int slot,
-                                      uint64_t group_id_mask) const {
-  // Group id values for all 8 slots in the block are bit-packed and follow the status
-  // bytes. We assume here that the number of bits is rounded up to 8, 16, 32 or 64. In
-  // that case we can extract group id using aligned 64-bit word access.
-  int num_group_id_bits = static_cast<int>(ARROW_POPCOUNT64(group_id_mask));
-  ARROW_DCHECK(num_group_id_bits == 8 || num_group_id_bits == 16 ||
-               num_group_id_bits == 32 || num_group_id_bits == 64);
-
-  int bit_offset = slot * num_group_id_bits;
-  const uint64_t* group_id_bytes =
-      reinterpret_cast<const uint64_t*>(block_ptr) + 1 + (bit_offset >> 6);
-  uint64_t group_id = (*group_id_bytes >> (bit_offset & 63)) & group_id_mask;
-
-  return group_id;
-}
-
-void SwissTable::insert_into_empty_slot(uint32_t slot_id, uint32_t hash,
-                                        uint32_t group_id) {
-  const uint64_t num_groupid_bits = num_groupid_bits_from_log_blocks(log_blocks_);
-
-  // We assume here that the number of bits is rounded up to 8, 16, 32 or 64.
-  // In that case we can insert group id value using aligned 64-bit word access.
-  ARROW_DCHECK(num_groupid_bits == 8 || num_groupid_bits == 16 ||
-               num_groupid_bits == 32 || num_groupid_bits == 64);
-
-  const uint64_t num_block_bytes = (8 + num_groupid_bits);
-  constexpr uint64_t stamp_mask = 0x7f;
-
-  int start_slot = (slot_id & 7);
-  int stamp =
-      static_cast<int>((hash >> (bits_hash_ - log_blocks_ - bits_stamp_)) & stamp_mask);
-  uint64_t block_id = slot_id >> 3;
-  uint8_t* blockbase = blocks_ + num_block_bytes * block_id;
-
-  blockbase[7 - start_slot] = static_cast<uint8_t>(stamp);
-  int groupid_bit_offset = static_cast<int>(start_slot * num_groupid_bits);
-
-  // Block status bytes should start at an address aligned to 8 bytes
-  ARROW_DCHECK((reinterpret_cast<uint64_t>(blockbase) & 7) == 0);
-  uint64_t* ptr = reinterpret_cast<uint64_t*>(blockbase) + 1 + (groupid_bit_offset >> 6);
-  *ptr |= (static_cast<uint64_t>(group_id) << (groupid_bit_offset & 63));
-}
-
 }  // namespace compute
 }  // namespace arrow
diff --git a/cpp/src/arrow/compute/exec/key_map_avx2.cc b/cpp/src/arrow/compute/key_map_avx2.cc
similarity index 99%
rename from cpp/src/arrow/compute/exec/key_map_avx2.cc
rename to cpp/src/arrow/compute/key_map_avx2.cc
index 4c77f3af237..102e28eca41 100644
--- a/cpp/src/arrow/compute/exec/key_map_avx2.cc
+++ b/cpp/src/arrow/compute/key_map_avx2.cc
@@ -17,7 +17,7 @@
 
 #include <immintrin.h>
 
-#include "arrow/compute/exec/key_map.h"
+#include "arrow/compute/key_map.h"
 
 namespace arrow {
 namespace compute {
diff --git a/cpp/src/arrow/compute/row/compare_internal.cc b/cpp/src/arrow/compute/row/compare_internal.cc
index 750012e60e2..2216add6849 100644
--- a/cpp/src/arrow/compute/row/compare_internal.cc
+++ b/cpp/src/arrow/compute/row/compare_internal.cc
@@ -22,7 +22,9 @@
 #include <algorithm>
 #include <cstdint>
 
-#include "arrow/compute/exec/util.h"
+// DIPO
+#include "arrow/compute/util.h"
+#include "arrow/compute/util_internal.h"
 #include "arrow/util/bit_util.h"
 #include "arrow/util/ubsan.h"
 
diff --git a/cpp/src/arrow/compute/row/compare_internal.h b/cpp/src/arrow/compute/row/compare_internal.h
index f9ec1e7f535..162eae19790 100644
--- a/cpp/src/arrow/compute/row/compare_internal.h
+++ b/cpp/src/arrow/compute/row/compare_internal.h
@@ -19,7 +19,8 @@
 
 #include <cstdint>
 
-#include "arrow/compute/exec/util.h"
+// DIPO
+#include "arrow/compute/util.h"
 #include "arrow/compute/light_array.h"
 #include "arrow/compute/row/encode_internal.h"
 #include "arrow/compute/row/row_internal.h"
diff --git a/cpp/src/arrow/compute/row/encode_internal.cc b/cpp/src/arrow/compute/row/encode_internal.cc
index 9d138258d66..8c0c0aac6a7 100644
--- a/cpp/src/arrow/compute/row/encode_internal.cc
+++ b/cpp/src/arrow/compute/row/encode_internal.cc
@@ -16,7 +16,8 @@
 // under the License.
 
 #include "arrow/compute/row/encode_internal.h"
-#include "arrow/compute/exec.h"
+//#include "arrow/compute/exec.h"
+//DIPO
 #include "arrow/util/checked_cast.h"
 
 namespace arrow {
diff --git a/cpp/src/arrow/compute/row/encode_internal.h b/cpp/src/arrow/compute/row/encode_internal.h
index 970537a3067..58d4abd3e83 100644
--- a/cpp/src/arrow/compute/row/encode_internal.h
+++ b/cpp/src/arrow/compute/row/encode_internal.h
@@ -22,8 +22,10 @@
 #include <vector>
 
 #include "arrow/array/data.h"
-#include "arrow/compute/exec.h"
-#include "arrow/compute/exec/util.h"
+//#include "arrow/compute/exec.h"
+// DIPO
+#include "arrow/compute/key_map.h"
+#include "arrow/compute/util.h"
 #include "arrow/compute/light_array.h"
 #include "arrow/compute/row/row_internal.h"
 #include "arrow/memory_pool.h"
diff --git a/cpp/src/arrow/compute/row/grouper.cc b/cpp/src/arrow/compute/row/grouper.cc
index 579c448319a..c879e7c7403 100644
--- a/cpp/src/arrow/compute/row/grouper.cc
+++ b/cpp/src/arrow/compute/row/grouper.cc
@@ -21,9 +21,10 @@
 #include <mutex>
 
 #include "arrow/compute/key_hash.h"
-#include "arrow/compute/exec/key_map.h"
-#include "arrow/compute/exec/options.h"
-#include "arrow/compute/exec_internal.h"
+// DIPO
+//#include "arrow/compute/exec/key_map.h"
+//#include "arrow/compute/exec/options.h"
+//#include "arrow/compute/exec_internal.h"
 #include "arrow/compute/function.h"
 #include "arrow/compute/kernels/row_encoder_internal.h"
 #include "arrow/compute/light_array.h"
diff --git a/cpp/src/arrow/compute/row/grouper.h b/cpp/src/arrow/compute/row/grouper.h
index ce09adf09b3..0ab5bba5f3e 100644
--- a/cpp/src/arrow/compute/row/grouper.h
+++ b/cpp/src/arrow/compute/row/grouper.h
@@ -20,8 +20,9 @@
 #include <memory>
 #include <vector>
 
-#include "arrow/compute/exec.h"
-#include "arrow/compute/exec/options.h"
+//#include "arrow/compute/exec.h"
+//#include "arrow/compute/exec/options.h"
+#include "arrow/compute/key_map.h"
 #include "arrow/compute/kernel.h"
 #include "arrow/datum.h"
 #include "arrow/result.h"
diff --git a/cpp/src/arrow/compute/row/row_internal.cc b/cpp/src/arrow/compute/row/row_internal.cc
index 11a8a0bc436..52e0c9cb6c7 100644
--- a/cpp/src/arrow/compute/row/row_internal.cc
+++ b/cpp/src/arrow/compute/row/row_internal.cc
@@ -17,7 +17,8 @@
 
 #include "arrow/compute/row/row_internal.h"
 
-#include "arrow/compute/exec/util.h"
+// DIPO
+#include "arrow/compute/util.h"
 
 namespace arrow {
 namespace compute {
diff --git a/cpp/src/arrow/compute/util.cc b/cpp/src/arrow/compute/util.cc
new file mode 100644
index 00000000000..572a52a5f19
--- /dev/null
+++ b/cpp/src/arrow/compute/util.cc
@@ -0,0 +1,363 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/compute/util.h"
+
+#include "arrow/table.h"
+#include "arrow/util/bit_util.h"
+#include "arrow/util/bitmap_ops.h"
+#include "arrow/util/tracing_internal.h"
+#include "arrow/util/ubsan.h"
+#include "arrow/util/logging.h"
+
+namespace arrow {
+
+using bit_util::CountTrailingZeros;
+
+namespace util {
+
+void TempVectorStack::alloc(uint32_t num_bytes, uint8_t **data, int *id) {
+  int64_t old_top = top_;
+  top_ += PaddedAllocationSize(num_bytes) + 2 * sizeof(uint64_t);
+  // Stack overflow check
+      ARROW_DCHECK(top_ <= buffer_size_);
+  *data = buffer_->mutable_data() + old_top + sizeof(uint64_t);
+  // We set 8 bytes before the beginning of the allocated range and
+  // 8 bytes after the end to check for stack overflow (which would
+  // result in those known bytes being corrupted).
+  reinterpret_cast<uint64_t *>(buffer_->mutable_data() + old_top)[0] = kGuard1;
+  reinterpret_cast<uint64_t *>(buffer_->mutable_data() + top_)[-1] = kGuard2;
+  *id = num_vectors_++;
+}
+
+void TempVectorStack::release(int id, uint32_t num_bytes) {
+      ARROW_DCHECK(num_vectors_ == id + 1);
+  int64_t size = PaddedAllocationSize(num_bytes) + 2 * sizeof(uint64_t);
+      ARROW_DCHECK(reinterpret_cast<const uint64_t *>(buffer_->mutable_data() + top_)[-1] ==
+      kGuard2);
+      ARROW_DCHECK(top_ >= size);
+  top_ -= size;
+      ARROW_DCHECK(reinterpret_cast<const uint64_t *>(buffer_->mutable_data() + top_)[0] ==
+      kGuard1);
+  --num_vectors_;
+}
+
+inline uint64_t bit_util::SafeLoadUpTo8Bytes(const uint8_t* bytes, int num_bytes) {
+  // This will not be correct on big-endian architectures.
+#if !ARROW_LITTLE_ENDIAN
+  ARROW_DCHECK(false);
+#endif
+  ARROW_DCHECK(num_bytes >= 0 && num_bytes <= 8);
+  if (num_bytes == 8) {
+    return util::SafeLoad(reinterpret_cast<const uint64_t*>(bytes));
+  } else {
+    uint64_t word = 0;
+    for (int i = 0; i < num_bytes; ++i) {
+      word |= static_cast<uint64_t>(bytes[i]) << (8 * i);
+    }
+    return word;
+  }
+}
+
+inline void bit_util::SafeStoreUpTo8Bytes(uint8_t* bytes, int num_bytes, uint64_t value) {
+  // This will not be correct on big-endian architectures.
+#if !ARROW_LITTLE_ENDIAN
+  ARROW_DCHECK(false);
+#endif
+  ARROW_DCHECK(num_bytes >= 0 && num_bytes <= 8);
+  if (num_bytes == 8) {
+    util::SafeStore(reinterpret_cast<uint64_t*>(bytes), value);
+  } else {
+    for (int i = 0; i < num_bytes; ++i) {
+      bytes[i] = static_cast<uint8_t>(value >> (8 * i));
+    }
+  }
+}
+
+inline void bit_util::bits_to_indexes_helper(uint64_t word, uint16_t base_index,
+                                             int* num_indexes, uint16_t* indexes) {
+  int n = *num_indexes;
+  while (word) {
+    indexes[n++] = base_index + static_cast<uint16_t>(CountTrailingZeros(word));
+    word &= word - 1;
+  }
+  *num_indexes = n;
+}
+
+inline void bit_util::bits_filter_indexes_helper(uint64_t word,
+                                                 const uint16_t* input_indexes,
+                                                 int* num_indexes, uint16_t* indexes) {
+  int n = *num_indexes;
+  while (word) {
+    indexes[n++] = input_indexes[CountTrailingZeros(word)];
+    word &= word - 1;
+  }
+  *num_indexes = n;
+}
+
+template <int bit_to_search, bool filter_input_indexes>
+void bit_util::bits_to_indexes_internal(int64_t hardware_flags, const int num_bits,
+                                        const uint8_t* bits,
+                                        const uint16_t* input_indexes, int* num_indexes,
+                                        uint16_t* indexes, uint16_t base_index) {
+  // 64 bits at a time
+  constexpr int unroll = 64;
+  int tail = num_bits % unroll;
+#if defined(ARROW_HAVE_AVX2)
+  if (hardware_flags & arrow::internal::CpuInfo::AVX2) {
+    if (filter_input_indexes) {
+      bits_filter_indexes_avx2(bit_to_search, num_bits - tail, bits, input_indexes,
+                               num_indexes, indexes);
+    } else {
+      bits_to_indexes_avx2(bit_to_search, num_bits - tail, bits, num_indexes, indexes,
+                           base_index);
+    }
+  } else {
+#endif
+    *num_indexes = 0;
+    for (int i = 0; i < num_bits / unroll; ++i) {
+      uint64_t word = util::SafeLoad(&reinterpret_cast<const uint64_t*>(bits)[i]);
+      if (bit_to_search == 0) {
+        word = ~word;
+      }
+      if (filter_input_indexes) {
+        bits_filter_indexes_helper(word, input_indexes + i * 64, num_indexes, indexes);
+      } else {
+        bits_to_indexes_helper(word, i * 64 + base_index, num_indexes, indexes);
+      }
+    }
+#if defined(ARROW_HAVE_AVX2)
+  }
+#endif
+  // Optionally process the last partial word with masking out bits outside range
+  if (tail) {
+    const uint8_t* bits_tail = bits + (num_bits - tail) / 8;
+    uint64_t word = SafeLoadUpTo8Bytes(bits_tail, (tail + 7) / 8);
+    if (bit_to_search == 0) {
+      word = ~word;
+    }
+    word &= ~0ULL >> (64 - tail);
+    if (filter_input_indexes) {
+      bits_filter_indexes_helper(word, input_indexes + num_bits - tail, num_indexes,
+                                 indexes);
+    } else {
+      bits_to_indexes_helper(word, num_bits - tail + base_index, num_indexes, indexes);
+    }
+  }
+}
+
+void bit_util::bits_to_indexes(int bit_to_search, int64_t hardware_flags, int num_bits,
+                               const uint8_t* bits, int* num_indexes, uint16_t* indexes,
+                               int bit_offset) {
+  bits += bit_offset / 8;
+  bit_offset %= 8;
+  *num_indexes = 0;
+  uint16_t base_index = 0;
+  if (bit_offset != 0) {
+    uint64_t bits_head = bits[0] >> bit_offset;
+    int bits_in_first_byte = std::min(num_bits, 8 - bit_offset);
+    bits_to_indexes(bit_to_search, hardware_flags, bits_in_first_byte,
+                    reinterpret_cast<const uint8_t*>(&bits_head), num_indexes, indexes);
+    if (num_bits <= bits_in_first_byte) {
+      return;
+    }
+    num_bits -= bits_in_first_byte;
+    indexes += *num_indexes;
+    bits += 1;
+    base_index = bits_in_first_byte;
+  }
+
+  int num_indexes_new = 0;
+  if (bit_to_search == 0) {
+    bits_to_indexes_internal<0, false>(hardware_flags, num_bits, bits, nullptr,
+                                       &num_indexes_new, indexes, base_index);
+  } else {
+    ARROW_DCHECK(bit_to_search == 1);
+    bits_to_indexes_internal<1, false>(hardware_flags, num_bits, bits, nullptr,
+                                       &num_indexes_new, indexes, base_index);
+  }
+  *num_indexes += num_indexes_new;
+}
+
+void bit_util::bits_filter_indexes(int bit_to_search, int64_t hardware_flags,
+                                   const int num_bits, const uint8_t* bits,
+                                   const uint16_t* input_indexes, int* num_indexes,
+                                   uint16_t* indexes, int bit_offset) {
+  bits += bit_offset / 8;
+  bit_offset %= 8;
+  if (bit_offset != 0) {
+    int num_indexes_head = 0;
+    uint64_t bits_head = bits[0] >> bit_offset;
+    int bits_in_first_byte = std::min(num_bits, 8 - bit_offset);
+    bits_filter_indexes(bit_to_search, hardware_flags, bits_in_first_byte,
+                        reinterpret_cast<const uint8_t*>(&bits_head), input_indexes,
+                        &num_indexes_head, indexes);
+    int num_indexes_tail = 0;
+    if (num_bits > bits_in_first_byte) {
+      bits_filter_indexes(bit_to_search, hardware_flags, num_bits - bits_in_first_byte,
+                          bits + 1, input_indexes + bits_in_first_byte, &num_indexes_tail,
+                          indexes + num_indexes_head);
+    }
+    *num_indexes = num_indexes_head + num_indexes_tail;
+    return;
+  }
+
+  if (bit_to_search == 0) {
+    bits_to_indexes_internal<0, true>(hardware_flags, num_bits, bits, input_indexes,
+                                      num_indexes, indexes);
+  } else {
+    ARROW_DCHECK(bit_to_search == 1);
+    bits_to_indexes_internal<1, true>(hardware_flags, num_bits, bits, input_indexes,
+                                      num_indexes, indexes);
+  }
+}
+
+void bit_util::bits_split_indexes(int64_t hardware_flags, const int num_bits,
+                                  const uint8_t* bits, int* num_indexes_bit0,
+                                  uint16_t* indexes_bit0, uint16_t* indexes_bit1,
+                                  int bit_offset) {
+  bits_to_indexes(0, hardware_flags, num_bits, bits, num_indexes_bit0, indexes_bit0,
+                  bit_offset);
+  int num_indexes_bit1;
+  bits_to_indexes(1, hardware_flags, num_bits, bits, &num_indexes_bit1, indexes_bit1,
+                  bit_offset);
+}
+
+void bit_util::bits_to_bytes(int64_t hardware_flags, const int num_bits,
+                             const uint8_t* bits, uint8_t* bytes, int bit_offset) {
+  bits += bit_offset / 8;
+  bit_offset %= 8;
+  if (bit_offset != 0) {
+    uint64_t bits_head = bits[0] >> bit_offset;
+    int bits_in_first_byte = std::min(num_bits, 8 - bit_offset);
+    bits_to_bytes(hardware_flags, bits_in_first_byte,
+                  reinterpret_cast<const uint8_t*>(&bits_head), bytes);
+    if (num_bits > bits_in_first_byte) {
+      bits_to_bytes(hardware_flags, num_bits - bits_in_first_byte, bits + 1,
+                    bytes + bits_in_first_byte);
+    }
+    return;
+  }
+
+  int num_processed = 0;
+#if defined(ARROW_HAVE_AVX2)
+  if (hardware_flags & arrow::internal::CpuInfo::AVX2) {
+    // The function call below processes whole 32 bit chunks together.
+    num_processed = num_bits - (num_bits % 32);
+    bits_to_bytes_avx2(num_processed, bits, bytes);
+  }
+#endif
+  // Processing 8 bits at a time
+  constexpr int unroll = 8;
+  for (int i = num_processed / unroll; i < num_bits / unroll; ++i) {
+    uint8_t bits_next = bits[i];
+    // Clear the lowest bit and then make 8 copies of remaining 7 bits, each 7 bits apart
+    // from the previous.
+    uint64_t unpacked = static_cast<uint64_t>(bits_next & 0xfe) *
+                        ((1ULL << 7) | (1ULL << 14) | (1ULL << 21) | (1ULL << 28) |
+                         (1ULL << 35) | (1ULL << 42) | (1ULL << 49));
+    unpacked |= (bits_next & 1);
+    unpacked &= 0x0101010101010101ULL;
+    unpacked *= 255;
+    util::SafeStore(&reinterpret_cast<uint64_t*>(bytes)[i], unpacked);
+  }
+  int tail = num_bits % unroll;
+  if (tail) {
+    uint8_t bits_next = bits[(num_bits - tail) / unroll];
+    // Clear the lowest bit and then make 8 copies of remaining 7 bits, each 7 bits apart
+    // from the previous.
+    uint64_t unpacked = static_cast<uint64_t>(bits_next & 0xfe) *
+                        ((1ULL << 7) | (1ULL << 14) | (1ULL << 21) | (1ULL << 28) |
+                         (1ULL << 35) | (1ULL << 42) | (1ULL << 49));
+    unpacked |= (bits_next & 1);
+    unpacked &= 0x0101010101010101ULL;
+    unpacked *= 255;
+    SafeStoreUpTo8Bytes(bytes + num_bits - tail, tail, unpacked);
+  }
+}
+
+void bit_util::bytes_to_bits(int64_t hardware_flags, const int num_bits,
+                             const uint8_t* bytes, uint8_t* bits, int bit_offset) {
+  bits += bit_offset / 8;
+  bit_offset %= 8;
+  if (bit_offset != 0) {
+    uint64_t bits_head;
+    int bits_in_first_byte = std::min(num_bits, 8 - bit_offset);
+    bytes_to_bits(hardware_flags, bits_in_first_byte, bytes,
+                  reinterpret_cast<uint8_t*>(&bits_head));
+    uint8_t mask = (1 << bit_offset) - 1;
+    *bits = static_cast<uint8_t>((*bits & mask) | (bits_head << bit_offset));
+
+    if (num_bits > bits_in_first_byte) {
+      bytes_to_bits(hardware_flags, num_bits - bits_in_first_byte,
+                    bytes + bits_in_first_byte, bits + 1);
+    }
+    return;
+  }
+
+  int num_processed = 0;
+#if defined(ARROW_HAVE_AVX2)
+  if (hardware_flags & arrow::internal::CpuInfo::AVX2) {
+    // The function call below processes whole 32 bit chunks together.
+    num_processed = num_bits - (num_bits % 32);
+    bytes_to_bits_avx2(num_processed, bytes, bits);
+  }
+#endif
+  // Process 8 bits at a time
+  constexpr int unroll = 8;
+  for (int i = num_processed / unroll; i < num_bits / unroll; ++i) {
+    uint64_t bytes_next = util::SafeLoad(&reinterpret_cast<const uint64_t*>(bytes)[i]);
+    bytes_next &= 0x0101010101010101ULL;
+    bytes_next |= (bytes_next >> 7);  // Pairs of adjacent output bits in individual bytes
+    bytes_next |= (bytes_next >> 14);  // 4 adjacent output bits in individual bytes
+    bytes_next |= (bytes_next >> 28);  // All 8 output bits in the lowest byte
+    bits[i] = static_cast<uint8_t>(bytes_next & 0xff);
+  }
+  int tail = num_bits % unroll;
+  if (tail) {
+    uint64_t bytes_next = SafeLoadUpTo8Bytes(bytes + num_bits - tail, tail);
+    bytes_next &= 0x0101010101010101ULL;
+    bytes_next |= (bytes_next >> 7);  // Pairs of adjacent output bits in individual bytes
+    bytes_next |= (bytes_next >> 14);  // 4 adjacent output bits in individual bytes
+    bytes_next |= (bytes_next >> 28);  // All 8 output bits in the lowest byte
+    bits[num_bits / 8] = static_cast<uint8_t>(bytes_next & 0xff);
+  }
+}
+
+bool bit_util::are_all_bytes_zero(int64_t hardware_flags, const uint8_t* bytes,
+                                  uint32_t num_bytes) {
+#if defined(ARROW_HAVE_AVX2)
+  if (hardware_flags & arrow::internal::CpuInfo::AVX2) {
+    return are_all_bytes_zero_avx2(bytes, num_bytes);
+  }
+#endif
+  uint64_t result_or = 0;
+  uint32_t i;
+  for (i = 0; i < num_bytes / 8; ++i) {
+    uint64_t x = util::SafeLoad(&reinterpret_cast<const uint64_t*>(bytes)[i]);
+    result_or |= x;
+  }
+  if (num_bytes % 8 > 0) {
+    uint64_t tail = 0;
+    result_or |= memcmp(bytes + i * 8, &tail, num_bytes % 8);
+  }
+  return result_or == 0;
+}
+
+}  // namespace util
+
+}  // namespace arrow
diff --git a/cpp/src/arrow/compute/util.h b/cpp/src/arrow/compute/util.h
index 42477857a18..8447d28261c 100644
--- a/cpp/src/arrow/compute/util.h
+++ b/cpp/src/arrow/compute/util.h
@@ -31,7 +31,6 @@
 #include "arrow/status.h"
 #include "arrow/util/bit_util.h"
 #include "arrow/util/cpu_info.h"
-#include "arrow/util/logging.h"
 #include "arrow/util/mutex.h"
 #include "arrow/util/thread_pool.h"
 #include "arrow/util/type_fwd.h"
@@ -57,10 +56,25 @@
 namespace arrow {
 namespace util {
 
-template<typename T>
-inline void CheckAlignment(const void *ptr) {
-      ARROW_DCHECK(reinterpret_cast<uint64_t>(ptr) % sizeof(T) == 0);
-}
+// Some platforms typedef int64_t as long int instead of long long int,
+// which breaks the _mm256_i64gather_epi64 and _mm256_i32gather_epi64 intrinsics
+// which need long long.
+// We use the cast to the type below in these intrinsics to make the code
+// compile in all cases.
+//
+using int64_for_gather_t = const long long int;  // NOLINT runtime-int
+
+// All MiniBatch... classes use TempVectorStack for vector allocations and can
+// only work with vectors up to 1024 elements.
+//
+// They should only be allocated on the stack to guarantee the right sequence
+// of allocation and deallocation of vectors from TempVectorStack.
+//
+class MiniBatch {
+ public:
+  static constexpr int kLogMiniBatchLength = 10;
+  static constexpr int kMiniBatchLength = 1 << kLogMiniBatchLength;
+};
 
 /// Storage used to allocate temporary vectors of a batch size.
 /// Temporary vectors should resemble allocating temporary variables on the stack
@@ -94,30 +108,8 @@ class TempVectorStack {
     //
     return ::arrow::bit_util::RoundUp(num_bytes, sizeof(int64_t)) + kPadding;
   }
-  void alloc(uint32_t num_bytes, uint8_t **data, int *id) {
-    int64_t old_top = top_;
-    top_ += PaddedAllocationSize(num_bytes) + 2 * sizeof(uint64_t);
-    // Stack overflow check
-        ARROW_DCHECK(top_ <= buffer_size_);
-    *data = buffer_->mutable_data() + old_top + sizeof(uint64_t);
-    // We set 8 bytes before the beginning of the allocated range and
-    // 8 bytes after the end to check for stack overflow (which would
-    // result in those known bytes being corrupted).
-    reinterpret_cast<uint64_t *>(buffer_->mutable_data() + old_top)[0] = kGuard1;
-    reinterpret_cast<uint64_t *>(buffer_->mutable_data() + top_)[-1] = kGuard2;
-    *id = num_vectors_++;
-  }
-  void release(int id, uint32_t num_bytes) {
-        ARROW_DCHECK(num_vectors_ == id + 1);
-    int64_t size = PaddedAllocationSize(num_bytes) + 2 * sizeof(uint64_t);
-        ARROW_DCHECK(reinterpret_cast<const uint64_t *>(buffer_->mutable_data() + top_)[-1] ==
-        kGuard2);
-        ARROW_DCHECK(top_ >= size);
-    top_ -= size;
-        ARROW_DCHECK(reinterpret_cast<const uint64_t *>(buffer_->mutable_data() + top_)[0] ==
-        kGuard1);
-    --num_vectors_;
-  }
+  void alloc(uint32_t num_bytes, uint8_t **data, int *id);
+  void release(int id, uint32_t num_bytes);
   static constexpr uint64_t kGuard1 = 0x3141592653589793ULL;
   static constexpr uint64_t kGuard2 = 0x0577215664901532ULL;
   static constexpr int64_t kPadding = 64;
@@ -127,5 +119,88 @@ class TempVectorStack {
   int64_t buffer_size_;
 };
 
+template <typename T>
+class TempVectorHolder {
+  friend class TempVectorStack;
+
+ public:
+  ~TempVectorHolder() { stack_->release(id_, num_elements_ * sizeof(T)); }
+  T* mutable_data() { return reinterpret_cast<T*>(data_); }
+  TempVectorHolder(TempVectorStack* stack, uint32_t num_elements) {
+    stack_ = stack;
+    num_elements_ = num_elements;
+    stack_->alloc(num_elements * sizeof(T), &data_, &id_);
+  }
+
+ private:
+  TempVectorStack* stack_;
+  uint8_t* data_;
+  int id_;
+  uint32_t num_elements_;
+};
+
+class bit_util {
+ public:
+  static void bits_to_indexes(int bit_to_search, int64_t hardware_flags,
+                              const int num_bits, const uint8_t* bits, int* num_indexes,
+                              uint16_t* indexes, int bit_offset = 0);
+
+  static void bits_filter_indexes(int bit_to_search, int64_t hardware_flags,
+                                  const int num_bits, const uint8_t* bits,
+                                  const uint16_t* input_indexes, int* num_indexes,
+                                  uint16_t* indexes, int bit_offset = 0);
+
+  // Input and output indexes may be pointing to the same data (in-place filtering).
+  static void bits_split_indexes(int64_t hardware_flags, const int num_bits,
+                                 const uint8_t* bits, int* num_indexes_bit0,
+                                 uint16_t* indexes_bit0, uint16_t* indexes_bit1,
+                                 int bit_offset = 0);
+
+  // Bit 1 is replaced with byte 0xFF.
+  static void bits_to_bytes(int64_t hardware_flags, const int num_bits,
+                            const uint8_t* bits, uint8_t* bytes, int bit_offset = 0);
+
+  // Return highest bit of each byte.
+  static void bytes_to_bits(int64_t hardware_flags, const int num_bits,
+                            const uint8_t* bytes, uint8_t* bits, int bit_offset = 0);
+
+  static bool are_all_bytes_zero(int64_t hardware_flags, const uint8_t* bytes,
+                                 uint32_t num_bytes);
+
+ private:
+  inline static uint64_t SafeLoadUpTo8Bytes(const uint8_t* bytes, int num_bytes);
+  inline static void SafeStoreUpTo8Bytes(uint8_t* bytes, int num_bytes, uint64_t value);
+  inline static void bits_to_indexes_helper(uint64_t word, uint16_t base_index,
+                                            int* num_indexes, uint16_t* indexes);
+  inline static void bits_filter_indexes_helper(uint64_t word,
+                                                const uint16_t* input_indexes,
+                                                int* num_indexes, uint16_t* indexes);
+  template <int bit_to_search, bool filter_input_indexes>
+  static void bits_to_indexes_internal(int64_t hardware_flags, const int num_bits,
+                                       const uint8_t* bits, const uint16_t* input_indexes,
+                                       int* num_indexes, uint16_t* indexes,
+                                       uint16_t base_index = 0);
+
+#if defined(ARROW_HAVE_AVX2)
+  static void bits_to_indexes_avx2(int bit_to_search, const int num_bits,
+                                   const uint8_t* bits, int* num_indexes,
+                                   uint16_t* indexes, uint16_t base_index = 0);
+  static void bits_filter_indexes_avx2(int bit_to_search, const int num_bits,
+                                       const uint8_t* bits, const uint16_t* input_indexes,
+                                       int* num_indexes, uint16_t* indexes);
+  template <int bit_to_search>
+  static void bits_to_indexes_imp_avx2(const int num_bits, const uint8_t* bits,
+                                       int* num_indexes, uint16_t* indexes,
+                                       uint16_t base_index = 0);
+  template <int bit_to_search>
+  static void bits_filter_indexes_imp_avx2(const int num_bits, const uint8_t* bits,
+                                           const uint16_t* input_indexes,
+                                           int* num_indexes, uint16_t* indexes);
+  static void bits_to_bytes_avx2(const int num_bits, const uint8_t* bits, uint8_t* bytes);
+  static void bytes_to_bits_avx2(const int num_bits, const uint8_t* bytes, uint8_t* bits);
+  static bool are_all_bytes_zero_avx2(const uint8_t* bytes, uint32_t num_bytes);
+#endif
+};
+
 }
 }
\ No newline at end of file
diff --git a/cpp/src/arrow/compute/util_internal.h b/cpp/src/arrow/compute/util_internal.h
new file mode 100644
index 00000000000..9d45ede940f
--- /dev/null
+++ b/cpp/src/arrow/compute/util_internal.h
@@ -0,0 +1,30 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include "arrow/util/logging.h"
+
+namespace arrow {
+namespace util {
+
+template<typename T> void CheckAlignment(const void *ptr) {
+      ARROW_DCHECK(reinterpret_cast<uint64_t>(ptr) % sizeof(T) == 0);
+}
+
+}
+}

From f251a4545239411ecbe0edddf1e550364c93073b Mon Sep 17 00:00:00 2001
From: Davide Pasetto <dpasetto69@gmail.com>
Date: Wed, 8 Mar 2023 09:22:31 -0500
Subject: [PATCH 03/11] some cleanup

---
 cpp/src/arrow/compute/exec/util.cc            | 309 ------------------
 cpp/src/arrow/compute/exec/util.h             | 196 -----------
 cpp/src/arrow/compute/key_map.cc              |  49 ---
 cpp/src/arrow/compute/key_map.h               |  49 ++-
 cpp/src/arrow/compute/light_array.h           |   1 -
 cpp/src/arrow/compute/row/compare_internal.cc |   1 -
 cpp/src/arrow/compute/row/compare_internal.h  |   1 -
 cpp/src/arrow/compute/row/encode_internal.cc  |   2 -
 cpp/src/arrow/compute/row/encode_internal.h   |   2 -
 cpp/src/arrow/compute/row/grouper.cc          |   4 -
 cpp/src/arrow/compute/row/grouper.h           |   2 -
 cpp/src/arrow/compute/row/row_internal.cc     |   1 -
 12 files changed, 47 insertions(+), 570 deletions(-)

diff --git a/cpp/src/arrow/compute/exec/util.cc b/cpp/src/arrow/compute/exec/util.cc
index 98c193a8920..6a1fd37aa19 100644
--- a/cpp/src/arrow/compute/exec/util.cc
+++ b/cpp/src/arrow/compute/exec/util.cc
@@ -25,315 +25,6 @@
 #include "arrow/util/ubsan.h"
 
 namespace arrow {
-
-//using bit_util::CountTrailingZeros;
-
-namespace util {
-//
-//inline uint64_t bit_util::SafeLoadUpTo8Bytes(const uint8_t* bytes, int num_bytes) {
-//  // This will not be correct on big-endian architectures.
-//#if !ARROW_LITTLE_ENDIAN
-//  ARROW_DCHECK(false);
-//#endif
-//  ARROW_DCHECK(num_bytes >= 0 && num_bytes <= 8);
-//  if (num_bytes == 8) {
-//    return util::SafeLoad(reinterpret_cast<const uint64_t*>(bytes));
-//  } else {
-//    uint64_t word = 0;
-//    for (int i = 0; i < num_bytes; ++i) {
-//      word |= static_cast<uint64_t>(bytes[i]) << (8 * i);
-//    }
-//    return word;
-//  }
-//}
-//
-//inline void bit_util::SafeStoreUpTo8Bytes(uint8_t* bytes, int num_bytes, uint64_t value) {
-//  // This will not be correct on big-endian architectures.
-//#if !ARROW_LITTLE_ENDIAN
-//  ARROW_DCHECK(false);
-//#endif
-//  ARROW_DCHECK(num_bytes >= 0 && num_bytes <= 8);
-//  if (num_bytes == 8) {
-//    util::SafeStore(reinterpret_cast<uint64_t*>(bytes), value);
-//  } else {
-//    for (int i = 0; i < num_bytes; ++i) {
-//      bytes[i] = static_cast<uint8_t>(value >> (8 * i));
-//    }
-//  }
-//}
-//
-//inline void bit_util::bits_to_indexes_helper(uint64_t word, uint16_t base_index,
-//                                             int* num_indexes, uint16_t* indexes) {
-//  int n = *num_indexes;
-//  while (word) {
-//    indexes[n++] = base_index + static_cast<uint16_t>(CountTrailingZeros(word));
-//    word &= word - 1;
-//  }
-//  *num_indexes = n;
-//}
-//
-//inline void bit_util::bits_filter_indexes_helper(uint64_t word,
-//                                                 const uint16_t* input_indexes,
-//                                                 int* num_indexes, uint16_t* indexes) {
-//  int n = *num_indexes;
-//  while (word) {
-//    indexes[n++] = input_indexes[CountTrailingZeros(word)];
-//    word &= word - 1;
-//  }
-//  *num_indexes = n;
-//}
-//
-//template <int bit_to_search, bool filter_input_indexes>
-//void bit_util::bits_to_indexes_internal(int64_t hardware_flags, const int num_bits,
-//                                        const uint8_t* bits,
-//                                        const uint16_t* input_indexes, int* num_indexes,
-//                                        uint16_t* indexes, uint16_t base_index) {
-//  // 64 bits at a time
-//  constexpr int unroll = 64;
-//  int tail = num_bits % unroll;
-//#if defined(ARROW_HAVE_AVX2)
-//  if (hardware_flags & arrow::internal::CpuInfo::AVX2) {
-//    if (filter_input_indexes) {
-//      bits_filter_indexes_avx2(bit_to_search, num_bits - tail, bits, input_indexes,
-//                               num_indexes, indexes);
-//    } else {
-//      bits_to_indexes_avx2(bit_to_search, num_bits - tail, bits, num_indexes, indexes,
-//                           base_index);
-//    }
-//  } else {
-//#endif
-//    *num_indexes = 0;
-//    for (int i = 0; i < num_bits / unroll; ++i) {
-//      uint64_t word = util::SafeLoad(&reinterpret_cast<const uint64_t*>(bits)[i]);
-//      if (bit_to_search == 0) {
-//        word = ~word;
-//      }
-//      if (filter_input_indexes) {
-//        bits_filter_indexes_helper(word, input_indexes + i * 64, num_indexes, indexes);
-//      } else {
-//        bits_to_indexes_helper(word, i * 64 + base_index, num_indexes, indexes);
-//      }
-//    }
-//#if defined(ARROW_HAVE_AVX2)
-//  }
-//#endif
-//  // Optionally process the last partial word with masking out bits outside range
-//  if (tail) {
-//    const uint8_t* bits_tail = bits + (num_bits - tail) / 8;
-//    uint64_t word = SafeLoadUpTo8Bytes(bits_tail, (tail + 7) / 8);
-//    if (bit_to_search == 0) {
-//      word = ~word;
-//    }
-//    word &= ~0ULL >> (64 - tail);
-//    if (filter_input_indexes) {
-//      bits_filter_indexes_helper(word, input_indexes + num_bits - tail, num_indexes,
-//                                 indexes);
-//    } else {
-//      bits_to_indexes_helper(word, num_bits - tail + base_index, num_indexes, indexes);
-//    }
-//  }
-//}
-//
-//void bit_util::bits_to_indexes(int bit_to_search, int64_t hardware_flags, int num_bits,
-//                               const uint8_t* bits, int* num_indexes, uint16_t* indexes,
-//                               int bit_offset) {
-//  bits += bit_offset / 8;
-//  bit_offset %= 8;
-//  *num_indexes = 0;
-//  uint16_t base_index = 0;
-//  if (bit_offset != 0) {
-//    uint64_t bits_head = bits[0] >> bit_offset;
-//    int bits_in_first_byte = std::min(num_bits, 8 - bit_offset);
-//    bits_to_indexes(bit_to_search, hardware_flags, bits_in_first_byte,
-//                    reinterpret_cast<const uint8_t*>(&bits_head), num_indexes, indexes);
-//    if (num_bits <= bits_in_first_byte) {
-//      return;
-//    }
-//    num_bits -= bits_in_first_byte;
-//    indexes += *num_indexes;
-//    bits += 1;
-//    base_index = bits_in_first_byte;
-//  }
-//
-//  int num_indexes_new = 0;
-//  if (bit_to_search == 0) {
-//    bits_to_indexes_internal<0, false>(hardware_flags, num_bits, bits, nullptr,
-//                                       &num_indexes_new, indexes, base_index);
-//  } else {
-//    ARROW_DCHECK(bit_to_search == 1);
-//    bits_to_indexes_internal<1, false>(hardware_flags, num_bits, bits, nullptr,
-//                                       &num_indexes_new, indexes, base_index);
-//  }
-//  *num_indexes += num_indexes_new;
-//}
-//
-//void bit_util::bits_filter_indexes(int bit_to_search, int64_t hardware_flags,
-//                                   const int num_bits, const uint8_t* bits,
-//                                   const uint16_t* input_indexes, int* num_indexes,
-//                                   uint16_t* indexes, int bit_offset) {
-//  bits += bit_offset / 8;
-//  bit_offset %= 8;
-//  if (bit_offset != 0) {
-//    int num_indexes_head = 0;
-//    uint64_t bits_head = bits[0] >> bit_offset;
-//    int bits_in_first_byte = std::min(num_bits, 8 - bit_offset);
-//    bits_filter_indexes(bit_to_search, hardware_flags, bits_in_first_byte,
-//                        reinterpret_cast<const uint8_t*>(&bits_head), input_indexes,
-//                        &num_indexes_head, indexes);
-//    int num_indexes_tail = 0;
-//    if (num_bits > bits_in_first_byte) {
-//      bits_filter_indexes(bit_to_search, hardware_flags, num_bits - bits_in_first_byte,
-//                          bits + 1, input_indexes + bits_in_first_byte, &num_indexes_tail,
-//                          indexes + num_indexes_head);
-//    }
-//    *num_indexes = num_indexes_head + num_indexes_tail;
-//    return;
-//  }
-//
-//  if (bit_to_search == 0) {
-//    bits_to_indexes_internal<0, true>(hardware_flags, num_bits, bits, input_indexes,
-//                                      num_indexes, indexes);
-//  } else {
-//    ARROW_DCHECK(bit_to_search == 1);
-//    bits_to_indexes_internal<1, true>(hardware_flags, num_bits, bits, input_indexes,
-//                                      num_indexes, indexes);
-//  }
-//}
-//
-//void bit_util::bits_split_indexes(int64_t hardware_flags, const int num_bits,
-//                                  const uint8_t* bits, int* num_indexes_bit0,
-//                                  uint16_t* indexes_bit0, uint16_t* indexes_bit1,
-//                                  int bit_offset) {
-//  bits_to_indexes(0, hardware_flags, num_bits, bits, num_indexes_bit0, indexes_bit0,
-//                  bit_offset);
-//  int num_indexes_bit1;
-//  bits_to_indexes(1, hardware_flags, num_bits, bits, &num_indexes_bit1, indexes_bit1,
-//                  bit_offset);
-//}
-//
-//void bit_util::bits_to_bytes(int64_t hardware_flags, const int num_bits,
-//                             const uint8_t* bits, uint8_t* bytes, int bit_offset) {
-//  bits += bit_offset / 8;
-//  bit_offset %= 8;
-//  if (bit_offset != 0) {
-//    uint64_t bits_head = bits[0] >> bit_offset;
-//    int bits_in_first_byte = std::min(num_bits, 8 - bit_offset);
-//    bits_to_bytes(hardware_flags, bits_in_first_byte,
-//                  reinterpret_cast<const uint8_t*>(&bits_head), bytes);
-//    if (num_bits > bits_in_first_byte) {
-//      bits_to_bytes(hardware_flags, num_bits - bits_in_first_byte, bits + 1,
-//                    bytes + bits_in_first_byte);
-//    }
-//    return;
-//  }
-//
-//  int num_processed = 0;
-//#if defined(ARROW_HAVE_AVX2)
-//  if (hardware_flags & arrow::internal::CpuInfo::AVX2) {
-//    // The function call below processes whole 32 bit chunks together.
-//    num_processed = num_bits - (num_bits % 32);
-//    bits_to_bytes_avx2(num_processed, bits, bytes);
-//  }
-//#endif
-//  // Processing 8 bits at a time
-//  constexpr int unroll = 8;
-//  for (int i = num_processed / unroll; i < num_bits / unroll; ++i) {
-//    uint8_t bits_next = bits[i];
-//    // Clear the lowest bit and then make 8 copies of remaining 7 bits, each 7 bits apart
-//    // from the previous.
-//    uint64_t unpacked = static_cast<uint64_t>(bits_next & 0xfe) *
-//                        ((1ULL << 7) | (1ULL << 14) | (1ULL << 21) | (1ULL << 28) |
-//                         (1ULL << 35) | (1ULL << 42) | (1ULL << 49));
-//    unpacked |= (bits_next & 1);
-//    unpacked &= 0x0101010101010101ULL;
-//    unpacked *= 255;
-//    util::SafeStore(&reinterpret_cast<uint64_t*>(bytes)[i], unpacked);
-//  }
-//  int tail = num_bits % unroll;
-//  if (tail) {
-//    uint8_t bits_next = bits[(num_bits - tail) / unroll];
-//    // Clear the lowest bit and then make 8 copies of remaining 7 bits, each 7 bits apart
-//    // from the previous.
-//    uint64_t unpacked = static_cast<uint64_t>(bits_next & 0xfe) *
-//                        ((1ULL << 7) | (1ULL << 14) | (1ULL << 21) | (1ULL << 28) |
-//                         (1ULL << 35) | (1ULL << 42) | (1ULL << 49));
-//    unpacked |= (bits_next & 1);
-//    unpacked &= 0x0101010101010101ULL;
-//    unpacked *= 255;
-//    SafeStoreUpTo8Bytes(bytes + num_bits - tail, tail, unpacked);
-//  }
-//}
-//
-//void bit_util::bytes_to_bits(int64_t hardware_flags, const int num_bits,
-//                             const uint8_t* bytes, uint8_t* bits, int bit_offset) {
-//  bits += bit_offset / 8;
-//  bit_offset %= 8;
-//  if (bit_offset != 0) {
-//    uint64_t bits_head;
-//    int bits_in_first_byte = std::min(num_bits, 8 - bit_offset);
-//    bytes_to_bits(hardware_flags, bits_in_first_byte, bytes,
-//                  reinterpret_cast<uint8_t*>(&bits_head));
-//    uint8_t mask = (1 << bit_offset) - 1;
-//    *bits = static_cast<uint8_t>((*bits & mask) | (bits_head << bit_offset));
-//
-//    if (num_bits > bits_in_first_byte) {
-//      bytes_to_bits(hardware_flags, num_bits - bits_in_first_byte,
-//                    bytes + bits_in_first_byte, bits + 1);
-//    }
-//    return;
-//  }
-//
-//  int num_processed = 0;
-//#if defined(ARROW_HAVE_AVX2)
-//  if (hardware_flags & arrow::internal::CpuInfo::AVX2) {
-//    // The function call below processes whole 32 bit chunks together.
-//    num_processed = num_bits - (num_bits % 32);
-//    bytes_to_bits_avx2(num_processed, bytes, bits);
-//  }
-//#endif
-//  // Process 8 bits at a time
-//  constexpr int unroll = 8;
-//  for (int i = num_processed / unroll; i < num_bits / unroll; ++i) {
-//    uint64_t bytes_next = util::SafeLoad(&reinterpret_cast<const uint64_t*>(bytes)[i]);
-//    bytes_next &= 0x0101010101010101ULL;
-//    bytes_next |= (bytes_next >> 7);  // Pairs of adjacent output bits in individual bytes
-//    bytes_next |= (bytes_next >> 14);  // 4 adjacent output bits in individual bytes
-//    bytes_next |= (bytes_next >> 28);  // All 8 output bits in the lowest byte
-//    bits[i] = static_cast<uint8_t>(bytes_next & 0xff);
-//  }
-//  int tail = num_bits % unroll;
-//  if (tail) {
-//    uint64_t bytes_next = SafeLoadUpTo8Bytes(bytes + num_bits - tail, tail);
-//    bytes_next &= 0x0101010101010101ULL;
-//    bytes_next |= (bytes_next >> 7);  // Pairs of adjacent output bits in individual bytes
-//    bytes_next |= (bytes_next >> 14);  // 4 adjacent output bits in individual bytes
-//    bytes_next |= (bytes_next >> 28);  // All 8 output bits in the lowest byte
-//    bits[num_bits / 8] = static_cast<uint8_t>(bytes_next & 0xff);
-//  }
-//}
-//
-//bool bit_util::are_all_bytes_zero(int64_t hardware_flags, const uint8_t* bytes,
-//                                  uint32_t num_bytes) {
-//#if defined(ARROW_HAVE_AVX2)
-//  if (hardware_flags & arrow::internal::CpuInfo::AVX2) {
-//    return are_all_bytes_zero_avx2(bytes, num_bytes);
-//  }
-//#endif
-//  uint64_t result_or = 0;
-//  uint32_t i;
-//  for (i = 0; i < num_bytes / 8; ++i) {
-//    uint64_t x = util::SafeLoad(&reinterpret_cast<const uint64_t*>(bytes)[i]);
-//    result_or |= x;
-//  }
-//  if (num_bytes % 8 > 0) {
-//    uint64_t tail = 0;
-//    result_or |= memcmp(bytes + i * 8, &tail, num_bytes % 8);
-//  }
-//  return result_or == 0;
-//}
-
-}  // namespace util
-
 namespace compute {
 
 Status ValidateExecNodeInputs(ExecPlan* plan, const std::vector<ExecNode*>& inputs,
diff --git a/cpp/src/arrow/compute/exec/util.h b/cpp/src/arrow/compute/exec/util.h
index e0ffca29070..8e5001d99c1 100644
--- a/cpp/src/arrow/compute/exec/util.h
+++ b/cpp/src/arrow/compute/exec/util.h
@@ -39,203 +39,7 @@
 #include "arrow/util/thread_pool.h"
 #include "arrow/util/type_fwd.h"
 
-// DIPO
-//#if defined(__clang__) || defined(__GNUC__)
-//#define BYTESWAP(x) __builtin_bswap64(x)
-//#define ROTL(x, n) (((x) << (n)) | ((x) >> ((-n) & 31)))
-//#define ROTL64(x, n) (((x) << (n)) | ((x) >> ((-n) & 63)))
-//#define PREFETCH(ptr) __builtin_prefetch((ptr), 0 /* rw==read */, 3 /* locality */)
-//#elif defined(_MSC_VER)
-//#include <intrin.h>
-//#define BYTESWAP(x) _byteswap_uint64(x)
-//#define ROTL(x, n) _rotl((x), (n))
-//#define ROTL64(x, n) _rotl64((x), (n))
-//#if defined(_M_X64) || defined(_M_I86)
-//#include <mmintrin.h>  // https://msdn.microsoft.com/fr-fr/library/84szxsww(v=vs.90).aspx
-//#define PREFETCH(ptr) _mm_prefetch((const char*)(ptr), _MM_HINT_T0)
-//#else
-//#define PREFETCH(ptr) (void)(ptr) /* disabled */
-//#endif
-//#endif
-
 namespace arrow {
-namespace util {
-
-// DIPO
-//template <typename T>
-//inline void CheckAlignment(const void* ptr) {
-//  ARROW_DCHECK(reinterpret_cast<uint64_t>(ptr) % sizeof(T) == 0);
-//}
-
-//// Some platforms typedef int64_t as long int instead of long long int,
-//// which breaks the _mm256_i64gather_epi64 and _mm256_i32gather_epi64 intrinsics
-//// which need long long.
-//// We use the cast to the type below in these intrinsics to make the code
-//// compile in all cases.
-////
-//using int64_for_gather_t = const long long int;  // NOLINT runtime-int
-//
-//// All MiniBatch... classes use TempVectorStack for vector allocations and can
-//// only work with vectors up to 1024 elements.
-////
-//// They should only be allocated on the stack to guarantee the right sequence
-//// of allocation and deallocation of vectors from TempVectorStack.
-////
-//class MiniBatch {
-// public:
-//  static constexpr int kLogMiniBatchLength = 10;
-//  static constexpr int kMiniBatchLength = 1 << kLogMiniBatchLength;
-//};
-
-// DIPO
-///// Storage used to allocate temporary vectors of a batch size.
-///// Temporary vectors should resemble allocating temporary variables on the stack
-///// but in the context of vectorized processing where we need to store a vector of
-///// temporaries instead of a single value.
-//class TempVectorStack {
-//  template <typename>
-//  friend class TempVectorHolder;
-//
-// public:
-//  Status Init(MemoryPool* pool, int64_t size) {
-//    num_vectors_ = 0;
-//    top_ = 0;
-//    buffer_size_ = PaddedAllocationSize(size) + kPadding + 2 * sizeof(uint64_t);
-//    ARROW_ASSIGN_OR_RAISE(auto buffer, AllocateResizableBuffer(size, pool));
-//    // Ensure later operations don't accidentally read uninitialized memory.
-//    std::memset(buffer->mutable_data(), 0xFF, size);
-//    buffer_ = std::move(buffer);
-//    return Status::OK();
-//  }
-//
-// private:
-//  int64_t PaddedAllocationSize(int64_t num_bytes) {
-//    // Round up allocation size to multiple of 8 bytes
-//    // to avoid returning temp vectors with unaligned address.
-//    //
-//    // Also add padding at the end to facilitate loads and stores
-//    // using SIMD when number of vector elements is not divisible
-//    // by the number of SIMD lanes.
-//    //
-//    return ::arrow::bit_util::RoundUp(num_bytes, sizeof(int64_t)) + kPadding;
-//  }
-//  void alloc(uint32_t num_bytes, uint8_t** data, int* id) {
-//    int64_t old_top = top_;
-//    top_ += PaddedAllocationSize(num_bytes) + 2 * sizeof(uint64_t);
-//    // Stack overflow check
-//    ARROW_DCHECK(top_ <= buffer_size_);
-//    *data = buffer_->mutable_data() + old_top + sizeof(uint64_t);
-//    // We set 8 bytes before the beginning of the allocated range and
-//    // 8 bytes after the end to check for stack overflow (which would
-//    // result in those known bytes being corrupted).
-//    reinterpret_cast<uint64_t*>(buffer_->mutable_data() + old_top)[0] = kGuard1;
-//    reinterpret_cast<uint64_t*>(buffer_->mutable_data() + top_)[-1] = kGuard2;
-//    *id = num_vectors_++;
-//  }
-//  void release(int id, uint32_t num_bytes) {
-//    ARROW_DCHECK(num_vectors_ == id + 1);
-//    int64_t size = PaddedAllocationSize(num_bytes) + 2 * sizeof(uint64_t);
-//    ARROW_DCHECK(reinterpret_cast<const uint64_t*>(buffer_->mutable_data() + top_)[-1] ==
-//                 kGuard2);
-//    ARROW_DCHECK(top_ >= size);
-//    top_ -= size;
-//    ARROW_DCHECK(reinterpret_cast<const uint64_t*>(buffer_->mutable_data() + top_)[0] ==
-//                 kGuard1);
-//    --num_vectors_;
-//  }
-//  static constexpr uint64_t kGuard1 = 0x3141592653589793ULL;
-//  static constexpr uint64_t kGuard2 = 0x0577215664901532ULL;
-//  static constexpr int64_t kPadding = 64;
-//  int num_vectors_;
-//  int64_t top_;
-//  std::unique_ptr<Buffer> buffer_;
-//  int64_t buffer_size_;
-//};
-//
-//template <typename T>
-//class TempVectorHolder {
-//  friend class TempVectorStack;
-//
-// public:
-//  ~TempVectorHolder() { stack_->release(id_, num_elements_ * sizeof(T)); }
-//  T* mutable_data() { return reinterpret_cast<T*>(data_); }
-//  TempVectorHolder(TempVectorStack* stack, uint32_t num_elements) {
-//    stack_ = stack;
-//    num_elements_ = num_elements;
-//    stack_->alloc(num_elements * sizeof(T), &data_, &id_);
-//  }
-//
-// private:
-//  TempVectorStack* stack_;
-//  uint8_t* data_;
-//  int id_;
-//  uint32_t num_elements_;
-//};
-//
-//class bit_util {
-// public:
-//  static void bits_to_indexes(int bit_to_search, int64_t hardware_flags,
-//                              const int num_bits, const uint8_t* bits, int* num_indexes,
-//                              uint16_t* indexes, int bit_offset = 0);
-//
-//  static void bits_filter_indexes(int bit_to_search, int64_t hardware_flags,
-//                                  const int num_bits, const uint8_t* bits,
-//                                  const uint16_t* input_indexes, int* num_indexes,
-//                                  uint16_t* indexes, int bit_offset = 0);
-//
-//  // Input and output indexes may be pointing to the same data (in-place filtering).
-//  static void bits_split_indexes(int64_t hardware_flags, const int num_bits,
-//                                 const uint8_t* bits, int* num_indexes_bit0,
-//                                 uint16_t* indexes_bit0, uint16_t* indexes_bit1,
-//                                 int bit_offset = 0);
-//
-//  // Bit 1 is replaced with byte 0xFF.
-//  static void bits_to_bytes(int64_t hardware_flags, const int num_bits,
-//                            const uint8_t* bits, uint8_t* bytes, int bit_offset = 0);
-//
-//  // Return highest bit of each byte.
-//  static void bytes_to_bits(int64_t hardware_flags, const int num_bits,
-//                            const uint8_t* bytes, uint8_t* bits, int bit_offset = 0);
-//
-//  static bool are_all_bytes_zero(int64_t hardware_flags, const uint8_t* bytes,
-//                                 uint32_t num_bytes);
-//
-// private:
-//  inline static uint64_t SafeLoadUpTo8Bytes(const uint8_t* bytes, int num_bytes);
-//  inline static void SafeStoreUpTo8Bytes(uint8_t* bytes, int num_bytes, uint64_t value);
-//  inline static void bits_to_indexes_helper(uint64_t word, uint16_t base_index,
-//                                            int* num_indexes, uint16_t* indexes);
-//  inline static void bits_filter_indexes_helper(uint64_t word,
-//                                                const uint16_t* input_indexes,
-//                                                int* num_indexes, uint16_t* indexes);
-//  template <int bit_to_search, bool filter_input_indexes>
-//  static void bits_to_indexes_internal(int64_t hardware_flags, const int num_bits,
-//                                       const uint8_t* bits, const uint16_t* input_indexes,
-//                                       int* num_indexes, uint16_t* indexes,
-//                                       uint16_t base_index = 0);
-//
-//#if defined(ARROW_HAVE_AVX2)
-//  static void bits_to_indexes_avx2(int bit_to_search, const int num_bits,
-//                                   const uint8_t* bits, int* num_indexes,
-//                                   uint16_t* indexes, uint16_t base_index = 0);
-//  static void bits_filter_indexes_avx2(int bit_to_search, const int num_bits,
-//                                       const uint8_t* bits, const uint16_t* input_indexes,
-//                                       int* num_indexes, uint16_t* indexes);
-//  template <int bit_to_search>
-//  static void bits_to_indexes_imp_avx2(const int num_bits, const uint8_t* bits,
-//                                       int* num_indexes, uint16_t* indexes,
-//                                       uint16_t base_index = 0);
-//  template <int bit_to_search>
-//  static void bits_filter_indexes_imp_avx2(const int num_bits, const uint8_t* bits,
-//                                           const uint16_t* input_indexes,
-//                                           int* num_indexes, uint16_t* indexes);
-//  static void bits_to_bytes_avx2(const int num_bits, const uint8_t* bits, uint8_t* bytes);
-//  static void bytes_to_bits_avx2(const int num_bits, const uint8_t* bytes, uint8_t* bits);
-//  static bool are_all_bytes_zero_avx2(const uint8_t* bytes, uint32_t num_bytes);
-//#endif
-//};
-
-}  // namespace util
 namespace compute {
 
 ARROW_EXPORT
diff --git a/cpp/src/arrow/compute/key_map.cc b/cpp/src/arrow/compute/key_map.cc
index 46741602cd6..dac1dfa2483 100644
--- a/cpp/src/arrow/compute/key_map.cc
+++ b/cpp/src/arrow/compute/key_map.cc
@@ -17,15 +17,12 @@
 
 #include "key_map.h"
 
-#include <memory.h>
-
 #include <algorithm>
 #include <cstdint>
 
 #include "arrow/util/bit_util.h"
 #include "arrow/util/bitmap_ops.h"
 #include "arrow/util/ubsan.h"
-#include "arrow/util/logging.h"
 
 
 namespace arrow {
@@ -832,51 +829,5 @@ void SwissTable::cleanup() {
   num_inserted_ = 0;
 }
 
-
-uint64_t SwissTable::extract_group_id(const uint8_t* block_ptr, int slot,
-                                      uint64_t group_id_mask) const {
-  // Group id values for all 8 slots in the block are bit-packed and follow the status
-  // bytes. We assume here that the number of bits is rounded up to 8, 16, 32 or 64. In
-  // that case we can extract group id using aligned 64-bit word access.
-  int num_group_id_bits = static_cast<int>(ARROW_POPCOUNT64(group_id_mask));
-  ARROW_DCHECK(num_group_id_bits == 8 || num_group_id_bits == 16 ||
-      num_group_id_bits == 32 || num_group_id_bits == 64);
-
-  int bit_offset = slot * num_group_id_bits;
-  const uint64_t* group_id_bytes =
-      reinterpret_cast<const uint64_t*>(block_ptr) + 1 + (bit_offset >> 6);
-  uint64_t group_id = (*group_id_bytes >> (bit_offset & 63)) & group_id_mask;
-
-  return group_id;
-}
-
-void SwissTable::insert_into_empty_slot(uint32_t slot_id, uint32_t hash,
-                                        uint32_t group_id) {
-  const uint64_t num_groupid_bits = num_groupid_bits_from_log_blocks(log_blocks_);
-
-  // We assume here that the number of bits is rounded up to 8, 16, 32 or 64.
-  // In that case we can insert group id value using aligned 64-bit word access.
-  ARROW_DCHECK(num_groupid_bits == 8 || num_groupid_bits == 16 ||
-      num_groupid_bits == 32 || num_groupid_bits == 64);
-
-  const uint64_t num_block_bytes = (8 + num_groupid_bits);
-  constexpr uint64_t stamp_mask = 0x7f;
-
-  int start_slot = (slot_id & 7);
-  int stamp =
-      static_cast<int>((hash >> (bits_hash_ - log_blocks_ - bits_stamp_)) & stamp_mask);
-  uint64_t block_id = slot_id >> 3;
-  uint8_t* blockbase = blocks_ + num_block_bytes * block_id;
-
-  blockbase[7 - start_slot] = static_cast<uint8_t>(stamp);
-  int groupid_bit_offset = static_cast<int>(start_slot * num_groupid_bits);
-
-  // Block status bytes should start at an address aligned to 8 bytes
-  ARROW_DCHECK((reinterpret_cast<uint64_t>(blockbase) & 7) == 0);
-  uint64_t* ptr = reinterpret_cast<uint64_t*>(blockbase) + 1 + (groupid_bit_offset >> 6);
-  *ptr |= (static_cast<uint64_t>(group_id) << (groupid_bit_offset & 63));
-}
-
-
 }  // namespace compute
 }  // namespace arrow
diff --git a/cpp/src/arrow/compute/key_map.h b/cpp/src/arrow/compute/key_map.h
index c9a3a9dc100..790c90e6411 100644
--- a/cpp/src/arrow/compute/key_map.h
+++ b/cpp/src/arrow/compute/key_map.h
@@ -20,6 +20,7 @@
 #include <functional>
 
 #include "arrow/compute/util.h"
+#include "arrow/compute/util_internal.h"
 #include "arrow/memory_pool.h"
 #include "arrow/result.h"
 #include "arrow/status.h"
@@ -107,7 +108,7 @@ class SwissTable {
 
   /// \brief Extract group id for a given slot in a given block.
   ///
-  uint64_t extract_group_id(const uint8_t* block_ptr, int slot,
+  inline uint64_t extract_group_id(const uint8_t* block_ptr, int slot,
                                    uint64_t group_id_mask) const;
   void extract_group_ids(const int num_keys, const uint16_t* optional_selection,
                          const uint32_t* hashes, const uint8_t* local_slots,
@@ -159,7 +160,7 @@ class SwissTable {
   inline bool find_next_stamp_match(const uint32_t hash, const uint32_t in_slot_id,
                                     uint32_t* out_slot_id, uint32_t* out_group_id) const;
 
-  void insert_into_empty_slot(uint32_t slot_id, uint32_t hash, uint32_t group_id);
+  inline void insert_into_empty_slot(uint32_t slot_id, uint32_t hash, uint32_t group_id);
 
   // Slow processing of input keys in the most generic case.
   // Handles inserting new keys.
@@ -227,5 +228,49 @@ class SwissTable {
   MemoryPool* pool_;
 };
 
+uint64_t SwissTable::extract_group_id(const uint8_t* block_ptr, int slot,
+                                      uint64_t group_id_mask) const {
+  // Group id values for all 8 slots in the block are bit-packed and follow the status
+  // bytes. We assume here that the number of bits is rounded up to 8, 16, 32 or 64. In
+  // that case we can extract group id using aligned 64-bit word access.
+  int num_group_id_bits = static_cast<int>(ARROW_POPCOUNT64(group_id_mask));
+      ARROW_DCHECK(num_group_id_bits == 8 || num_group_id_bits == 16 ||
+      num_group_id_bits == 32 || num_group_id_bits == 64);
+
+  int bit_offset = slot * num_group_id_bits;
+  const uint64_t* group_id_bytes =
+      reinterpret_cast<const uint64_t*>(block_ptr) + 1 + (bit_offset >> 6);
+  uint64_t group_id = (*group_id_bytes >> (bit_offset & 63)) & group_id_mask;
+
+  return group_id;
+}
+
+void SwissTable::insert_into_empty_slot(uint32_t slot_id, uint32_t hash,
+                                        uint32_t group_id) {
+  const uint64_t num_groupid_bits = num_groupid_bits_from_log_blocks(log_blocks_);
+
+  // We assume here that the number of bits is rounded up to 8, 16, 32 or 64.
+  // In that case we can insert group id value using aligned 64-bit word access.
+      ARROW_DCHECK(num_groupid_bits == 8 || num_groupid_bits == 16 ||
+      num_groupid_bits == 32 || num_groupid_bits == 64);
+
+  const uint64_t num_block_bytes = (8 + num_groupid_bits);
+  constexpr uint64_t stamp_mask = 0x7f;
+
+  int start_slot = (slot_id & 7);
+  int stamp =
+      static_cast<int>((hash >> (bits_hash_ - log_blocks_ - bits_stamp_)) & stamp_mask);
+  uint64_t block_id = slot_id >> 3;
+  uint8_t* blockbase = blocks_ + num_block_bytes * block_id;
+
+  blockbase[7 - start_slot] = static_cast<uint8_t>(stamp);
+  int groupid_bit_offset = static_cast<int>(start_slot * num_groupid_bits);
+
+  // Block status bytes should start at an address aligned to 8 bytes
+      ARROW_DCHECK((reinterpret_cast<uint64_t>(blockbase) & 7) == 0);
+  uint64_t* ptr = reinterpret_cast<uint64_t*>(blockbase) + 1 + (groupid_bit_offset >> 6);
+  *ptr |= (static_cast<uint64_t>(group_id) << (groupid_bit_offset & 63));
+}
+
 }  // namespace compute
 }  // namespace arrow
diff --git a/cpp/src/arrow/compute/light_array.h b/cpp/src/arrow/compute/light_array.h
index 33b48161733..d617b0aa064 100644
--- a/cpp/src/arrow/compute/light_array.h
+++ b/cpp/src/arrow/compute/light_array.h
@@ -21,7 +21,6 @@
 
 #include "arrow/array.h"
 #include "arrow/compute/exec.h"
-// DIPO #include "arrow/compute/exec/util.h"
 #include "arrow/compute/util.h"
 #include "arrow/type.h"
 #include "arrow/util/cpu_info.h"
diff --git a/cpp/src/arrow/compute/row/compare_internal.cc b/cpp/src/arrow/compute/row/compare_internal.cc
index 2216add6849..39ac33932b5 100644
--- a/cpp/src/arrow/compute/row/compare_internal.cc
+++ b/cpp/src/arrow/compute/row/compare_internal.cc
@@ -22,7 +22,6 @@
 #include <algorithm>
 #include <cstdint>
 
-// DIPO
 #include "arrow/compute/util.h"
 #include "arrow/compute/util_internal.h"
 #include "arrow/util/bit_util.h"
diff --git a/cpp/src/arrow/compute/row/compare_internal.h b/cpp/src/arrow/compute/row/compare_internal.h
index 162eae19790..85a4a4f68af 100644
--- a/cpp/src/arrow/compute/row/compare_internal.h
+++ b/cpp/src/arrow/compute/row/compare_internal.h
@@ -19,7 +19,6 @@
 
 #include <cstdint>
 
-// DIPO
 #include "arrow/compute/util.h"
 #include "arrow/compute/light_array.h"
 #include "arrow/compute/row/encode_internal.h"
diff --git a/cpp/src/arrow/compute/row/encode_internal.cc b/cpp/src/arrow/compute/row/encode_internal.cc
index 8c0c0aac6a7..3a6a85b0272 100644
--- a/cpp/src/arrow/compute/row/encode_internal.cc
+++ b/cpp/src/arrow/compute/row/encode_internal.cc
@@ -16,8 +16,6 @@
 // under the License.
 
 #include "arrow/compute/row/encode_internal.h"
-//#include "arrow/compute/exec.h"
-//DIPO
 #include "arrow/util/checked_cast.h"
 
 namespace arrow {
diff --git a/cpp/src/arrow/compute/row/encode_internal.h b/cpp/src/arrow/compute/row/encode_internal.h
index 58d4abd3e83..2caa02d2f9c 100644
--- a/cpp/src/arrow/compute/row/encode_internal.h
+++ b/cpp/src/arrow/compute/row/encode_internal.h
@@ -22,8 +22,6 @@
 #include <vector>
 
 #include "arrow/array/data.h"
-//#include "arrow/compute/exec.h"
-// DIPO
 #include "arrow/compute/key_map.h"
 #include "arrow/compute/util.h"
 #include "arrow/compute/light_array.h"
diff --git a/cpp/src/arrow/compute/row/grouper.cc b/cpp/src/arrow/compute/row/grouper.cc
index c879e7c7403..1747bf73267 100644
--- a/cpp/src/arrow/compute/row/grouper.cc
+++ b/cpp/src/arrow/compute/row/grouper.cc
@@ -21,10 +21,6 @@
 #include <mutex>
 
 #include "arrow/compute/key_hash.h"
-// DIPO
-//#include "arrow/compute/exec/key_map.h"
-//#include "arrow/compute/exec/options.h"
-//#include "arrow/compute/exec_internal.h"
 #include "arrow/compute/function.h"
 #include "arrow/compute/kernels/row_encoder_internal.h"
 #include "arrow/compute/light_array.h"
diff --git a/cpp/src/arrow/compute/row/grouper.h b/cpp/src/arrow/compute/row/grouper.h
index 0ab5bba5f3e..8f43d2601d5 100644
--- a/cpp/src/arrow/compute/row/grouper.h
+++ b/cpp/src/arrow/compute/row/grouper.h
@@ -20,8 +20,6 @@
 #include <memory>
 #include <vector>
 
-//#include "arrow/compute/exec.h"
-//#include "arrow/compute/exec/options.h"
 #include "arrow/compute/key_map.h"
 #include "arrow/compute/kernel.h"
 #include "arrow/datum.h"
diff --git a/cpp/src/arrow/compute/row/row_internal.cc b/cpp/src/arrow/compute/row/row_internal.cc
index 52e0c9cb6c7..f6a62c09fcf 100644
--- a/cpp/src/arrow/compute/row/row_internal.cc
+++ b/cpp/src/arrow/compute/row/row_internal.cc
@@ -17,7 +17,6 @@
 
 #include "arrow/compute/row/row_internal.h"
 
-// DIPO
 #include "arrow/compute/util.h"
 
 namespace arrow {

From 817013dcb455c8e5c1c6d9012a0037c09d53266c Mon Sep 17 00:00:00 2001
From: Davide Pasetto <dpasetto69@gmail.com>
Date: Wed, 8 Mar 2023 09:40:43 -0500
Subject: [PATCH 04/11] fix api test

---
 cpp/src/arrow/compute/key_map.cc    | 2 +-
 cpp/src/arrow/compute/row/grouper.h | 1 -
 2 files changed, 1 insertion(+), 2 deletions(-)

diff --git a/cpp/src/arrow/compute/key_map.cc b/cpp/src/arrow/compute/key_map.cc
index dac1dfa2483..4161f1e75c3 100644
--- a/cpp/src/arrow/compute/key_map.cc
+++ b/cpp/src/arrow/compute/key_map.cc
@@ -15,7 +15,7 @@
 // specific language governing permissions and limitations
 // under the License.
 
-#include "key_map.h"
+#include "arrow/compute/key_map.h"
 
 #include <algorithm>
 #include <cstdint>
diff --git a/cpp/src/arrow/compute/row/grouper.h b/cpp/src/arrow/compute/row/grouper.h
index 8f43d2601d5..94c591687da 100644
--- a/cpp/src/arrow/compute/row/grouper.h
+++ b/cpp/src/arrow/compute/row/grouper.h
@@ -20,7 +20,6 @@
 #include <memory>
 #include <vector>
 
-#include "arrow/compute/key_map.h"
 #include "arrow/compute/kernel.h"
 #include "arrow/datum.h"
 #include "arrow/result.h"

From 105cd3c9ad732092cf84ecafadff721607c76e9f Mon Sep 17 00:00:00 2001
From: Davide Pasetto <dpasetto69@gmail.com>
Date: Wed, 8 Mar 2023 10:44:18 -0500
Subject: [PATCH 05/11] more include update

---
 cpp/src/arrow/compute/key_hash.h     | 2 +-
 cpp/src/arrow/compute/row/grouper.cc | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/cpp/src/arrow/compute/key_hash.h b/cpp/src/arrow/compute/key_hash.h
index 68197973e02..f0056af5cb1 100644
--- a/cpp/src/arrow/compute/key_hash.h
+++ b/cpp/src/arrow/compute/key_hash.h
@@ -23,7 +23,7 @@
 
 #include <cstdint>
 
-#include "arrow/compute/exec/util.h"
+#include "arrow/compute/util.h"
 #include "arrow/compute/light_array.h"
 
 namespace arrow {
diff --git a/cpp/src/arrow/compute/row/grouper.cc b/cpp/src/arrow/compute/row/grouper.cc
index 1747bf73267..fce39261a73 100644
--- a/cpp/src/arrow/compute/row/grouper.cc
+++ b/cpp/src/arrow/compute/row/grouper.cc
@@ -25,6 +25,7 @@
 #include "arrow/compute/kernels/row_encoder_internal.h"
 #include "arrow/compute/light_array.h"
 #include "arrow/compute/registry.h"
+#include "arrow/compute/api_vector.h"
 #include "arrow/compute/row/compare_internal.h"
 #include "arrow/type.h"
 #include "arrow/util/bitmap_ops.h"

From ea5eeb9a987bbd48364f6943bfdb24d3ebe7abed Mon Sep 17 00:00:00 2001
From: Davide Pasetto <dpasetto69@gmail.com>
Date: Wed, 8 Mar 2023 12:30:40 -0500
Subject: [PATCH 06/11] more include update

---
 cpp/src/arrow/compute/kernels/codegen_internal.h     | 1 -
 cpp/src/arrow/compute/kernels/common_internal.h      | 3 ++-
 cpp/src/arrow/compute/kernels/hash_aggregate.cc      | 4 ----
 cpp/src/arrow/compute/kernels/row_encoder_internal.h | 1 -
 4 files changed, 2 insertions(+), 7 deletions(-)

diff --git a/cpp/src/arrow/compute/kernels/codegen_internal.h b/cpp/src/arrow/compute/kernels/codegen_internal.h
index b05a47b600c..3dd1f2b8112 100644
--- a/cpp/src/arrow/compute/kernels/codegen_internal.h
+++ b/cpp/src/arrow/compute/kernels/codegen_internal.h
@@ -30,7 +30,6 @@
 #include "arrow/array/data.h"
 #include "arrow/buffer.h"
 #include "arrow/buffer_builder.h"
-#include "arrow/compute/exec.h"
 #include "arrow/compute/kernel.h"
 #include "arrow/datum.h"
 #include "arrow/result.h"
diff --git a/cpp/src/arrow/compute/kernels/common_internal.h b/cpp/src/arrow/compute/kernels/common_internal.h
index bf90d114512..d5fe3e0376d 100644
--- a/cpp/src/arrow/compute/kernels/common_internal.h
+++ b/cpp/src/arrow/compute/kernels/common_internal.h
@@ -30,7 +30,8 @@
 #include "arrow/array/data.h"
 #include "arrow/buffer.h"
 #include "arrow/chunked_array.h"
-#include "arrow/compute/exec.h"
+
+
 #include "arrow/compute/function.h"
 #include "arrow/compute/kernel.h"
 #include "arrow/compute/kernels/codegen_internal.h"
diff --git a/cpp/src/arrow/compute/kernels/hash_aggregate.cc b/cpp/src/arrow/compute/kernels/hash_aggregate.cc
index 81fb2e871b5..380dde016ef 100644
--- a/cpp/src/arrow/compute/kernels/hash_aggregate.cc
+++ b/cpp/src/arrow/compute/kernels/hash_aggregate.cc
@@ -29,10 +29,6 @@
 #include "arrow/buffer_builder.h"
 #include "arrow/compute/api_aggregate.h"
 #include "arrow/compute/api_vector.h"
-#include "arrow/compute/key_hash.h"
-#include "arrow/compute/key_map.h"
-#include "arrow/compute/exec/util.h"
-#include "arrow/compute/exec_internal.h"
 #include "arrow/compute/kernel.h"
 #include "arrow/compute/kernels/aggregate_internal.h"
 #include "arrow/compute/kernels/aggregate_var_std_internal.h"
diff --git a/cpp/src/arrow/compute/kernels/row_encoder_internal.h b/cpp/src/arrow/compute/kernels/row_encoder_internal.h
index 5fe80e0f506..9bf7c1d1c4f 100644
--- a/cpp/src/arrow/compute/kernels/row_encoder_internal.h
+++ b/cpp/src/arrow/compute/kernels/row_encoder_internal.h
@@ -19,7 +19,6 @@
 
 #include <cstdint>
 
-#include "arrow/compute/exec.h"
 #include "arrow/compute/kernels/codegen_internal.h"
 #include "arrow/visit_data_inline.h"
 

From 2a25b29d39805079373de23474505a1eb27785ad Mon Sep 17 00:00:00 2001
From: Davide Pasetto <dpasetto69@gmail.com>
Date: Wed, 8 Mar 2023 13:15:35 -0500
Subject: [PATCH 07/11] format files

---
 cpp/src/arrow/compute/exec/asof_join_node.cc  |  2 +-
 .../arrow/compute/exec/bloom_filter_test.cc   |  2 +-
 cpp/src/arrow/compute/exec/hash_join_node.cc  |  2 +-
 cpp/src/arrow/compute/exec/swiss_join.cc      |  2 +-
 .../arrow/compute/exec/swiss_join_internal.h  |  2 +-
 .../arrow/compute/kernels/common_internal.h   |  1 -
 cpp/src/arrow/compute/key_hash.h              |  2 +-
 cpp/src/arrow/compute/key_hash_test.cc        |  2 +-
 cpp/src/arrow/compute/key_map.cc              |  1 -
 cpp/src/arrow/compute/key_map.h               | 10 ++++-----
 cpp/src/arrow/compute/light_array_test.cc     | 10 ++++-----
 cpp/src/arrow/compute/row/compare_internal.h  |  2 +-
 cpp/src/arrow/compute/row/encode_internal.h   |  2 +-
 cpp/src/arrow/compute/row/grouper.cc          |  4 ++--
 cpp/src/arrow/compute/util.cc                 | 22 +++++++++----------
 cpp/src/arrow/compute/util.h                  | 13 +++++------
 cpp/src/arrow/compute/util_internal.h         |  9 ++++----
 17 files changed, 43 insertions(+), 45 deletions(-)

diff --git a/cpp/src/arrow/compute/exec/asof_join_node.cc b/cpp/src/arrow/compute/exec/asof_join_node.cc
index 4aa0fb72f05..47acc41e889 100644
--- a/cpp/src/arrow/compute/exec/asof_join_node.cc
+++ b/cpp/src/arrow/compute/exec/asof_join_node.cc
@@ -30,11 +30,11 @@
 #include "arrow/array/builder_binary.h"
 #include "arrow/array/builder_primitive.h"
 #include "arrow/compute/exec/exec_plan.h"
-#include "arrow/compute/key_hash.h"
 #include "arrow/compute/exec/options.h"
 #include "arrow/compute/exec/query_context.h"
 #include "arrow/compute/exec/schema_util.h"
 #include "arrow/compute/exec/util.h"
+#include "arrow/compute/key_hash.h"
 #include "arrow/compute/light_array.h"
 #include "arrow/record_batch.h"
 #include "arrow/result.h"
diff --git a/cpp/src/arrow/compute/exec/bloom_filter_test.cc b/cpp/src/arrow/compute/exec/bloom_filter_test.cc
index 5dc35ed42ab..3a79c10be2a 100644
--- a/cpp/src/arrow/compute/exec/bloom_filter_test.cc
+++ b/cpp/src/arrow/compute/exec/bloom_filter_test.cc
@@ -23,10 +23,10 @@
 #include <thread>
 #include <unordered_set>
 #include "arrow/compute/exec/bloom_filter.h"
-#include "arrow/compute/key_hash.h"
 #include "arrow/compute/exec/task_util.h"
 #include "arrow/compute/exec/test_util.h"
 #include "arrow/compute/exec/util.h"
+#include "arrow/compute/key_hash.h"
 #include "arrow/util/bitmap_ops.h"
 #include "arrow/util/cpu_info.h"
 
diff --git a/cpp/src/arrow/compute/exec/hash_join_node.cc b/cpp/src/arrow/compute/exec/hash_join_node.cc
index c270b868ecc..6da58330e22 100644
--- a/cpp/src/arrow/compute/exec/hash_join_node.cc
+++ b/cpp/src/arrow/compute/exec/hash_join_node.cc
@@ -24,10 +24,10 @@
 #include "arrow/compute/exec/hash_join.h"
 #include "arrow/compute/exec/hash_join_dict.h"
 #include "arrow/compute/exec/hash_join_node.h"
-#include "arrow/compute/key_hash.h"
 #include "arrow/compute/exec/options.h"
 #include "arrow/compute/exec/schema_util.h"
 #include "arrow/compute/exec/util.h"
+#include "arrow/compute/key_hash.h"
 #include "arrow/util/checked_cast.h"
 #include "arrow/util/future.h"
 #include "arrow/util/thread_pool.h"
diff --git a/cpp/src/arrow/compute/exec/swiss_join.cc b/cpp/src/arrow/compute/exec/swiss_join.cc
index 69479325bbb..8bf2ee1df47 100644
--- a/cpp/src/arrow/compute/exec/swiss_join.cc
+++ b/cpp/src/arrow/compute/exec/swiss_join.cc
@@ -22,10 +22,10 @@
 #include <mutex>
 #include "arrow/array/util.h"  // MakeArrayFromScalar
 #include "arrow/compute/exec/hash_join.h"
-#include "arrow/compute/key_hash.h"
 #include "arrow/compute/exec/swiss_join_internal.h"
 #include "arrow/compute/exec/util.h"
 #include "arrow/compute/kernels/row_encoder_internal.h"
+#include "arrow/compute/key_hash.h"
 #include "arrow/compute/row/compare_internal.h"
 #include "arrow/compute/row/encode_internal.h"
 #include "arrow/util/bit_util.h"
diff --git a/cpp/src/arrow/compute/exec/swiss_join_internal.h b/cpp/src/arrow/compute/exec/swiss_join_internal.h
index 4c765874bea..766f40e131c 100644
--- a/cpp/src/arrow/compute/exec/swiss_join_internal.h
+++ b/cpp/src/arrow/compute/exec/swiss_join_internal.h
@@ -18,12 +18,12 @@
 #pragma once
 
 #include <cstdint>
-#include "arrow/compute/key_map.h"
 #include "arrow/compute/exec/options.h"
 #include "arrow/compute/exec/partition_util.h"
 #include "arrow/compute/exec/schema_util.h"
 #include "arrow/compute/exec/task_util.h"
 #include "arrow/compute/kernels/row_encoder_internal.h"
+#include "arrow/compute/key_map.h"
 #include "arrow/compute/light_array.h"
 #include "arrow/compute/row/encode_internal.h"
 
diff --git a/cpp/src/arrow/compute/kernels/common_internal.h b/cpp/src/arrow/compute/kernels/common_internal.h
index d5fe3e0376d..744aee23795 100644
--- a/cpp/src/arrow/compute/kernels/common_internal.h
+++ b/cpp/src/arrow/compute/kernels/common_internal.h
@@ -31,7 +31,6 @@
 #include "arrow/buffer.h"
 #include "arrow/chunked_array.h"
 
-
 #include "arrow/compute/function.h"
 #include "arrow/compute/kernel.h"
 #include "arrow/compute/kernels/codegen_internal.h"
diff --git a/cpp/src/arrow/compute/key_hash.h b/cpp/src/arrow/compute/key_hash.h
index f0056af5cb1..ddf86dfcdc0 100644
--- a/cpp/src/arrow/compute/key_hash.h
+++ b/cpp/src/arrow/compute/key_hash.h
@@ -23,8 +23,8 @@
 
 #include <cstdint>
 
-#include "arrow/compute/util.h"
 #include "arrow/compute/light_array.h"
+#include "arrow/compute/util.h"
 
 namespace arrow {
 namespace compute {
diff --git a/cpp/src/arrow/compute/key_hash_test.cc b/cpp/src/arrow/compute/key_hash_test.cc
index 1ee9eb25312..d030e622641 100644
--- a/cpp/src/arrow/compute/key_hash_test.cc
+++ b/cpp/src/arrow/compute/key_hash_test.cc
@@ -21,9 +21,9 @@
 #include <map>
 #include <unordered_set>
 #include "arrow/array/builder_binary.h"
-#include "arrow/compute/key_hash.h"
 #include "arrow/compute/exec/test_util.h"
 #include "arrow/compute/exec/util.h"
+#include "arrow/compute/key_hash.h"
 #include "arrow/util/cpu_info.h"
 #include "arrow/util/pcg_random.h"
 
diff --git a/cpp/src/arrow/compute/key_map.cc b/cpp/src/arrow/compute/key_map.cc
index 4161f1e75c3..ebbf8a7b828 100644
--- a/cpp/src/arrow/compute/key_map.cc
+++ b/cpp/src/arrow/compute/key_map.cc
@@ -24,7 +24,6 @@
 #include "arrow/util/bitmap_ops.h"
 #include "arrow/util/ubsan.h"
 
-
 namespace arrow {
 
 using bit_util::CountLeadingZeros;
diff --git a/cpp/src/arrow/compute/key_map.h b/cpp/src/arrow/compute/key_map.h
index 790c90e6411..4702c5ecc8b 100644
--- a/cpp/src/arrow/compute/key_map.h
+++ b/cpp/src/arrow/compute/key_map.h
@@ -234,8 +234,8 @@ uint64_t SwissTable::extract_group_id(const uint8_t* block_ptr, int slot,
   // bytes. We assume here that the number of bits is rounded up to 8, 16, 32 or 64. In
   // that case we can extract group id using aligned 64-bit word access.
   int num_group_id_bits = static_cast<int>(ARROW_POPCOUNT64(group_id_mask));
-      ARROW_DCHECK(num_group_id_bits == 8 || num_group_id_bits == 16 ||
-      num_group_id_bits == 32 || num_group_id_bits == 64);
+  ARROW_DCHECK(num_group_id_bits == 8 || num_group_id_bits == 16 ||
+               num_group_id_bits == 32 || num_group_id_bits == 64);
 
   int bit_offset = slot * num_group_id_bits;
   const uint64_t* group_id_bytes =
@@ -251,8 +251,8 @@ void SwissTable::insert_into_empty_slot(uint32_t slot_id, uint32_t hash,
 
   // We assume here that the number of bits is rounded up to 8, 16, 32 or 64.
   // In that case we can insert group id value using aligned 64-bit word access.
-      ARROW_DCHECK(num_groupid_bits == 8 || num_groupid_bits == 16 ||
-      num_groupid_bits == 32 || num_groupid_bits == 64);
+  ARROW_DCHECK(num_groupid_bits == 8 || num_groupid_bits == 16 ||
+               num_groupid_bits == 32 || num_groupid_bits == 64);
 
   const uint64_t num_block_bytes = (8 + num_groupid_bits);
   constexpr uint64_t stamp_mask = 0x7f;
@@ -267,7 +267,7 @@ void SwissTable::insert_into_empty_slot(uint32_t slot_id, uint32_t hash,
   int groupid_bit_offset = static_cast<int>(start_slot * num_groupid_bits);
 
   // Block status bytes should start at an address aligned to 8 bytes
-      ARROW_DCHECK((reinterpret_cast<uint64_t>(blockbase) & 7) == 0);
+  ARROW_DCHECK((reinterpret_cast<uint64_t>(blockbase) & 7) == 0);
   uint64_t* ptr = reinterpret_cast<uint64_t*>(blockbase) + 1 + (groupid_bit_offset >> 6);
   *ptr |= (static_cast<uint64_t>(group_id) << (groupid_bit_offset & 63));
 }
diff --git a/cpp/src/arrow/compute/light_array_test.cc b/cpp/src/arrow/compute/light_array_test.cc
index 015d407e810..8b0e38e64c8 100644
--- a/cpp/src/arrow/compute/light_array_test.cc
+++ b/cpp/src/arrow/compute/light_array_test.cc
@@ -217,7 +217,7 @@ TEST(KeyColumnArray, SliceBool) {
 }
 
 // DIPO
-//TEST(KeyColumnArray, FromExecBatch) {
+// TEST(KeyColumnArray, FromExecBatch) {
 //  ExecBatch batch =
 //      ExecBatchFromJSON({int64(), boolean()}, "[[1, true], [2, false], [null, null]]");
 //  std::vector<KeyColumnArray> arrays;
@@ -316,7 +316,7 @@ TEST(ResizableArrayData, Binary) {
   }
 }
 // DIPO
-//TEST(ExecBatchBuilder, AppendBatches) {
+// TEST(ExecBatchBuilder, AppendBatches) {
 //  std::unique_ptr<MemoryPool> owned_pool = MemoryPool::CreateDefault();
 //  MemoryPool* pool = owned_pool.get();
 //  ExecBatch batch_one =
@@ -339,7 +339,7 @@ TEST(ResizableArrayData, Binary) {
 //}
 
 // DIPO
-//TEST(ExecBatchBuilder, AppendBatchesSomeRows) {
+// TEST(ExecBatchBuilder, AppendBatchesSomeRows) {
 //  std::unique_ptr<MemoryPool> owned_pool = MemoryPool::CreateDefault();
 //  MemoryPool* pool = owned_pool.get();
 //  ExecBatch batch_one =
@@ -360,7 +360,7 @@ TEST(ResizableArrayData, Binary) {
 //  ASSERT_EQ(0, pool->bytes_allocated());
 //}
 // DIPO
-//TEST(ExecBatchBuilder, AppendBatchesSomeCols) {
+// TEST(ExecBatchBuilder, AppendBatchesSomeCols) {
 //  std::unique_ptr<MemoryPool> owned_pool = MemoryPool::CreateDefault();
 //  MemoryPool* pool = owned_pool.get();
 //  ExecBatch batch_one =
@@ -408,7 +408,7 @@ TEST(ResizableArrayData, Binary) {
 //  ASSERT_EQ(0, pool->bytes_allocated());
 //}
 //
-//TEST(ExecBatchBuilder, AppendNulls) {
+// TEST(ExecBatchBuilder, AppendNulls) {
 //  std::unique_ptr<MemoryPool> owned_pool = MemoryPool::CreateDefault();
 //  MemoryPool* pool = owned_pool.get();
 //  ExecBatch batch_one =
diff --git a/cpp/src/arrow/compute/row/compare_internal.h b/cpp/src/arrow/compute/row/compare_internal.h
index 85a4a4f68af..778485e5c46 100644
--- a/cpp/src/arrow/compute/row/compare_internal.h
+++ b/cpp/src/arrow/compute/row/compare_internal.h
@@ -19,10 +19,10 @@
 
 #include <cstdint>
 
-#include "arrow/compute/util.h"
 #include "arrow/compute/light_array.h"
 #include "arrow/compute/row/encode_internal.h"
 #include "arrow/compute/row/row_internal.h"
+#include "arrow/compute/util.h"
 #include "arrow/memory_pool.h"
 #include "arrow/result.h"
 #include "arrow/status.h"
diff --git a/cpp/src/arrow/compute/row/encode_internal.h b/cpp/src/arrow/compute/row/encode_internal.h
index 2caa02d2f9c..bdf38df4fc3 100644
--- a/cpp/src/arrow/compute/row/encode_internal.h
+++ b/cpp/src/arrow/compute/row/encode_internal.h
@@ -23,9 +23,9 @@
 
 #include "arrow/array/data.h"
 #include "arrow/compute/key_map.h"
-#include "arrow/compute/util.h"
 #include "arrow/compute/light_array.h"
 #include "arrow/compute/row/row_internal.h"
+#include "arrow/compute/util.h"
 #include "arrow/memory_pool.h"
 #include "arrow/result.h"
 #include "arrow/status.h"
diff --git a/cpp/src/arrow/compute/row/grouper.cc b/cpp/src/arrow/compute/row/grouper.cc
index fce39261a73..ca26600c98b 100644
--- a/cpp/src/arrow/compute/row/grouper.cc
+++ b/cpp/src/arrow/compute/row/grouper.cc
@@ -20,12 +20,12 @@
 #include <memory>
 #include <mutex>
 
-#include "arrow/compute/key_hash.h"
+#include "arrow/compute/api_vector.h"
 #include "arrow/compute/function.h"
 #include "arrow/compute/kernels/row_encoder_internal.h"
+#include "arrow/compute/key_hash.h"
 #include "arrow/compute/light_array.h"
 #include "arrow/compute/registry.h"
-#include "arrow/compute/api_vector.h"
 #include "arrow/compute/row/compare_internal.h"
 #include "arrow/type.h"
 #include "arrow/util/bitmap_ops.h"
diff --git a/cpp/src/arrow/compute/util.cc b/cpp/src/arrow/compute/util.cc
index 572a52a5f19..78f90ea37f7 100644
--- a/cpp/src/arrow/compute/util.cc
+++ b/cpp/src/arrow/compute/util.cc
@@ -20,9 +20,9 @@
 #include "arrow/table.h"
 #include "arrow/util/bit_util.h"
 #include "arrow/util/bitmap_ops.h"
+#include "arrow/util/logging.h"
 #include "arrow/util/tracing_internal.h"
 #include "arrow/util/ubsan.h"
-#include "arrow/util/logging.h"
 
 namespace arrow {
 
@@ -30,29 +30,29 @@ using bit_util::CountTrailingZeros;
 
 namespace util {
 
-void TempVectorStack::alloc(uint32_t num_bytes, uint8_t **data, int *id) {
+void TempVectorStack::alloc(uint32_t num_bytes, uint8_t** data, int* id) {
   int64_t old_top = top_;
   top_ += PaddedAllocationSize(num_bytes) + 2 * sizeof(uint64_t);
   // Stack overflow check
-      ARROW_DCHECK(top_ <= buffer_size_);
+  ARROW_DCHECK(top_ <= buffer_size_);
   *data = buffer_->mutable_data() + old_top + sizeof(uint64_t);
   // We set 8 bytes before the beginning of the allocated range and
   // 8 bytes after the end to check for stack overflow (which would
   // result in those known bytes being corrupted).
-  reinterpret_cast<uint64_t *>(buffer_->mutable_data() + old_top)[0] = kGuard1;
-  reinterpret_cast<uint64_t *>(buffer_->mutable_data() + top_)[-1] = kGuard2;
+  reinterpret_cast<uint64_t*>(buffer_->mutable_data() + old_top)[0] = kGuard1;
+  reinterpret_cast<uint64_t*>(buffer_->mutable_data() + top_)[-1] = kGuard2;
   *id = num_vectors_++;
 }
 
 void TempVectorStack::release(int id, uint32_t num_bytes) {
-      ARROW_DCHECK(num_vectors_ == id + 1);
+  ARROW_DCHECK(num_vectors_ == id + 1);
   int64_t size = PaddedAllocationSize(num_bytes) + 2 * sizeof(uint64_t);
-      ARROW_DCHECK(reinterpret_cast<const uint64_t *>(buffer_->mutable_data() + top_)[-1] ==
-      kGuard2);
-      ARROW_DCHECK(top_ >= size);
+  ARROW_DCHECK(reinterpret_cast<const uint64_t*>(buffer_->mutable_data() + top_)[-1] ==
+               kGuard2);
+  ARROW_DCHECK(top_ >= size);
   top_ -= size;
-      ARROW_DCHECK(reinterpret_cast<const uint64_t *>(buffer_->mutable_data() + top_)[0] ==
-      kGuard1);
+  ARROW_DCHECK(reinterpret_cast<const uint64_t*>(buffer_->mutable_data() + top_)[0] ==
+               kGuard1);
   --num_vectors_;
 }
 
diff --git a/cpp/src/arrow/compute/util.h b/cpp/src/arrow/compute/util.h
index 8447d28261c..66082afc0e2 100644
--- a/cpp/src/arrow/compute/util.h
+++ b/cpp/src/arrow/compute/util.h
@@ -81,12 +81,11 @@ class MiniBatch {
 /// but in the context of vectorized processing where we need to store a vector of
 /// temporaries instead of a single value.
 class TempVectorStack {
-  template<typename>
-  friend
-  class TempVectorHolder;
+  template <typename>
+  friend class TempVectorHolder;
 
  public:
-  Status Init(MemoryPool *pool, int64_t size) {
+  Status Init(MemoryPool* pool, int64_t size) {
     num_vectors_ = 0;
     top_ = 0;
     buffer_size_ = PaddedAllocationSize(size) + kPadding + 2 * sizeof(uint64_t);
@@ -108,7 +107,7 @@ class TempVectorStack {
     //
     return ::arrow::bit_util::RoundUp(num_bytes, sizeof(int64_t)) + kPadding;
   }
-  void alloc(uint32_t num_bytes, uint8_t **data, int *id);
+  void alloc(uint32_t num_bytes, uint8_t** data, int* id);
   void release(int id, uint32_t num_bytes);
   static constexpr uint64_t kGuard1 = 0x3141592653589793ULL;
   static constexpr uint64_t kGuard2 = 0x0577215664901532ULL;
@@ -202,5 +201,5 @@ class bit_util {
 #endif
 };
 
-}
-}
\ No newline at end of file
+}  // namespace util
+}  // namespace arrow
\ No newline at end of file
diff --git a/cpp/src/arrow/compute/util_internal.h b/cpp/src/arrow/compute/util_internal.h
index 9d45ede940f..87e89a33507 100644
--- a/cpp/src/arrow/compute/util_internal.h
+++ b/cpp/src/arrow/compute/util_internal.h
@@ -22,9 +22,10 @@
 namespace arrow {
 namespace util {
 
-template<typename T> void CheckAlignment(const void *ptr) {
-      ARROW_DCHECK(reinterpret_cast<uint64_t>(ptr) % sizeof(T) == 0);
+template <typename T>
+void CheckAlignment(const void* ptr) {
+  ARROW_DCHECK(reinterpret_cast<uint64_t>(ptr) % sizeof(T) == 0);
 }
 
-}
-}
+}  // namespace util
+}  // namespace arrow

From b81e2713fcf087edb3ca77bae6201bed1064262e Mon Sep 17 00:00:00 2001
From: Davide Pasetto <dpasetto69@gmail.com>
Date: Wed, 8 Mar 2023 17:08:32 -0500
Subject: [PATCH 08/11] fix light_array test

---
 cpp/src/arrow/compute/exec/CMakeLists.txt     |   6 +-
 .../compute/exec/light_array_exec_test.cc     | 172 ++++++++++++++++++
 cpp/src/arrow/compute/light_array_test.cc     | 144 ---------------
 3 files changed, 177 insertions(+), 145 deletions(-)
 create mode 100644 cpp/src/arrow/compute/exec/light_array_exec_test.cc

diff --git a/cpp/src/arrow/compute/exec/CMakeLists.txt b/cpp/src/arrow/compute/exec/CMakeLists.txt
index cc8e7175a2a..ec854c83936 100644
--- a/cpp/src/arrow/compute/exec/CMakeLists.txt
+++ b/cpp/src/arrow/compute/exec/CMakeLists.txt
@@ -53,7 +53,6 @@ add_arrow_compute_test(pivot_longer_node_test
                        SOURCES
                        pivot_longer_node_test.cc
                        test_nodes.cc)
-
 add_arrow_compute_test(asof_join_node_test
                        REQUIRE_ALL_KERNELS
                        PREFIX
@@ -70,6 +69,11 @@ add_arrow_compute_test(util_test
                        SOURCES
                        util_test.cc
                        task_util_test.cc)
+add_arrow_compute_test(light_array_exec_test
+        PREFIX
+        "arrow-compute"
+        SOURCES
+        light_array_exec_test.cc)
 
 add_arrow_benchmark(expression_benchmark PREFIX "arrow-compute")
 
diff --git a/cpp/src/arrow/compute/exec/light_array_exec_test.cc b/cpp/src/arrow/compute/exec/light_array_exec_test.cc
new file mode 100644
index 00000000000..ff452fa91fe
--- /dev/null
+++ b/cpp/src/arrow/compute/exec/light_array_exec_test.cc
@@ -0,0 +1,172 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/compute/light_array.h"
+
+#include <gtest/gtest.h>
+
+#include "arrow/compute/exec/test_util.h"
+#include "arrow/testing/gtest_util.h"
+#include "arrow/type.h"
+#include "arrow/util/checked_cast.h"
+
+namespace arrow {
+namespace compute {
+
+ TEST(KeyColumnArray, FromExecBatch) {
+  ExecBatch batch =
+      ExecBatchFromJSON({int64(), boolean()}, "[[1, true], [2, false], [null, null]]");
+  std::vector<KeyColumnArray> arrays;
+  ASSERT_OK(ColumnArraysFromExecBatch(batch, &arrays));
+
+  ASSERT_EQ(2, arrays.size());
+  ASSERT_EQ(8, arrays[0].metadata().fixed_length);
+  ASSERT_EQ(0, arrays[1].metadata().fixed_length);
+  ASSERT_EQ(3, arrays[0].length());
+  ASSERT_EQ(3, arrays[1].length());
+
+  ASSERT_OK(ColumnArraysFromExecBatch(batch, 1, 1, &arrays));
+
+  ASSERT_EQ(2, arrays.size());
+  ASSERT_EQ(8, arrays[0].metadata().fixed_length);
+  ASSERT_EQ(0, arrays[1].metadata().fixed_length);
+  ASSERT_EQ(1, arrays[0].length());
+  ASSERT_EQ(1, arrays[1].length());
+}
+
+TEST(ExecBatchBuilder, AppendBatches) {
+  std::unique_ptr<MemoryPool> owned_pool = MemoryPool::CreateDefault();
+  MemoryPool* pool = owned_pool.get();
+  ExecBatch batch_one =
+      ExecBatchFromJSON({int64(), boolean()}, "[[1, true], [2, false], [null, null]]");
+  ExecBatch batch_two =
+      ExecBatchFromJSON({int64(), boolean()}, "[[null, true], [5, true], [6, false]]");
+  ExecBatch combined = ExecBatchFromJSON(
+      {int64(), boolean()},
+      "[[1, true], [2, false], [null, null], [null, true], [5, true], [6, false]]");
+  {
+    ExecBatchBuilder builder;
+    uint16_t row_ids[3] = {0, 1, 2};
+    ASSERT_OK(builder.AppendSelected(pool, batch_one, 3, row_ids, /*num_cols=*/2));
+    ASSERT_OK(builder.AppendSelected(pool, batch_two, 3, row_ids, /*num_cols=*/2));
+    ExecBatch built = builder.Flush();
+    ASSERT_EQ(combined, built);
+    ASSERT_NE(0, pool->bytes_allocated());
+  }
+  ASSERT_EQ(0, pool->bytes_allocated());
+}
+
+TEST(ExecBatchBuilder, AppendBatchesSomeRows) {
+  std::unique_ptr<MemoryPool> owned_pool = MemoryPool::CreateDefault();
+  MemoryPool* pool = owned_pool.get();
+  ExecBatch batch_one =
+      ExecBatchFromJSON({int64(), boolean()}, "[[1, true], [2, false], [null, null]]");
+  ExecBatch batch_two =
+      ExecBatchFromJSON({int64(), boolean()}, "[[null, true], [5, true], [6, false]]");
+  ExecBatch combined = ExecBatchFromJSON(
+      {int64(), boolean()}, "[[1, true], [2, false], [null, true], [5, true]]");
+  {
+    ExecBatchBuilder builder;
+    uint16_t row_ids[2] = {0, 1};
+    ASSERT_OK(builder.AppendSelected(pool, batch_one, 2, row_ids, /*num_cols=*/2));
+    ASSERT_OK(builder.AppendSelected(pool, batch_two, 2, row_ids, /*num_cols=*/2));
+    ExecBatch built = builder.Flush();
+    ASSERT_EQ(combined, built);
+    ASSERT_NE(0, pool->bytes_allocated());
+  }
+  ASSERT_EQ(0, pool->bytes_allocated());
+}
+
+TEST(ExecBatchBuilder, AppendBatchesSomeCols) {
+  std::unique_ptr<MemoryPool> owned_pool = MemoryPool::CreateDefault();
+  MemoryPool* pool = owned_pool.get();
+  ExecBatch batch_one =
+      ExecBatchFromJSON({int64(), boolean()}, "[[1, true], [2, false], [null, null]]");
+  ExecBatch batch_two =
+      ExecBatchFromJSON({int64(), boolean()}, "[[null, true], [5, true], [6, false]]");
+  ExecBatch first_col_only =
+      ExecBatchFromJSON({int64()}, "[[1], [2], [null], [null], [5], [6]]");
+  ExecBatch last_col_only = ExecBatchFromJSON(
+      {boolean()}, "[[true], [false], [null], [true], [true], [false]]");
+  {
+    ExecBatchBuilder builder;
+    uint16_t row_ids[3] = {0, 1, 2};
+    int first_col_ids[1] = {0};
+    ASSERT_OK(builder.AppendSelected(pool, batch_one, 3, row_ids, /*num_cols=*/1,
+                                     first_col_ids));
+    ASSERT_OK(builder.AppendSelected(pool, batch_two, 3, row_ids, /*num_cols=*/1,
+                                     first_col_ids));
+    ExecBatch built = builder.Flush();
+    ASSERT_EQ(first_col_only, built);
+    ASSERT_NE(0, pool->bytes_allocated());
+  }
+  {
+    ExecBatchBuilder builder;
+    uint16_t row_ids[3] = {0, 1, 2};
+    // If we don't specify col_ids and num_cols is 1 it is implicitly the first col
+    ASSERT_OK(builder.AppendSelected(pool, batch_one, 3, row_ids, /*num_cols=*/1));
+    ASSERT_OK(builder.AppendSelected(pool, batch_two, 3, row_ids, /*num_cols=*/1));
+    ExecBatch built = builder.Flush();
+    ASSERT_EQ(first_col_only, built);
+    ASSERT_NE(0, pool->bytes_allocated());
+  }
+  {
+    ExecBatchBuilder builder;
+    uint16_t row_ids[3] = {0, 1, 2};
+    int last_col_ids[1] = {1};
+    ASSERT_OK(builder.AppendSelected(pool, batch_one, 3, row_ids, /*num_cols=*/1,
+                                     last_col_ids));
+    ASSERT_OK(builder.AppendSelected(pool, batch_two, 3, row_ids, /*num_cols=*/1,
+                                     last_col_ids));
+    ExecBatch built = builder.Flush();
+    ASSERT_EQ(last_col_only, built);
+    ASSERT_NE(0, pool->bytes_allocated());
+  }
+  ASSERT_EQ(0, pool->bytes_allocated());
+}
+
+ TEST(ExecBatchBuilder, AppendNulls) {
+  std::unique_ptr<MemoryPool> owned_pool = MemoryPool::CreateDefault();
+  MemoryPool* pool = owned_pool.get();
+  ExecBatch batch_one =
+      ExecBatchFromJSON({int64(), boolean()}, "[[1, true], [2, false], [null, null]]");
+  ExecBatch combined = ExecBatchFromJSON(
+      {int64(), boolean()},
+      "[[1, true], [2, false], [null, null], [null, null], [null, null]]");
+  ExecBatch just_nulls =
+      ExecBatchFromJSON({int64(), boolean()}, "[[null, null], [null, null]]");
+  {
+    ExecBatchBuilder builder;
+    uint16_t row_ids[3] = {0, 1, 2};
+    ASSERT_OK(builder.AppendSelected(pool, batch_one, 3, row_ids, /*num_cols=*/2));
+    ASSERT_OK(builder.AppendNulls(pool, {int64(), boolean()}, 2));
+    ExecBatch built = builder.Flush();
+    ASSERT_EQ(combined, built);
+    ASSERT_NE(0, pool->bytes_allocated());
+  }
+  {
+    ExecBatchBuilder builder;
+    ASSERT_OK(builder.AppendNulls(pool, {int64(), boolean()}, 2));
+    ExecBatch built = builder.Flush();
+    ASSERT_EQ(just_nulls, built);
+    ASSERT_NE(0, pool->bytes_allocated());
+  }
+  ASSERT_EQ(0, pool->bytes_allocated());
+}
+
+}  // namespace compute
+}  // namespace arrow
diff --git a/cpp/src/arrow/compute/light_array_test.cc b/cpp/src/arrow/compute/light_array_test.cc
index 8b0e38e64c8..c5e83b546b6 100644
--- a/cpp/src/arrow/compute/light_array_test.cc
+++ b/cpp/src/arrow/compute/light_array_test.cc
@@ -20,7 +20,6 @@
 #include <gtest/gtest.h>
 #include <numeric>
 
-// DIPO #include "arrow/compute/exec/test_util.h"
 #include "arrow/testing/generator.h"
 #include "arrow/testing/gtest_util.h"
 #include "arrow/type.h"
@@ -216,28 +215,6 @@ TEST(KeyColumnArray, SliceBool) {
   }
 }
 
-// DIPO
-// TEST(KeyColumnArray, FromExecBatch) {
-//  ExecBatch batch =
-//      ExecBatchFromJSON({int64(), boolean()}, "[[1, true], [2, false], [null, null]]");
-//  std::vector<KeyColumnArray> arrays;
-//  ASSERT_OK(ColumnArraysFromExecBatch(batch, &arrays));
-//
-//  ASSERT_EQ(2, arrays.size());
-//  ASSERT_EQ(8, arrays[0].metadata().fixed_length);
-//  ASSERT_EQ(0, arrays[1].metadata().fixed_length);
-//  ASSERT_EQ(3, arrays[0].length());
-//  ASSERT_EQ(3, arrays[1].length());
-//
-//  ASSERT_OK(ColumnArraysFromExecBatch(batch, 1, 1, &arrays));
-//
-//  ASSERT_EQ(2, arrays.size());
-//  ASSERT_EQ(8, arrays[0].metadata().fixed_length);
-//  ASSERT_EQ(0, arrays[1].metadata().fixed_length);
-//  ASSERT_EQ(1, arrays[0].length());
-//  ASSERT_EQ(1, arrays[1].length());
-//}
-
 TEST(ResizableArrayData, Basic) {
   std::unique_ptr<MemoryPool> pool = MemoryPool::CreateDefault();
   for (const auto& type : kSampleFixedDataTypes) {
@@ -315,127 +292,6 @@ TEST(ResizableArrayData, Binary) {
     ASSERT_EQ(0, pool->bytes_allocated());
   }
 }
-// DIPO
-// TEST(ExecBatchBuilder, AppendBatches) {
-//  std::unique_ptr<MemoryPool> owned_pool = MemoryPool::CreateDefault();
-//  MemoryPool* pool = owned_pool.get();
-//  ExecBatch batch_one =
-//      ExecBatchFromJSON({int64(), boolean()}, "[[1, true], [2, false], [null, null]]");
-//  ExecBatch batch_two =
-//      ExecBatchFromJSON({int64(), boolean()}, "[[null, true], [5, true], [6, false]]");
-//  ExecBatch combined = ExecBatchFromJSON(
-//      {int64(), boolean()},
-//      "[[1, true], [2, false], [null, null], [null, true], [5, true], [6, false]]");
-//  {
-//    ExecBatchBuilder builder;
-//    uint16_t row_ids[3] = {0, 1, 2};
-//    ASSERT_OK(builder.AppendSelected(pool, batch_one, 3, row_ids, /*num_cols=*/2));
-//    ASSERT_OK(builder.AppendSelected(pool, batch_two, 3, row_ids, /*num_cols=*/2));
-//    ExecBatch built = builder.Flush();
-//    ASSERT_EQ(combined, built);
-//    ASSERT_NE(0, pool->bytes_allocated());
-//  }
-//  ASSERT_EQ(0, pool->bytes_allocated());
-//}
-
-// DIPO
-// TEST(ExecBatchBuilder, AppendBatchesSomeRows) {
-//  std::unique_ptr<MemoryPool> owned_pool = MemoryPool::CreateDefault();
-//  MemoryPool* pool = owned_pool.get();
-//  ExecBatch batch_one =
-//      ExecBatchFromJSON({int64(), boolean()}, "[[1, true], [2, false], [null, null]]");
-//  ExecBatch batch_two =
-//      ExecBatchFromJSON({int64(), boolean()}, "[[null, true], [5, true], [6, false]]");
-//  ExecBatch combined = ExecBatchFromJSON(
-//      {int64(), boolean()}, "[[1, true], [2, false], [null, true], [5, true]]");
-//  {
-//    ExecBatchBuilder builder;
-//    uint16_t row_ids[2] = {0, 1};
-//    ASSERT_OK(builder.AppendSelected(pool, batch_one, 2, row_ids, /*num_cols=*/2));
-//    ASSERT_OK(builder.AppendSelected(pool, batch_two, 2, row_ids, /*num_cols=*/2));
-//    ExecBatch built = builder.Flush();
-//    ASSERT_EQ(combined, built);
-//    ASSERT_NE(0, pool->bytes_allocated());
-//  }
-//  ASSERT_EQ(0, pool->bytes_allocated());
-//}
-// DIPO
-// TEST(ExecBatchBuilder, AppendBatchesSomeCols) {
-//  std::unique_ptr<MemoryPool> owned_pool = MemoryPool::CreateDefault();
-//  MemoryPool* pool = owned_pool.get();
-//  ExecBatch batch_one =
-//      ExecBatchFromJSON({int64(), boolean()}, "[[1, true], [2, false], [null, null]]");
-//  ExecBatch batch_two =
-//      ExecBatchFromJSON({int64(), boolean()}, "[[null, true], [5, true], [6, false]]");
-//  ExecBatch first_col_only =
-//      ExecBatchFromJSON({int64()}, "[[1], [2], [null], [null], [5], [6]]");
-//  ExecBatch last_col_only = ExecBatchFromJSON(
-//      {boolean()}, "[[true], [false], [null], [true], [true], [false]]");
-//  {
-//    ExecBatchBuilder builder;
-//    uint16_t row_ids[3] = {0, 1, 2};
-//    int first_col_ids[1] = {0};
-//    ASSERT_OK(builder.AppendSelected(pool, batch_one, 3, row_ids, /*num_cols=*/1,
-//                                     first_col_ids));
-//    ASSERT_OK(builder.AppendSelected(pool, batch_two, 3, row_ids, /*num_cols=*/1,
-//                                     first_col_ids));
-//    ExecBatch built = builder.Flush();
-//    ASSERT_EQ(first_col_only, built);
-//    ASSERT_NE(0, pool->bytes_allocated());
-//  }
-//  {
-//    ExecBatchBuilder builder;
-//    uint16_t row_ids[3] = {0, 1, 2};
-//    // If we don't specify col_ids and num_cols is 1 it is implicitly the first col
-//    ASSERT_OK(builder.AppendSelected(pool, batch_one, 3, row_ids, /*num_cols=*/1));
-//    ASSERT_OK(builder.AppendSelected(pool, batch_two, 3, row_ids, /*num_cols=*/1));
-//    ExecBatch built = builder.Flush();
-//    ASSERT_EQ(first_col_only, built);
-//    ASSERT_NE(0, pool->bytes_allocated());
-//  }
-//  {
-//    ExecBatchBuilder builder;
-//    uint16_t row_ids[3] = {0, 1, 2};
-//    int last_col_ids[1] = {1};
-//    ASSERT_OK(builder.AppendSelected(pool, batch_one, 3, row_ids, /*num_cols=*/1,
-//                                     last_col_ids));
-//    ASSERT_OK(builder.AppendSelected(pool, batch_two, 3, row_ids, /*num_cols=*/1,
-//                                     last_col_ids));
-//    ExecBatch built = builder.Flush();
-//    ASSERT_EQ(last_col_only, built);
-//    ASSERT_NE(0, pool->bytes_allocated());
-//  }
-//  ASSERT_EQ(0, pool->bytes_allocated());
-//}
-//
-// TEST(ExecBatchBuilder, AppendNulls) {
-//  std::unique_ptr<MemoryPool> owned_pool = MemoryPool::CreateDefault();
-//  MemoryPool* pool = owned_pool.get();
-//  ExecBatch batch_one =
-//      ExecBatchFromJSON({int64(), boolean()}, "[[1, true], [2, false], [null, null]]");
-//  ExecBatch combined = ExecBatchFromJSON(
-//      {int64(), boolean()},
-//      "[[1, true], [2, false], [null, null], [null, null], [null, null]]");
-//  ExecBatch just_nulls =
-//      ExecBatchFromJSON({int64(), boolean()}, "[[null, null], [null, null]]");
-//  {
-//    ExecBatchBuilder builder;
-//    uint16_t row_ids[3] = {0, 1, 2};
-//    ASSERT_OK(builder.AppendSelected(pool, batch_one, 3, row_ids, /*num_cols=*/2));
-//    ASSERT_OK(builder.AppendNulls(pool, {int64(), boolean()}, 2));
-//    ExecBatch built = builder.Flush();
-//    ASSERT_EQ(combined, built);
-//    ASSERT_NE(0, pool->bytes_allocated());
-//  }
-//  {
-//    ExecBatchBuilder builder;
-//    ASSERT_OK(builder.AppendNulls(pool, {int64(), boolean()}, 2));
-//    ExecBatch built = builder.Flush();
-//    ASSERT_EQ(just_nulls, built);
-//    ASSERT_NE(0, pool->bytes_allocated());
-//  }
-//  ASSERT_EQ(0, pool->bytes_allocated());
-//}
 
 TEST(ExecBatchBuilder, AppendNullsBeyondLimit) {
   std::unique_ptr<MemoryPool> owned_pool = MemoryPool::CreateDefault();

From 2401904ed04db5bfbd2d2ca7985e2017fee4b3f4 Mon Sep 17 00:00:00 2001
From: Davide Pasetto <dpasetto69@gmail.com>
Date: Wed, 8 Mar 2023 17:26:24 -0500
Subject: [PATCH 09/11] fix format

---
 cpp/src/arrow/compute/exec/light_array_exec_test.cc | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/cpp/src/arrow/compute/exec/light_array_exec_test.cc b/cpp/src/arrow/compute/exec/light_array_exec_test.cc
index ff452fa91fe..cbc93299a5b 100644
--- a/cpp/src/arrow/compute/exec/light_array_exec_test.cc
+++ b/cpp/src/arrow/compute/exec/light_array_exec_test.cc
@@ -27,7 +27,7 @@
 namespace arrow {
 namespace compute {
 
- TEST(KeyColumnArray, FromExecBatch) {
+TEST(KeyColumnArray, FromExecBatch) {
   ExecBatch batch =
       ExecBatchFromJSON({int64(), boolean()}, "[[1, true], [2, false], [null, null]]");
   std::vector<KeyColumnArray> arrays;
@@ -139,7 +139,7 @@ TEST(ExecBatchBuilder, AppendBatchesSomeCols) {
   ASSERT_EQ(0, pool->bytes_allocated());
 }
 
- TEST(ExecBatchBuilder, AppendNulls) {
+TEST(ExecBatchBuilder, AppendNulls) {
   std::unique_ptr<MemoryPool> owned_pool = MemoryPool::CreateDefault();
   MemoryPool* pool = owned_pool.get();
   ExecBatch batch_one =

From 93778dbe4486f1415c51fdfcffb5e1a440f9ddec Mon Sep 17 00:00:00 2001
From: Davide Pasetto <dpasetto69@gmail.com>
Date: Thu, 9 Mar 2023 09:00:02 -0500
Subject: [PATCH 10/11] fix lint

---
 cpp/src/arrow/compute/key_hash.cc | 2 +-
 cpp/src/arrow/compute/util.h      | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/cpp/src/arrow/compute/key_hash.cc b/cpp/src/arrow/compute/key_hash.cc
index 993d68b1fa3..3fcfbf3d831 100644
--- a/cpp/src/arrow/compute/key_hash.cc
+++ b/cpp/src/arrow/compute/key_hash.cc
@@ -15,7 +15,7 @@
 // specific language governing permissions and limitations
 // under the License.
 
-#include "key_hash.h"
+#include "arrow/compute/key_hash.h"
 
 #include <memory.h>
 
diff --git a/cpp/src/arrow/compute/util.h b/cpp/src/arrow/compute/util.h
index 66082afc0e2..60c20137c8c 100644
--- a/cpp/src/arrow/compute/util.h
+++ b/cpp/src/arrow/compute/util.h
@@ -202,4 +202,4 @@ class bit_util {
 };
 
 }  // namespace util
-}  // namespace arrow
\ No newline at end of file
+}  // namespace arrow

From 24edd64febbf0288a780b7e4e5c8600f5f50657f Mon Sep 17 00:00:00 2001
From: Davide Pasetto <dpasetto69@gmail.com>
Date: Thu, 9 Mar 2023 09:22:29 -0500
Subject: [PATCH 11/11] fix cmake format

---
 cpp/src/arrow/compute/exec/CMakeLists.txt | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/cpp/src/arrow/compute/exec/CMakeLists.txt b/cpp/src/arrow/compute/exec/CMakeLists.txt
index ec854c83936..914840b0396 100644
--- a/cpp/src/arrow/compute/exec/CMakeLists.txt
+++ b/cpp/src/arrow/compute/exec/CMakeLists.txt
@@ -70,10 +70,10 @@ add_arrow_compute_test(util_test
                        util_test.cc
                        task_util_test.cc)
 add_arrow_compute_test(light_array_exec_test
-        PREFIX
-        "arrow-compute"
-        SOURCES
-        light_array_exec_test.cc)
+                       PREFIX
+                       "arrow-compute"
+                       SOURCES
+                       light_array_exec_test.cc)
 
 add_arrow_benchmark(expression_benchmark PREFIX "arrow-compute")