From 7ea173cd54f67ded0bd14afca6eae926609a5698 Mon Sep 17 00:00:00 2001 From: Will Jones Date: Mon, 6 Jun 2022 17:51:13 -0700 Subject: [PATCH 01/15] feat: implement dispatch for ChunkedArray take --- .../arrow/compute/kernels/vector_selection.cc | 89 ++++++++----------- 1 file changed, 35 insertions(+), 54 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/vector_selection.cc b/cpp/src/arrow/compute/kernels/vector_selection.cc index 5060b06465b..9e2d42ddefb 100644 --- a/cpp/src/arrow/compute/kernels/vector_selection.cc +++ b/cpp/src/arrow/compute/kernels/vector_selection.cc @@ -2005,70 +2005,51 @@ Result> TakeAA(const std::shared_ptr& valu return result.array(); } +Result> TakeCC(const ChunkedArray& values, + const ChunkedArray& indices, + const TakeOptions& options, + ExecContext* ctx) { + ARROW_ASSIGN_OR_RAISE( + Datum result, + CallFunction("array_take", {Datum(values), Datum(indices)}, &options, ctx)); + return result.chunked_array(); +} + Result> TakeCA(const ChunkedArray& values, - const Array& indices, + std::shared_ptr indices, const TakeOptions& options, ExecContext* ctx) { auto num_chunks = values.num_chunks(); - std::shared_ptr current_chunk; - - // Case 1: `values` has a single chunk, so just use it - if (num_chunks == 1) { - current_chunk = values.chunk(0); - } else { - // TODO Case 2: See if all `indices` fall in the same chunk and call Array Take on it - // See - // https://github.com/apache/arrow/blob/6f2c9041137001f7a9212f244b51bc004efc29af/r/src/compute.cpp#L123-L151 - // TODO Case 3: If indices are sorted, can slice them and call Array Take - // Case 4: Else, concatenate chunks and call Array Take - if (values.chunks().empty()) { + // If `values` has zero or one chunks, just use the AA implementation + if (num_chunks <= 1) { + std::shared_ptr current_chunk; + // Case 1: `values` has a single chunk, so just use it + if (num_chunks == 1) { + current_chunk = values.chunk(0); + } else { + // Case 2: `values` has no chunks, so create an empty one 
ARROW_ASSIGN_OR_RAISE(current_chunk, MakeArrayOfNull(values.type(), /*length=*/0, ctx->memory_pool())); - } else { - ARROW_ASSIGN_OR_RAISE(current_chunk, - Concatenate(values.chunks(), ctx->memory_pool())); } + // Call Array Take on our single chunk + ARROW_ASSIGN_OR_RAISE(std::shared_ptr new_chunk, + TakeAA(current_chunk->data(), indices->data(), options, ctx)); + std::vector> chunks = {MakeArray(new_chunk)}; + return std::make_shared(std::move(chunks)); + // Case 3: + } else { + ChunkedArray indices_chunked(indices); + return TakeCC(values, indices_chunked, options, ctx); } - // Call Array Take on our single chunk - ARROW_ASSIGN_OR_RAISE(std::shared_ptr new_chunk, - TakeAA(current_chunk->data(), indices.data(), options, ctx)); - std::vector> chunks = {MakeArray(new_chunk)}; - return std::make_shared(std::move(chunks)); -} - -Result> TakeCC(const ChunkedArray& values, - const ChunkedArray& indices, - const TakeOptions& options, - ExecContext* ctx) { - auto num_chunks = indices.num_chunks(); - std::vector> new_chunks(num_chunks); - for (int i = 0; i < num_chunks; i++) { - // Take with that indices chunk - // Note that as currently implemented, this is inefficient because `values` - // will get concatenated on every iteration of this loop - ARROW_ASSIGN_OR_RAISE(std::shared_ptr current_chunk, - TakeCA(values, *indices.chunk(i), options, ctx)); - // Concatenate the result to make a single array for this chunk - ARROW_ASSIGN_OR_RAISE(new_chunks[i], - Concatenate(current_chunk->chunks(), ctx->memory_pool())); - } - return std::make_shared(std::move(new_chunks), values.type()); } -Result> TakeAC(const Array& values, +Result> TakeAC(std::shared_ptr values, const ChunkedArray& indices, const TakeOptions& options, ExecContext* ctx) { - auto num_chunks = indices.num_chunks(); - std::vector> new_chunks(num_chunks); - for (int i = 0; i < num_chunks; i++) { - // Take with that indices chunk - ARROW_ASSIGN_OR_RAISE(std::shared_ptr chunk, - TakeAA(values.data(), 
indices.chunk(i)->data(), options, ctx)); - new_chunks[i] = MakeArray(chunk); - } - return std::make_shared(std::move(new_chunks), values.type()); + ChunkedArray values_chunked(values); + return TakeCC(values_chunked, indices, options, ctx); } Result> TakeRA(const RecordBatch& batch, @@ -2086,7 +2067,7 @@ Result> TakeRA(const RecordBatch& batch, return RecordBatch::Make(batch.schema(), nrows, std::move(columns)); } -Result> TakeTA(const Table& table, const Array& indices, +Result> TakeTA(const Table& table, std::shared_ptr indices, const TakeOptions& options, ExecContext* ctx) { auto ncols = table.num_columns(); std::vector> columns(ncols); @@ -2138,12 +2119,12 @@ class TakeMetaFunction : public MetaFunction { if (index_kind == Datum::ARRAY) { return TakeAA(args[0].array(), args[1].array(), take_opts, ctx); } else if (index_kind == Datum::CHUNKED_ARRAY) { - return TakeAC(*args[0].make_array(), *args[1].chunked_array(), take_opts, ctx); + return TakeAC(args[0].make_array(), *args[1].chunked_array(), take_opts, ctx); } break; case Datum::CHUNKED_ARRAY: if (index_kind == Datum::ARRAY) { - return TakeCA(*args[0].chunked_array(), *args[1].make_array(), take_opts, ctx); + return TakeCA(*args[0].chunked_array(), args[1].make_array(), take_opts, ctx); } else if (index_kind == Datum::CHUNKED_ARRAY) { return TakeCC(*args[0].chunked_array(), *args[1].chunked_array(), take_opts, ctx); @@ -2156,7 +2137,7 @@ class TakeMetaFunction : public MetaFunction { break; case Datum::TABLE: if (index_kind == Datum::ARRAY) { - return TakeTA(*args[0].table(), *args[1].make_array(), take_opts, ctx); + return TakeTA(*args[0].table(), args[1].make_array(), take_opts, ctx); } else if (index_kind == Datum::CHUNKED_ARRAY) { return TakeTC(*args[0].table(), *args[1].chunked_array(), take_opts, ctx); } From d7e7c10040ba5881b796bfce0c7c6e6b8eea1855 Mon Sep 17 00:00:00 2001 From: Will Jones Date: Fri, 12 Aug 2022 16:07:28 -0700 Subject: [PATCH 02/15] Draft implementation for ChunkedPrimitiveTake --- 
cpp/src/arrow/compute/kernel.h | 2 +- .../arrow/compute/kernels/vector_selection.cc | 264 ++++++++++++++++-- cpp/src/arrow/util/int_util.cc | 8 + cpp/src/arrow/util/int_util.h | 3 + 4 files changed, 246 insertions(+), 31 deletions(-) diff --git a/cpp/src/arrow/compute/kernel.h b/cpp/src/arrow/compute/kernel.h index d8960308dff..077bdcb31fa 100644 --- a/cpp/src/arrow/compute/kernel.h +++ b/cpp/src/arrow/compute/kernel.h @@ -538,7 +538,7 @@ struct ScalarKernel : public Kernel { // ---------------------------------------------------------------------- // VectorKernel (for VectorFunction) -/// \brief Kernel data structure for implementations of VectorFunction. In +/// \brief Kernel data structure for implementations of VectorFunction. It /// contains an optional finalizer function, the null handling and memory /// pre-allocation preferences (which have different defaults from /// ScalarKernel), and some other execution-related options. diff --git a/cpp/src/arrow/compute/kernels/vector_selection.cc b/cpp/src/arrow/compute/kernels/vector_selection.cc index 9e2d42ddefb..3e72f854efa 100644 --- a/cpp/src/arrow/compute/kernels/vector_selection.cc +++ b/cpp/src/arrow/compute/kernels/vector_selection.cc @@ -28,6 +28,7 @@ #include "arrow/array/builder_primitive.h" #include "arrow/array/concatenate.h" #include "arrow/buffer_builder.h" +#include "arrow/chunk_resolver.h" #include "arrow/chunked_array.h" #include "arrow/compute/api_vector.h" #include "arrow/compute/kernels/common.h" @@ -43,6 +44,7 @@ #include "arrow/util/bitmap_ops.h" #include "arrow/util/bitmap_reader.h" #include "arrow/util/int_util.h" +#include "arrow/util/vector.h" namespace arrow { @@ -50,8 +52,11 @@ using internal::BinaryBitBlockCounter; using internal::BitBlockCount; using internal::BitBlockCounter; using internal::CheckIndexBounds; +using internal::ChunkLocation; +using internal::ChunkResolver; using internal::CopyBitmap; using internal::CountSetBits; +using internal::MapVector; using 
internal::OptionalBitBlockCounter; using internal::OptionalBitIndexer; @@ -365,6 +370,126 @@ struct PrimitiveTakeImpl { } out_arr->null_count = out_arr->length - valid_count; } + + static void Exec(const ChunkedArray& values, const ChunkedArray& indices_chunked, + ArrayData* out_arr) { + auto values_resolver = ChunkResolver(values.chunks()); + const std::vector values_data = MapVector( + [](const auto& x) { return x->data()->template GetValues(1); }, + values.chunks()); + const std::vector values_is_valid = + MapVector([](const auto& x) { return x->null_bitmap_data(); }, values.chunks()); + const std::vector values_offset = + MapVector([](const auto& x) { return x->offset(); }, values.chunks()); + + auto out = out_arr->GetMutableValues(1); + uint8_t* out_is_valid = out_arr->buffers[0]->mutable_data(); + int64_t out_offset = out_arr->offset; + + int64_t position = 0; // Position in output array + int64_t valid_count = 0; + int64_t internal_offset = 0; // Total length of indices chunks already processed + + for (const auto& indices_chunk : indices_chunked.chunks()) { + const ArraySpan indices = ArraySpan(*indices_chunk.get()->data()); + // TODO: How do we reduce duplication of code? NOTE(review): `indices_data` and `indices_is_valid` below are per-chunk pointers, yet the loop bodies index them with the global `position`; for every indices chunk after the first this reads past the chunk — the local index is `position - internal_offset`. Likewise `out_offset += indices.length` at the end of this loop double-advances the output, since `position` already accumulates across chunks while `out` / `out_is_valid` are addressed as `out_offset + position`. + const IndexCType* indices_data = indices.GetValues(1); + const uint8_t* indices_is_valid = indices.buffers[0].data; + int64_t indices_offset = indices.offset; + + // If either the values or indices have nulls, we preemptively zero out the + // out validity bitmap so that we don't have to use ClearBit in each + // iteration for nulls. 
+ if (values.null_count() != 0 || indices.null_count != 0) { + bit_util::SetBitsTo(out_is_valid, out_offset, indices.length, false); + } + + OptionalBitBlockCounter indices_bit_counter(indices_is_valid, indices_offset, + indices.length); + + while (position < internal_offset + indices.length) { + BitBlockCount block = indices_bit_counter.NextBlock(); + if (values.null_count() == 0) { + // Values are never null, so things are easier + valid_count += block.popcount; + if (block.popcount == block.length) { + // Fastest path: neither values nor index nulls + bit_util::SetBitsTo(out_is_valid, out_offset + position, block.length, true); + for (int64_t i = 0; i < block.length; ++i) { + int64_t idx = indices_data[position]; + ChunkLocation loc = values_resolver.Resolve(idx); + out[position] = values_data[loc.chunk_index][loc.index_in_chunk]; + ++position; + } + } else if (block.popcount > 0) { + // Slow path: some indices but not all are null + for (int64_t i = 0; i < block.length; ++i) { + if (bit_util::GetBit(indices_is_valid, indices_offset + position)) { + // index is not null + bit_util::SetBit(out_is_valid, out_offset + position); + int64_t idx = indices_data[position]; + ChunkLocation loc = values_resolver.Resolve(idx); + out[position] = values_data[loc.chunk_index][loc.index_in_chunk]; + } else { + out[position] = ValueCType{}; + } + ++position; + } + } else { + memset(out + position, 0, sizeof(ValueCType) * block.length); + position += block.length; + } + } else { + // Values have nulls, so we must do random access into the values bitmap + if (block.popcount == block.length) { + // Faster path: indices are not null but values may be + for (int64_t i = 0; i < block.length; ++i) { + int64_t idx = indices_data[position]; + ChunkLocation loc = values_resolver.Resolve(idx); + if (bit_util::GetBit(values_is_valid[loc.chunk_index], + values_offset[loc.chunk_index] + loc.index_in_chunk)) { + // value is not null + out[position] = 
values_data[loc.chunk_index][loc.index_in_chunk]; + bit_util::SetBit(out_is_valid, out_offset + position); + ++valid_count; + } else { + out[position] = ValueCType{}; + } + ++position; + } + } else if (block.popcount > 0) { + // Slow path: some but not all indices are null. Since we are doing + // random access in general we have to check the value nullness one by + // one. + for (int64_t i = 0; i < block.length; ++i) { + int64_t idx = indices_data[position]; + ChunkLocation loc = values_resolver.Resolve(idx); + if (bit_util::GetBit(indices_is_valid, indices_offset + position) && + bit_util::GetBit(values_is_valid[loc.chunk_index], + values_offset[loc.chunk_index] + loc.index_in_chunk)) { + // index is not null && value is not null + out[position] = values_data[loc.chunk_index][loc.index_in_chunk]; + bit_util::SetBit(out_is_valid, out_offset + position); + ++valid_count; + } else { + out[position] = ValueCType{}; + } + ++position; + } + } else { + memset(out + position, 0, sizeof(ValueCType) * block.length); + position += block.length; + } + } + } + + // Start next output at end of what we just wrote. + out_offset += indices.length; + internal_offset += indices.length; + } + + out_arr->null_count = out_arr->length - valid_count; + } }; template @@ -464,6 +589,11 @@ struct BooleanTakeImpl { } out_arr->null_count = out_arr->length - valid_count; } + + static void Exec(const ChunkedArray& values, const ChunkedArray& indices_chunked, + ArrayData* out_arr) { + // TODO + } }; template