diff --git a/cpp/src/arrow/array/array_primitive.cc b/cpp/src/arrow/array/array_primitive.cc index 5312c3ece14..7c4a14d9340 100644 --- a/cpp/src/arrow/array/array_primitive.cc +++ b/cpp/src/arrow/array/array_primitive.cc @@ -58,18 +58,9 @@ int64_t BooleanArray::false_count() const { int64_t BooleanArray::true_count() const { if (data_->null_count.load() != 0) { DCHECK(data_->buffers[0]); - internal::BinaryBitBlockCounter bit_counter(data_->buffers[0]->data(), data_->offset, - data_->buffers[1]->data(), data_->offset, - data_->length); - int64_t count = 0; - while (true) { - internal::BitBlockCount block = bit_counter.NextAndWord(); - if (block.length == 0) { - break; - } - count += block.popcount; - } - return count; + return internal::CountAndSetBits(data_->buffers[0]->data(), data_->offset, + data_->buffers[1]->data(), data_->offset, + data_->length); } else { return internal::CountSetBits(data_->buffers[1]->data(), data_->offset, data_->length); diff --git a/cpp/src/arrow/array/data.cc b/cpp/src/arrow/array/data.cc index 653d206f015..37db8ccb775 100644 --- a/cpp/src/arrow/array/data.cc +++ b/cpp/src/arrow/array/data.cc @@ -151,6 +151,13 @@ void ArraySpan::SetMembers(const ArrayData& data) { } } + Type::type type_id = this->type->id(); + if (data.buffers[0] == nullptr && type_id != Type::NA && + type_id != Type::SPARSE_UNION && type_id != Type::DENSE_UNION) { + // This should already be zero but we make for sure + this->null_count = 0; + } + // Makes sure any other buffers are seen as null / non-existent for (int i = static_cast(data.buffers.size()); i < 3; ++i) { ClearBuffer(i); @@ -208,7 +215,6 @@ int64_t ArraySpan::GetNullCount() const { int GetNumBuffers(const DataType& type) { switch (type.id()) { case Type::NA: - return 0; case Type::STRUCT: case Type::FIXED_SIZE_LIST: return 1; @@ -232,7 +238,7 @@ int ArraySpan::num_buffers() const { return GetNumBuffers(*this->type); } std::shared_ptr ArraySpan::ToArrayData() const { auto result = std::make_shared(this->type->Copy(), this->length, - kUnknownNullCount, this->offset); + this->null_count, this->offset); for (int i = 0; i < this->num_buffers(); ++i) { if (this->buffers[i].owner) { diff --git a/cpp/src/arrow/builder.cc b/cpp/src/arrow/builder.cc index 7c027c9b1e8..779722e0d1c 100644 --- a/cpp/src/arrow/builder.cc +++ b/cpp/src/arrow/builder.cc @@ -170,7 +170,7 @@ struct DictionaryBuilderCase { out->reset(new internal::DictionaryBuilderBase( index_type, value_type, pool)); } else { - auto start_int_size = internal::GetByteWidth(*index_type); + auto start_int_size = index_type->byte_width(); out->reset(new AdaptiveBuilderType(start_int_size, value_type, pool)); } return Status::OK(); diff --git a/cpp/src/arrow/compare.cc b/cpp/src/arrow/compare.cc index ae1783a515e..8af319ed9ea 100644 --- a/cpp/src/arrow/compare.cc +++ b/cpp/src/arrow/compare.cc @@ -1022,10 +1022,10 @@ bool IntegerTensorEquals(const Tensor& left, const Tensor& right) { if (!(left_row_major_p && right_row_major_p) && !(left_column_major_p && right_column_major_p)) { const auto& type = checked_cast(*left.type()); - are_equal = StridedIntegerTensorContentEquals(0, 0, 0, internal::GetByteWidth(type), - left, right); + are_equal = + StridedIntegerTensorContentEquals(0, 0, 0, type.byte_width(), left, right); } else { - const int byte_width = internal::GetByteWidth(*left.type()); + const int byte_width = left.type()->byte_width(); DCHECK_GT(byte_width, 0); const uint8_t* left_data = left.data()->data(); @@ -1195,7 +1195,7 @@ struct SparseTensorEqualsImpl { return false; } - 
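A recurring substitution in this patch, visible in the compare.cc hunks here and again in tpch_node.cc below, is replacing the internal helper arrow::internal::GetByteWidth(*type) with the public DataType::byte_width() accessor. A minimal sketch of the two spellings, assuming a fixed-width type such as int32 (the function below is purely illustrative and not part of the patch):

#include <memory>
#include "arrow/api.h"

// Old helper vs. the accessor this patch switches to. Callers that require a
// positive width keep their DCHECK_GT(byte_width, 0), as in compare.cc.
void ByteWidthSketch() {
  std::shared_ptr<arrow::DataType> type = arrow::int32();
  // Before: int32_t byte_width = arrow::internal::GetByteWidth(*type);
  int32_t byte_width = type->byte_width();  // 4 for int32
  (void)byte_width;
}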
const int byte_width = internal::GetByteWidth(*left.type()); + const int byte_width = left.type()->byte_width(); DCHECK_GT(byte_width, 0); const uint8_t* left_data = left.data()->data(); diff --git a/cpp/src/arrow/compute/api_vector.cc b/cpp/src/arrow/compute/api_vector.cc index ad4248fc6c1..4ebdecf5e78 100644 --- a/cpp/src/arrow/compute/api_vector.cc +++ b/cpp/src/arrow/compute/api_vector.cc @@ -17,6 +17,7 @@ #include "arrow/compute/api_vector.h" +#include #include #include #include @@ -26,6 +27,7 @@ #include "arrow/array/builder_primitive.h" #include "arrow/compute/exec.h" #include "arrow/compute/function_internal.h" +#include "arrow/compute/kernels/vector_sort_internal.h" #include "arrow/compute/registry.h" #include "arrow/datum.h" #include "arrow/record_batch.h" @@ -305,10 +307,7 @@ Result> SortIndices(const ChunkedArray& chunked_array, Result> SortIndices(const ChunkedArray& chunked_array, SortOrder order, ExecContext* ctx) { - SortOptions options({SortKey("not-used", order)}); - ARROW_ASSIGN_OR_RAISE( - Datum result, CallFunction("sort_indices", {Datum(chunked_array)}, &options, ctx)); - return result.make_array(); + return SortIndices(chunked_array, ArraySortOptions(order), ctx); } Result> SortIndices(const Datum& datum, const SortOptions& options, diff --git a/cpp/src/arrow/compute/api_vector.h b/cpp/src/arrow/compute/api_vector.h index 1f4581566fb..88331b6e592 100644 --- a/cpp/src/arrow/compute/api_vector.h +++ b/cpp/src/arrow/compute/api_vector.h @@ -275,14 +275,14 @@ namespace internal { /// \brief Return the number of selected indices in the boolean filter ARROW_EXPORT -int64_t GetFilterOutputSize(const ArrayData& filter, +int64_t GetFilterOutputSize(const ArraySpan& filter, FilterOptions::NullSelectionBehavior null_selection); /// \brief Compute uint64 selection indices for use with Take given a boolean /// filter ARROW_EXPORT Result> GetTakeIndices( - const ArrayData& filter, FilterOptions::NullSelectionBehavior null_selection, + const ArraySpan& filter, FilterOptions::NullSelectionBehavior null_selection, MemoryPool* memory_pool = default_memory_pool()); } // namespace internal diff --git a/cpp/src/arrow/compute/exec.cc b/cpp/src/arrow/compute/exec.cc index da226a062d0..a612a83e7a8 100644 --- a/cpp/src/arrow/compute/exec.cc +++ b/cpp/src/arrow/compute/exec.cc @@ -332,10 +332,14 @@ Status ExecSpanIterator::Init(const ExecBatch& batch, ValueDescr::Shape output_s int64_t max_chunksize) { if (batch.num_values() > 0) { // Validate arguments - ARROW_ASSIGN_OR_RAISE(int64_t inferred_length, InferBatchLength(batch.values)); + bool all_args_same_length = false; + int64_t inferred_length = InferBatchLength(batch.values, &all_args_same_length); if (inferred_length != batch.length) { return Status::Invalid("Value lengths differed from ExecBatch length"); } + if (!all_args_same_length) { + return Status::Invalid("Array arguments must all be the same length"); + } } args_ = &batch.values; initialized_ = have_chunked_arrays_ = false; @@ -991,43 +995,62 @@ class ScalarExecutor : public KernelExecutorImpl { ExecSpanIterator span_iterator_; }; -Status PackBatchNoChunks(const std::vector& args, ExecBatch* out) { - int64_t length = 0; - for (const auto& arg : args) { - switch (arg.kind()) { - case Datum::SCALAR: - case Datum::ARRAY: - case Datum::CHUNKED_ARRAY: - length = std::max(arg.length(), length); - break; - default: - DCHECK(false); - break; - } - } - out->length = length; - out->values = args; - return Status::OK(); -} - class VectorExecutor : public KernelExecutorImpl { public: - 
Status ExecuteImpl(const std::vector& args, ExecListener* listener) { - RETURN_NOT_OK(PrepareExecute(args)); - ExecBatch batch; + Status Execute(const ExecBatch& batch, ExecListener* listener) override { + // TODO(wesm): remove in ARROW-16577 + if (output_descr_.shape == ValueDescr::SCALAR) { + return Status::Invalid("VectorExecutor only supports array output types"); + } + + // Some vector kernels have a separate code path for handling + // chunked arrays (VectorKernel::exec_chunked) so we check if we + // have any chunked arrays. If we do and an exec_chunked function + // is defined then we call that. + bool have_chunked_arrays = false; + for (const Datum& arg : batch.values) { + if (arg.is_chunked_array()) have_chunked_arrays = true; + } + + output_num_buffers_ = static_cast(output_descr_.type->layout().buffers.size()); + + // Decide if we need to preallocate memory for this kernel + validity_preallocated_ = + (kernel_->null_handling != NullHandling::COMPUTED_NO_PREALLOCATE && + kernel_->null_handling != NullHandling::OUTPUT_NOT_NULL); + if (kernel_->mem_allocation == MemAllocation::PREALLOCATE) { + ComputeDataPreallocate(*output_descr_.type, &data_preallocated_); + } + if (kernel_->can_execute_chunkwise) { - while (batch_iterator_->Next(&batch)) { - RETURN_NOT_OK(ExecuteBatch(batch, listener)); + RETURN_NOT_OK(span_iterator_.Init(batch, output_descr_.shape, + exec_context()->exec_chunksize())); + ExecSpan span; + while (span_iterator_.Next(&span)) { + RETURN_NOT_OK(Exec(span, listener)); } } else { - RETURN_NOT_OK(PackBatchNoChunks(args, &batch)); - RETURN_NOT_OK(ExecuteBatch(batch, listener)); + // Kernel cannot execute chunkwise. If we have any chunked + // arrays, then VectorKernel::exec_chunked must be defined + // otherwise we raise an error + if (have_chunked_arrays) { + RETURN_NOT_OK(ExecChunked(batch, listener)); + } else { + // No chunked arrays. 
We pack the args into an ExecSpan and + // call the regular exec code path + RETURN_NOT_OK(Exec(ExecSpan(batch), listener)); + } } - return Finalize(listener); - } - Status Execute(const ExecBatch& batch, ExecListener* listener) override { - return ExecuteImpl(batch.values, listener); + if (kernel_->finalize) { + // Intermediate results require post-processing after the execution is + // completed (possibly involving some accumulated state) + RETURN_NOT_OK(kernel_->finalize(kernel_ctx_, &results_)); + for (const auto& result : results_) { + RETURN_NOT_OK(listener->OnResult(result)); + } + } + return Status::OK(); } Datum WrapResults(const std::vector& inputs, @@ -1047,59 +1070,54 @@ class VectorExecutor : public KernelExecutorImpl { } protected: - Status ExecuteBatch(const ExecBatch& batch, ExecListener* listener) { - Datum out; - if (output_descr_.shape == ValueDescr::ARRAY) { - // We preallocate (maybe) only for the output of processing the current - // batch - ARROW_ASSIGN_OR_RAISE(out.value, PrepareOutput(batch.length)); - } + Status Exec(const ExecSpan& span, ExecListener* listener) { + ExecResult out; - if (kernel_->null_handling == NullHandling::INTERSECTION && - output_descr_.shape == ValueDescr::ARRAY) { - RETURN_NOT_OK(PropagateNulls(kernel_ctx_, ExecSpan(batch), out.mutable_array())); + // We preallocate (maybe) only for the output of processing the current + // batch, but create an output ArrayData instance regardless + ARROW_ASSIGN_OR_RAISE(out.value, PrepareOutput(span.length)); + + if (kernel_->null_handling == NullHandling::INTERSECTION) { + RETURN_NOT_OK(PropagateNulls(kernel_ctx_, span, out.array_data().get())); } - RETURN_NOT_OK(kernel_->exec(kernel_ctx_, batch, &out)); + RETURN_NOT_OK(kernel_->exec(kernel_ctx_, span, &out)); if (!kernel_->finalize) { // If there is no result finalizer (e.g. 
for hash-based functions, we can // emit the processed batch right away rather than waiting - RETURN_NOT_OK(listener->OnResult(std::move(out))); + RETURN_NOT_OK(listener->OnResult(out.array_data())); } else { - results_.emplace_back(std::move(out)); + results_.emplace_back(out.array_data()); } return Status::OK(); } - Status Finalize(ExecListener* listener) { - if (kernel_->finalize) { - // Intermediate results require post-processing after the execution is - // completed (possibly involving some accumulated state) - RETURN_NOT_OK(kernel_->finalize(kernel_ctx_, &results_)); - for (const auto& result : results_) { - RETURN_NOT_OK(listener->OnResult(result)); - } + Status ExecChunked(const ExecBatch& batch, ExecListener* listener) { + if (kernel_->exec_chunked == nullptr) { + return Status::Invalid( + "Vector kernel cannot execute chunkwise and no " + "chunked exec function was defined"); } - return Status::OK(); - } - Status PrepareExecute(const std::vector& args) { - if (kernel_->can_execute_chunkwise) { - ARROW_ASSIGN_OR_RAISE(batch_iterator_, ExecBatchIterator::Make( - args, exec_context()->exec_chunksize())); + if (kernel_->null_handling == NullHandling::INTERSECTION) { + return Status::Invalid( + "Null pre-propagation is unsupported for ChunkedArray " + "execution in vector kernels"); } - output_num_buffers_ = static_cast(output_descr_.type->layout().buffers.size()); - // Decide if we need to preallocate memory for this kernel - validity_preallocated_ = - (kernel_->null_handling != NullHandling::COMPUTED_NO_PREALLOCATE && - kernel_->null_handling != NullHandling::OUTPUT_NOT_NULL); - if (kernel_->mem_allocation == MemAllocation::PREALLOCATE) { - ComputeDataPreallocate(*output_descr_.type, &data_preallocated_); + Datum out; + ARROW_ASSIGN_OR_RAISE(out.value, PrepareOutput(batch.length)); + RETURN_NOT_OK(kernel_->exec_chunked(kernel_ctx_, batch, &out)); + if (!kernel_->finalize) { + // If there is no result finalizer (e.g. 
for hash-based functions, we can + // emit the processed batch right away rather than waiting + RETURN_NOT_OK(listener->OnResult(std::move(out))); + } else { + results_.emplace_back(std::move(out)); } return Status::OK(); } - std::unique_ptr batch_iterator_; + ExecSpanIterator span_iterator_; std::vector results_; }; @@ -1270,7 +1288,7 @@ std::unique_ptr KernelExecutor::MakeScalarAggregate() { return ::arrow::internal::make_unique(); } -Result InferBatchLength(const std::vector& values) { +int64_t InferBatchLength(const std::vector& values, bool* all_same) { int64_t length = -1; bool are_all_scalar = true; for (const Datum& arg : values) { @@ -1280,7 +1298,8 @@ Result InferBatchLength(const std::vector& values) { length = arg_length; } else { if (length != arg_length) { - return Status::Invalid("Array arguments must all be the same length"); + *all_same = false; + return length; } } are_all_scalar = false; @@ -1290,7 +1309,8 @@ Result InferBatchLength(const std::vector& values) { length = arg_length; } else { if (length != arg_length) { - return Status::Invalid("Array arguments must all be the same length"); + *all_same = false; + return length; } } are_all_scalar = false; @@ -1302,6 +1322,7 @@ Result InferBatchLength(const std::vector& values) { } else if (length < 0) { length = 0; } + *all_same = true; return length; } diff --git a/cpp/src/arrow/compute/exec.h b/cpp/src/arrow/compute/exec.h index ba41bfb5b6e..8fd938ce299 100644 --- a/cpp/src/arrow/compute/exec.h +++ b/cpp/src/arrow/compute/exec.h @@ -20,7 +20,6 @@ #pragma once -#include #include #include #include @@ -397,8 +396,12 @@ struct ARROW_EXPORT ExecSpan { } bool is_all_scalar() const { - return std::all_of(this->values.begin(), this->values.end(), - [](const ExecValue& v) { return v.is_scalar(); }); + for (const ExecValue& value : this->values) { + if (value.is_array()) { + return false; + } + } + return true; } /// \brief Return the value at the i-th index diff --git a/cpp/src/arrow/compute/exec/expression.cc b/cpp/src/arrow/compute/exec/expression.cc index 2e0fe6ff34b..b796f5cda3b 100644 --- a/cpp/src/arrow/compute/exec/expression.cc +++ b/cpp/src/arrow/compute/exec/expression.cc @@ -419,7 +419,7 @@ Result BindNonRecursive(Expression::Call call, bool insert_implicit_ } } - compute::KernelContext kernel_context(exec_context); + compute::KernelContext kernel_context(exec_context, call.kernel); if (call.kernel->init) { const FunctionOptions* options = call.options ? 
call.options.get() : call.function->default_options(); @@ -593,7 +593,7 @@ Result ExecuteScalarExpression(const Expression& expr, const ExecBatch& i auto executor = compute::detail::KernelExecutor::MakeScalar(); - compute::KernelContext kernel_context(exec_context); + compute::KernelContext kernel_context(exec_context, call->kernel); kernel_context.SetState(call->kernel_state.get()); auto kernel = call->kernel; diff --git a/cpp/src/arrow/compute/exec/hash_join.cc b/cpp/src/arrow/compute/exec/hash_join.cc index 1ebe11e7046..a145863e597 100644 --- a/cpp/src/arrow/compute/exec/hash_join.cc +++ b/cpp/src/arrow/compute/exec/hash_join.cc @@ -127,7 +127,7 @@ class HashJoinBasicImpl : public HashJoinImpl { *opt_projected_batch = projected; } - return encoder->EncodeAndAppend(projected); + return encoder->EncodeAndAppend(ExecSpan(projected)); } void ProbeBatch_Lookup(ThreadLocalState* local_state, const RowEncoder& exec_batch_keys, diff --git a/cpp/src/arrow/compute/exec/hash_join_dict.cc b/cpp/src/arrow/compute/exec/hash_join_dict.cc index 63d7d1143c9..731a5662d7d 100644 --- a/cpp/src/arrow/compute/exec/hash_join_dict.cc +++ b/cpp/src/arrow/compute/exec/hash_join_dict.cc @@ -234,8 +234,7 @@ Status HashJoinDictBuild::Init(ExecContext* ctx, std::shared_ptr dictiona return Status::Invalid( "Dictionary length in hash join must fit into signed 32-bit integer."); } - ExecBatch batch({dictionary->data()}, length); - RETURN_NOT_OK(encoder.EncodeAndAppend(batch)); + RETURN_NOT_OK(encoder.EncodeAndAppend(ExecSpan({*dictionary->data()}, length))); std::vector entries_to_take; @@ -296,7 +295,7 @@ Result> HashJoinDictBuild::RemapInputValues( bool is_scalar = values.is_scalar(); int64_t encoded_length = is_scalar ? 1 : batch_length; ExecBatch batch({values}, encoded_length); - RETURN_NOT_OK(encoder.EncodeAndAppend(batch)); + RETURN_NOT_OK(encoder.EncodeAndAppend(ExecSpan(batch))); // Allocate output buffers // @@ -426,8 +425,8 @@ Result> HashJoinDictProbe::RemapInput( std::vector encoder_types; encoder_types.emplace_back(dict_type.value_type(), ValueDescr::ARRAY); encoder_.Init(encoder_types, ctx); - ExecBatch batch({dict->data()}, dict->length()); - RETURN_NOT_OK(encoder_.EncodeAndAppend(batch)); + RETURN_NOT_OK( + encoder_.EncodeAndAppend(ExecSpan({*dict->data()}, dict->length()))); } } @@ -547,7 +546,7 @@ Status HashJoinDictBuildMulti::EncodeBatch( proj_map.data_type(HashJoinProjection::KEY, icol))); } } - return encoder->EncodeAndAppend(projected); + return encoder->EncodeAndAppend(ExecSpan(projected)); } Status HashJoinDictBuildMulti::PostDecode( @@ -656,7 +655,7 @@ Status HashJoinDictProbeMulti::EncodeBatch( } local_state.post_remap_encoder.Clear(); - RETURN_NOT_OK(local_state.post_remap_encoder.EncodeAndAppend(projected)); + RETURN_NOT_OK(local_state.post_remap_encoder.EncodeAndAppend(ExecSpan(projected))); *out_encoder = &local_state.post_remap_encoder; return Status::OK(); diff --git a/cpp/src/arrow/compute/exec/hash_join_node_test.cc b/cpp/src/arrow/compute/exec/hash_join_node_test.cc index e752870486a..46600a96da3 100644 --- a/cpp/src/arrow/compute/exec/hash_join_node_test.cc +++ b/cpp/src/arrow/compute/exec/hash_join_node_test.cc @@ -496,7 +496,7 @@ std::vector> GenRandomUniqueRecords( for (size_t i = 0; i < result.size(); ++i) { batch.values[i] = result[i]; } - Status status = encoder.EncodeAndAppend(batch); + Status status = encoder.EncodeAndAppend(ExecSpan(batch)); ARROW_DCHECK(status.ok()); std::unordered_map uniques; diff --git a/cpp/src/arrow/compute/exec/tpch_node.cc 
b/cpp/src/arrow/compute/exec/tpch_node.cc index c447de6cff7..d8f2c60312d 100644 --- a/cpp/src/arrow/compute/exec/tpch_node.cc +++ b/cpp/src/arrow/compute/exec/tpch_node.cc @@ -844,7 +844,7 @@ class PartAndPartSupplierGenerator { Status AllocatePartBatch(size_t thread_index, int column) { ThreadLocalData& tld = thread_local_data_[thread_index]; ARROW_DCHECK(tld.part[column].kind() == Datum::NONE); - int32_t byte_width = arrow::internal::GetByteWidth(*kPartTypes[column]); + int32_t byte_width = kPartTypes[column]->byte_width(); ARROW_ASSIGN_OR_RAISE(std::unique_ptr buff, AllocateBuffer(tld.part_to_generate * byte_width)); ArrayData ad(kPartTypes[column], tld.part_to_generate, {nullptr, std::move(buff)}); @@ -917,7 +917,7 @@ class PartAndPartSupplierGenerator { RETURN_NOT_OK(AllocatePartBatch(thread_index, PART::P_MFGR)); char* p_mfgr = reinterpret_cast( tld.part[PART::P_MFGR].array()->buffers[1]->mutable_data()); - int32_t byte_width = arrow::internal::GetByteWidth(*kPartTypes[PART::P_MFGR]); + int32_t byte_width = kPartTypes[PART::P_MFGR]->byte_width(); for (int64_t irow = 0; irow < tld.part_to_generate; irow++) { std::strncpy(p_mfgr + byte_width * irow, manufacturer, byte_width); char mfgr_id = '0' + dist(tld.rng); @@ -939,8 +939,8 @@ class PartAndPartSupplierGenerator { tld.part[PART::P_MFGR].array()->buffers[1]->data()); char* p_brand = reinterpret_cast( tld.part[PART::P_BRAND].array()->buffers[1]->mutable_data()); - int32_t byte_width = arrow::internal::GetByteWidth(*kPartTypes[PART::P_BRAND]); - int32_t mfgr_byte_width = arrow::internal::GetByteWidth(*kPartTypes[PART::P_MFGR]); + int32_t byte_width = kPartTypes[PART::P_BRAND]->byte_width(); + int32_t mfgr_byte_width = kPartTypes[PART::P_MFGR]->byte_width(); const size_t mfgr_id_offset = std::strlen("Manufacturer#"); for (int64_t irow = 0; irow < tld.part_to_generate; irow++) { char* row = p_brand + byte_width * irow; @@ -1023,7 +1023,7 @@ class PartAndPartSupplierGenerator { RETURN_NOT_OK(AllocatePartBatch(thread_index, PART::P_CONTAINER)); char* p_container = reinterpret_cast( tld.part[PART::P_CONTAINER].array()->buffers[1]->mutable_data()); - int32_t byte_width = arrow::internal::GetByteWidth(*kPartTypes[PART::P_CONTAINER]); + int32_t byte_width = kPartTypes[PART::P_CONTAINER]->byte_width(); for (int64_t irow = 0; irow < tld.part_to_generate; irow++) { int container1_idx = dist1(tld.rng); int container2_idx = dist2(tld.rng); @@ -1090,7 +1090,7 @@ class PartAndPartSupplierGenerator { Status AllocatePartSuppBatch(size_t thread_index, size_t ibatch, int column) { ThreadLocalData& tld = thread_local_data_[thread_index]; - int32_t byte_width = arrow::internal::GetByteWidth(*kPartsuppTypes[column]); + int32_t byte_width = kPartsuppTypes[column]->byte_width(); ARROW_ASSIGN_OR_RAISE(std::unique_ptr buff, AllocateResizableBuffer(batch_size_ * byte_width)); ArrayData ad(kPartsuppTypes[column], batch_size_, {nullptr, std::move(buff)}); @@ -1101,7 +1101,7 @@ class PartAndPartSupplierGenerator { Status SetPartSuppColumnSize(size_t thread_index, size_t ibatch, int column, size_t new_size) { ThreadLocalData& tld = thread_local_data_[thread_index]; - int32_t byte_width = arrow::internal::GetByteWidth(*kPartsuppTypes[column]); + int32_t byte_width = kPartsuppTypes[column]->byte_width(); tld.partsupp[ibatch][column].array()->length = static_cast(new_size); ResizableBuffer* buff = checked_cast( tld.partsupp[ibatch][column].array()->buffers[1].get()); @@ -1554,7 +1554,7 @@ class OrdersAndLineItemGenerator { Status AllocateOrdersBatch(size_t 
thread_index, int column) { ThreadLocalData& tld = thread_local_data_[thread_index]; ARROW_DCHECK(tld.orders[column].kind() == Datum::NONE); - int32_t byte_width = arrow::internal::GetByteWidth(*kOrdersTypes[column]); + int32_t byte_width = kOrdersTypes[column]->byte_width(); ARROW_ASSIGN_OR_RAISE(std::unique_ptr buff, AllocateBuffer(tld.orders_to_generate * byte_width)); ArrayData ad(kOrdersTypes[column], tld.orders_to_generate, @@ -1711,8 +1711,7 @@ class OrdersAndLineItemGenerator { ThreadLocalData& tld = thread_local_data_[thread_index]; if (tld.orders[ORDERS::O_ORDERPRIORITY].kind() == Datum::NONE) { RETURN_NOT_OK(AllocateOrdersBatch(thread_index, ORDERS::O_ORDERPRIORITY)); - int32_t byte_width = - arrow::internal::GetByteWidth(*kOrdersTypes[ORDERS::O_ORDERPRIORITY]); + int32_t byte_width = kOrdersTypes[ORDERS::O_ORDERPRIORITY]->byte_width(); std::uniform_int_distribution dist(0, kNumPriorities - 1); char* o_orderpriority = reinterpret_cast( tld.orders[ORDERS::O_ORDERPRIORITY].array()->buffers[1]->mutable_data()); @@ -1728,7 +1727,7 @@ class OrdersAndLineItemGenerator { ThreadLocalData& tld = thread_local_data_[thread_index]; if (tld.orders[ORDERS::O_CLERK].kind() == Datum::NONE) { RETURN_NOT_OK(AllocateOrdersBatch(thread_index, ORDERS::O_CLERK)); - int32_t byte_width = arrow::internal::GetByteWidth(*kOrdersTypes[ORDERS::O_CLERK]); + int32_t byte_width = kOrdersTypes[ORDERS::O_CLERK]->byte_width(); int64_t max_clerk_id = static_cast(scale_factor_ * 1000); std::uniform_int_distribution dist(1, max_clerk_id); char* o_clerk = reinterpret_cast( @@ -1792,7 +1791,7 @@ class OrdersAndLineItemGenerator { ThreadLocalData& tld = thread_local_data_[thread_index]; if (tld.lineitem[ibatch][column].kind() == Datum::NONE) { ARROW_DCHECK(ibatch != 0 || tld.first_batch_offset == 0); - int32_t byte_width = arrow::internal::GetByteWidth(*kLineitemTypes[column]); + int32_t byte_width = kLineitemTypes[column]->byte_width(); ARROW_ASSIGN_OR_RAISE(std::unique_ptr buff, AllocateResizableBuffer(batch_size_ * byte_width)); ArrayData ad(kLineitemTypes[column], batch_size_, {nullptr, std::move(buff)}); @@ -1807,7 +1806,7 @@ class OrdersAndLineItemGenerator { Status SetLineItemColumnSize(size_t thread_index, size_t ibatch, int column, size_t new_size) { ThreadLocalData& tld = thread_local_data_[thread_index]; - int32_t byte_width = arrow::internal::GetByteWidth(*kLineitemTypes[column]); + int32_t byte_width = kLineitemTypes[column]->byte_width(); tld.lineitem[ibatch][column].array()->length = static_cast(new_size); ResizableBuffer* buff = checked_cast( tld.lineitem[ibatch][column].array()->buffers[1].get()); @@ -2283,8 +2282,7 @@ class OrdersAndLineItemGenerator { ThreadLocalData& tld = thread_local_data_[thread_index]; if (!tld.generated_lineitem[LINEITEM::L_SHIPINSTRUCT]) { tld.generated_lineitem[LINEITEM::L_SHIPINSTRUCT] = true; - int32_t byte_width = - arrow::internal::GetByteWidth(*kLineitemTypes[LINEITEM::L_SHIPINSTRUCT]); + int32_t byte_width = kLineitemTypes[LINEITEM::L_SHIPINSTRUCT]->byte_width(); size_t ibatch = 0; std::uniform_int_distribution dist(0, kNumInstructions - 1); for (int64_t irow = 0; irow < tld.lineitem_to_generate; ibatch++) { @@ -2318,8 +2316,7 @@ class OrdersAndLineItemGenerator { ThreadLocalData& tld = thread_local_data_[thread_index]; if (!tld.generated_lineitem[LINEITEM::L_SHIPMODE]) { tld.generated_lineitem[LINEITEM::L_SHIPMODE] = true; - int32_t byte_width = - arrow::internal::GetByteWidth(*kLineitemTypes[LINEITEM::L_SHIPMODE]); + int32_t byte_width = 
kLineitemTypes[LINEITEM::L_SHIPMODE]->byte_width(); size_t ibatch = 0; std::uniform_int_distribution dist(0, kNumModes - 1); for (int64_t irow = 0; irow < tld.lineitem_to_generate; ibatch++) { @@ -2530,7 +2527,7 @@ class SupplierGenerator : public TpchTableGenerator { Status AllocateColumn(size_t thread_index, int column) { ThreadLocalData& tld = thread_local_data_[thread_index]; ARROW_DCHECK(tld.batch[column].kind() == Datum::NONE); - int32_t byte_width = arrow::internal::GetByteWidth(*kTypes[column]); + int32_t byte_width = kTypes[column]->byte_width(); ARROW_ASSIGN_OR_RAISE(std::unique_ptr buff, AllocateBuffer(tld.to_generate * byte_width)); ArrayData ad(kTypes[column], tld.to_generate, {nullptr, std::move(buff)}); @@ -2558,7 +2555,7 @@ class SupplierGenerator : public TpchTableGenerator { const int32_t* s_suppkey = reinterpret_cast( tld.batch[SUPPLIER::S_SUPPKEY].array()->buffers[1]->data()); RETURN_NOT_OK(AllocateColumn(thread_index, SUPPLIER::S_NAME)); - int32_t byte_width = arrow::internal::GetByteWidth(*kTypes[SUPPLIER::S_NAME]); + int32_t byte_width = kTypes[SUPPLIER::S_NAME]->byte_width(); char* s_name = reinterpret_cast( tld.batch[SUPPLIER::S_NAME].array()->buffers[1]->mutable_data()); // Look man, I'm just following the spec ok? Section 4.2.3 as of March 1 2022 @@ -2600,7 +2597,7 @@ class SupplierGenerator : public TpchTableGenerator { if (tld.batch[SUPPLIER::S_PHONE].kind() == Datum::NONE) { RETURN_NOT_OK(S_NATIONKEY(thread_index)); RETURN_NOT_OK(AllocateColumn(thread_index, SUPPLIER::S_PHONE)); - int32_t byte_width = arrow::internal::GetByteWidth(*kTypes[SUPPLIER::S_PHONE]); + int32_t byte_width = kTypes[SUPPLIER::S_PHONE]->byte_width(); const int32_t* s_nationkey = reinterpret_cast( tld.batch[SUPPLIER::S_NATIONKEY].array()->buffers[1]->data()); char* s_phone = reinterpret_cast( @@ -2913,7 +2910,7 @@ class CustomerGenerator : public TpchTableGenerator { Status AllocateColumn(size_t thread_index, int column) { ThreadLocalData& tld = thread_local_data_[thread_index]; ARROW_DCHECK(tld.batch[column].kind() == Datum::NONE); - int32_t byte_width = arrow::internal::GetByteWidth(*kTypes[column]); + int32_t byte_width = kTypes[column]->byte_width(); ARROW_ASSIGN_OR_RAISE(std::unique_ptr buff, AllocateBuffer(tld.to_generate * byte_width)); ArrayData ad(kTypes[column], tld.to_generate, {nullptr, std::move(buff)}); @@ -2994,7 +2991,7 @@ class CustomerGenerator : public TpchTableGenerator { if (tld.batch[CUSTOMER::C_PHONE].kind() == Datum::NONE) { RETURN_NOT_OK(C_NATIONKEY(thread_index)); RETURN_NOT_OK(AllocateColumn(thread_index, CUSTOMER::C_PHONE)); - int32_t byte_width = arrow::internal::GetByteWidth(*kTypes[CUSTOMER::C_PHONE]); + int32_t byte_width = kTypes[CUSTOMER::C_PHONE]->byte_width(); const int32_t* c_nationkey = reinterpret_cast( tld.batch[CUSTOMER::C_NATIONKEY].array()->buffers[1]->data()); char* c_phone = reinterpret_cast( @@ -3023,7 +3020,7 @@ class CustomerGenerator : public TpchTableGenerator { ThreadLocalData& tld = thread_local_data_[thread_index]; if (tld.batch[CUSTOMER::C_MKTSEGMENT].kind() == Datum::NONE) { RETURN_NOT_OK(AllocateColumn(thread_index, CUSTOMER::C_MKTSEGMENT)); - int32_t byte_width = arrow::internal::GetByteWidth(*kTypes[CUSTOMER::C_MKTSEGMENT]); + int32_t byte_width = kTypes[CUSTOMER::C_MKTSEGMENT]->byte_width(); char* c_mktsegment = reinterpret_cast( tld.batch[CUSTOMER::C_MKTSEGMENT].array()->buffers[1]->mutable_data()); std::uniform_int_distribution dist(0, kNumSegments - 1); diff --git a/cpp/src/arrow/compute/exec_internal.h 
b/cpp/src/arrow/compute/exec_internal.h index fac78da6db1..c475a61c1ba 100644 --- a/cpp/src/arrow/compute/exec_internal.h +++ b/cpp/src/arrow/compute/exec_internal.h @@ -179,7 +179,7 @@ class ARROW_EXPORT KernelExecutor { static std::unique_ptr MakeScalarAggregate(); }; -Result InferBatchLength(const std::vector& values); +int64_t InferBatchLength(const std::vector& values, bool* all_same); /// \brief Populate validity bitmap with the intersection of the nullity of the /// arguments. If a preallocated bitmap is not provided, then one will be diff --git a/cpp/src/arrow/compute/function.cc b/cpp/src/arrow/compute/function.cc index c6b0992a458..b5ebc67d180 100644 --- a/cpp/src/arrow/compute/function.cc +++ b/cpp/src/arrow/compute/function.cc @@ -252,11 +252,11 @@ Result Function::ExecuteInternal(const std::vector& args, return Status::NotImplemented("Direct execution of HASH_AGGREGATE functions"); } - ARROW_ASSIGN_OR_RAISE(auto kernel, DispatchBest(&inputs)); + ARROW_ASSIGN_OR_RAISE(const Kernel* kernel, DispatchBest(&inputs)); ARROW_ASSIGN_OR_RAISE(std::vector args_with_casts, Cast(args, inputs, ctx)); std::unique_ptr state; - KernelContext kernel_ctx{ctx}; + KernelContext kernel_ctx{ctx, kernel}; if (kernel->init) { ARROW_ASSIGN_OR_RAISE(state, kernel->init(&kernel_ctx, {kernel, inputs, options})); kernel_ctx.SetState(state.get()); @@ -266,21 +266,23 @@ Result Function::ExecuteInternal(const std::vector& args, detail::DatumAccumulator listener; - // Set length to 0 unless it's a scalar function (vector functions don't use - // it). - ExecBatch input(std::move(args_with_casts), 0); - if (kind() == Function::SCALAR) { - ARROW_ASSIGN_OR_RAISE(int64_t inferred_length, - detail::InferBatchLength(input.values)); - if (passed_length == -1) { - input.length = inferred_length; - } else { - // ARROW-16819: will clean up more later - if (input.num_values() > 0 && passed_length != inferred_length) { - return Status::Invalid("Passed batch length did not equal actual array lengths"); - } + ExecBatch input(std::move(args_with_casts), /*length=*/0); + if (input.num_values() == 0) { + if (passed_length != -1) { input.length = passed_length; } + } else { + bool all_same_length = false; + int64_t inferred_length = detail::InferBatchLength(input.values, &all_same_length); + input.length = inferred_length; + if (kind() == Function::SCALAR) { + DCHECK(passed_length == -1 || passed_length == inferred_length); + } else if (kind() == Function::VECTOR) { + auto vkernel = static_cast(kernel); + if (!(all_same_length || !vkernel->can_execute_chunkwise)) { + return Status::Invalid("Vector kernel arguments must all be the same length"); + } + } } RETURN_NOT_OK(executor->Execute(input, &listener)); const auto out = executor->WrapResults(input.values, listener.values()); @@ -366,7 +368,7 @@ Status ScalarFunction::AddKernel(ScalarKernel kernel) { } Status VectorFunction::AddKernel(std::vector in_types, OutputType out_type, - ArrayKernelExecOld exec, KernelInit init) { + ArrayKernelExec exec, KernelInit init) { RETURN_NOT_OK(CheckArity(in_types)); if (arity_.is_varargs && in_types.size() != 1) { diff --git a/cpp/src/arrow/compute/function.h b/cpp/src/arrow/compute/function.h index 6e3c8374335..c32c8766a91 100644 --- a/cpp/src/arrow/compute/function.h +++ b/cpp/src/arrow/compute/function.h @@ -344,7 +344,7 @@ class ARROW_EXPORT VectorFunction : public detail::FunctionImpl { /// state initialization, no data preallocation, and no preallocation of the /// validity bitmap. 
Status AddKernel(std::vector in_types, OutputType out_type, - ArrayKernelExecOld exec, KernelInit init = NULLPTR); + ArrayKernelExec exec, KernelInit init = NULLPTR); /// \brief Add a kernel (function implementation). Returns error if the /// kernel's signature does not match the function's arity. diff --git a/cpp/src/arrow/compute/function_test.cc b/cpp/src/arrow/compute/function_test.cc index ec5f3bc170c..f06f225f5b9 100644 --- a/cpp/src/arrow/compute/function_test.cc +++ b/cpp/src/arrow/compute/function_test.cc @@ -214,10 +214,6 @@ auto ExecNYI = [](KernelContext* ctx, const ExecSpan& args, ExecResult* out) { return Status::NotImplemented("NYI"); }; -auto ExecNYIOld = [](KernelContext* ctx, const ExecBatch& args, Datum* out) { - return Status::NotImplemented("NYI"); -}; - template void CheckAddDispatch(FunctionType* func, ExecType exec) { using KernelType = typename FunctionType::KernelType; @@ -272,7 +268,7 @@ TEST(ScalarVectorFunction, DispatchExact) { CheckAddDispatch(&func1, ExecNYI); // ARROW-16576: will migrate later to new span-based kernel exec API - CheckAddDispatch(&func2, ExecNYIOld); + CheckAddDispatch(&func2, ExecNYI); } TEST(ArrayFunction, VarArgs) { diff --git a/cpp/src/arrow/compute/kernel.h b/cpp/src/arrow/compute/kernel.h index e115c5194bc..93a1c605a99 100644 --- a/cpp/src/arrow/compute/kernel.h +++ b/cpp/src/arrow/compute/kernel.h @@ -52,7 +52,10 @@ struct ARROW_EXPORT KernelState { /// \brief Context/state for the execution of a particular kernel. class ARROW_EXPORT KernelContext { public: - explicit KernelContext(ExecContext* exec_ctx) : exec_ctx_(exec_ctx) {} + // Can pass optional backreference; not used consistently for the + // moment but will be made so in the future + explicit KernelContext(ExecContext* exec_ctx, const Kernel* kernel = NULLPTR) + : exec_ctx_(exec_ctx), kernel_(kernel) {} /// \brief Allocate buffer from the context's memory pool. The contents are /// not initialized. @@ -68,6 +71,10 @@ class ARROW_EXPORT KernelContext { /// be minded separately. void SetState(KernelState* state) { state_ = state; } + // Set kernel that is being invoked since some kernel + // implementations will examine the kernel state. + void SetKernel(const Kernel* kernel) { kernel_ = kernel; } + KernelState* state() { return state_; } /// \brief Configuration related to function execution that is to be shared @@ -78,9 +85,12 @@ class ARROW_EXPORT KernelContext { /// MemoryPool contained in the ExecContext used to create the KernelContext. MemoryPool* memory_pool() { return exec_ctx_->memory_pool(); } + const Kernel* kernel() const { return kernel_; } + private: ExecContext* exec_ctx_; KernelState* state_ = NULLPTR; + const Kernel* kernel_ = NULLPTR; }; /// \brief An type-checking interface to permit customizable validation rules @@ -548,10 +558,6 @@ struct Kernel { using ArrayKernelExec = std::function; -/// \brief Kernel execution API being phased out per ARROW-16756 -using ArrayKernelExecOld = - std::function; - /// \brief Kernel data structure for implementations of ScalarFunction. In /// addition to the members found in Kernel, contains the null handling /// and memory pre-allocation preferences. 
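The kernel.h changes above give KernelContext an optional backreference to the Kernel being executed, and expression.cc/function.cc now pass it at construction time. A hedged sketch of how a kernel implementation might consult that backreference (MyKernelExec is an illustrative name, not code from this patch):

#include "arrow/compute/exec.h"
#include "arrow/compute/kernel.h"

namespace cp = arrow::compute;

// Sketch: a span-based exec function that inspects the kernel it was
// dispatched from. The backreference is optional, so guard against NULLPTR.
arrow::Status MyKernelExec(cp::KernelContext* ctx, const cp::ExecSpan& batch,
                           cp::ExecResult* out) {
  if (const cp::Kernel* kernel = ctx->kernel()) {
    // Kernel-specific state (e.g. the ScalarKernel::data member added below)
    // could be consulted here via a checked cast; omitted in this sketch.
    (void)kernel;
  }
  return arrow::Status::OK();
}

// Construction side, mirroring function.cc and expression.cc:
//   cp::KernelContext kernel_ctx{exec_ctx, kernel};
//   kernel_ctx.SetState(state.get());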
@@ -584,6 +590,9 @@ struct ScalarKernel : public Kernel { // bitmaps is a reasonable default NullHandling::type null_handling = NullHandling::INTERSECTION; MemAllocation::type mem_allocation = MemAllocation::PREALLOCATE; + + // Additional kernel-specific data + std::shared_ptr data; }; // ---------------------------------------------------------------------- @@ -597,16 +606,19 @@ struct VectorKernel : public Kernel { /// \brief See VectorKernel::finalize member for usage using FinalizeFunc = std::function*)>; + /// \brief Function for executing a stateful VectorKernel against a + /// ChunkedArray input. Does not need to be defined for all VectorKernels + typedef Status (*ChunkedExec)(KernelContext*, const ExecBatch&, Datum* out); + VectorKernel() = default; - VectorKernel(std::vector in_types, OutputType out_type, - ArrayKernelExecOld exec, KernelInit init = NULLPTR, - FinalizeFunc finalize = NULLPTR) + VectorKernel(std::vector in_types, OutputType out_type, ArrayKernelExec exec, + KernelInit init = NULLPTR, FinalizeFunc finalize = NULLPTR) : Kernel(std::move(in_types), std::move(out_type), std::move(init)), exec(std::move(exec)), finalize(std::move(finalize)) {} - VectorKernel(std::shared_ptr sig, ArrayKernelExecOld exec, + VectorKernel(std::shared_ptr sig, ArrayKernelExec exec, KernelInit init = NULLPTR, FinalizeFunc finalize = NULLPTR) : Kernel(std::move(sig), std::move(init)), exec(std::move(exec)), @@ -614,7 +626,10 @@ struct VectorKernel : public Kernel { /// \brief Perform a single invocation of this kernel. Any required state is /// managed through the KernelContext. - ArrayKernelExecOld exec; + ArrayKernelExec exec; + + /// \brief Execute the kernel on a ChunkedArray. Does not need to be defined + ChunkedExec exec_chunked = NULLPTR; /// \brief For VectorKernel, convert intermediate results into finalized /// results. Mutates input argument. Some kernels may accumulate state @@ -637,7 +652,7 @@ struct VectorKernel : public Kernel { /// functionality. bool can_write_into_slices = true; - /// Some vector kernels can do chunkwise execution using ExecBatchIterator, + /// Some vector kernels can do chunkwise execution using ExecSpanIterator, /// in some cases accumulating some state. 
Other kernels (like Take) need to /// be passed whole arrays and don't work on ChunkedArray inputs bool can_execute_chunkwise = true; diff --git a/cpp/src/arrow/compute/kernels/aggregate_mode.cc b/cpp/src/arrow/compute/kernels/aggregate_mode.cc index f8c56b2a220..6676b86436a 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_mode.cc +++ b/cpp/src/arrow/compute/kernels/aggregate_mode.cc @@ -42,9 +42,9 @@ constexpr uint64_t kCountEOF = ~0ULL; template ::CType> Result> PrepareOutput(int64_t n, KernelContext* ctx, - Datum* out) { - DCHECK_EQ(Type::STRUCT, out->type()->id()); - const auto& out_type = checked_cast(*out->type()); + const DataType& type, ExecResult* out) { + DCHECK_EQ(Type::STRUCT, type.id()); + const auto& out_type = checked_cast(type); DCHECK_EQ(2, out_type.num_fields()); const auto& mode_type = out_type.field(0)->type(); const auto& count_type = int64(); @@ -64,14 +64,15 @@ Result> PrepareOutput(int64_t n, KernelContext* ctx, count_buffer = count_data->template GetMutableValues(1); } - *out = Datum(ArrayData::Make(out->type(), n, {nullptr}, {mode_data, count_data}, 0)); + out->value = ArrayData::Make(type.Copy(), n, {nullptr}, {mode_data, count_data}, 0); return std::make_pair(mode_buffer, count_buffer); } // find top-n value:count pairs with minimal heap // suboptimal for tiny or large n, possibly okay as we're not in hot path template -Status Finalize(KernelContext* ctx, Datum* out, Generator&& gen) { +Status Finalize(KernelContext* ctx, const DataType& type, ExecResult* out, + Generator&& gen) { using CType = typename TypeTraits::CType; using ValueCountPair = std::pair; @@ -101,7 +102,7 @@ Status Finalize(KernelContext* ctx, Datum* out, Generator&& gen) { CType* mode_buffer; int64_t* count_buffer; ARROW_ASSIGN_OR_RAISE(std::tie(mode_buffer, count_buffer), - PrepareOutput(n, ctx, out)); + PrepareOutput(n, ctx, type, out)); for (int64_t i = n - 1; i >= 0; --i) { std::tie(mode_buffer[i], count_buffer[i]) = min_heap.top(); @@ -127,18 +128,7 @@ struct CountModer { this->counts.resize(value_range, 0); } - Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) { - // count values in all chunks, ignore nulls - const Datum& datum = batch[0]; - - const ModeOptions& options = ModeState::Get(ctx); - if ((!options.skip_nulls && datum.null_count() > 0) || - (datum.length() - datum.null_count() < options.min_count)) { - return PrepareOutput(/*n=*/0, ctx, out).status(); - } - - CountValues(this->counts.data(), datum, this->min); - + Status GetResult(KernelContext* ctx, const DataType& type, ExecResult* out) { // generator to emit next value:count pair int index = 0; auto gen = [&]() { @@ -153,41 +143,67 @@ struct CountModer { return std::pair(0, kCountEOF); }; - return Finalize(ctx, out, std::move(gen)); + return Finalize(ctx, type, out, std::move(gen)); + } + + Status Exec(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) { + // count values in all chunks, ignore nulls + const ArraySpan& values = batch[0].array; + const ModeOptions& options = ModeState::Get(ctx); + if ((!options.skip_nulls && values.GetNullCount() > 0) || + (values.length - values.GetNullCount() < options.min_count)) { + return PrepareOutput(/*n=*/0, ctx, *out->type(), out).status(); + } + + CountValues(values, this->min, this->counts.data()); + return GetResult(ctx, *out->type(), out); + } + + Status ExecChunked(KernelContext* ctx, const ExecBatch& batch, Datum* out) { + // count values in all chunks, ignore nulls + const ChunkedArray& values = *batch[0].chunked_array(); + const ModeOptions& 
options = ModeState::Get(ctx); + ExecResult result; + if ((!options.skip_nulls && values.null_count() > 0) || + (values.length() - values.null_count() < options.min_count)) { + RETURN_NOT_OK(PrepareOutput(/*n=*/0, ctx, *out->type(), &result)); + } else { + CountValues(values, this->min, this->counts.data()); + RETURN_NOT_OK(GetResult(ctx, *out->type(), &result)); + } + *out = result.array_data(); + return Status::OK(); } }; // booleans can be handled more straightforward template <> struct CountModer { - Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) { - const Datum& datum = batch[0]; + int64_t counts[2] = {0, 0}; - const ModeOptions& options = ModeState::Get(ctx); - if ((!options.skip_nulls && datum.null_count() > 0) || - (datum.length() - datum.null_count() < options.min_count)) { - return PrepareOutput(/*n=*/0, ctx, out).status(); + void UpdateCounts(const ArraySpan& values) { + if (values.length > values.GetNullCount()) { + const int64_t true_count = GetTrueCount(values); + counts[true] += true_count; + counts[false] += values.length - values.null_count - true_count; } + } - int64_t counts[2]{}; - - for (const auto& array : datum.chunks()) { - if (array->length() > array->null_count()) { - const int64_t true_count = - arrow::internal::checked_pointer_cast(array)->true_count(); - const int64_t false_count = array->length() - array->null_count() - true_count; - counts[true] += true_count; - counts[false] += false_count; - } + void UpdateCounts(const ChunkedArray& values) { + for (const auto& chunk : values.chunks()) { + UpdateCounts(*chunk->data()); } + } - const int64_t distinct_values = (counts[0] != 0) + (counts[1] != 0); + Status WrapResult(KernelContext* ctx, const ModeOptions& options, const DataType& type, + ExecResult* out) { + const int64_t distinct_values = (this->counts[0] != 0) + (this->counts[1] != 0); const int64_t n = std::min(options.n, distinct_values); bool* mode_buffer; int64_t* count_buffer; ARROW_ASSIGN_OR_RAISE(std::tie(mode_buffer, count_buffer), - PrepareOutput(n, ctx, out)); + PrepareOutput(n, ctx, type, out)); if (n >= 1) { const bool index = counts[1] > counts[0]; @@ -201,6 +217,32 @@ struct CountModer { return Status::OK(); } + + Status Exec(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) { + const ArraySpan& values = batch[0].array; + const ModeOptions& options = ModeState::Get(ctx); + if ((!options.skip_nulls && values.GetNullCount() > 0) || + (values.length - values.null_count < options.min_count)) { + return PrepareOutput(/*n=*/0, ctx, *out->type(), out).status(); + } + UpdateCounts(values); + return WrapResult(ctx, options, *out->type(), out); + } + + Status ExecChunked(KernelContext* ctx, const ExecBatch& batch, Datum* out) { + const ChunkedArray& values = *batch[0].chunked_array(); + const ModeOptions& options = ModeState::Get(ctx); + ExecResult result; + if ((!options.skip_nulls && values.null_count() > 0) || + (values.length() - values.null_count() < options.min_count)) { + RETURN_NOT_OK(PrepareOutput(/*n=*/0, ctx, *out->type(), &result)); + } else { + UpdateCounts(values); + RETURN_NOT_OK(WrapResult(ctx, options, *out->type(), &result)); + } + *out = result.array_data(); + return Status::OK(); + } }; // copy and sort approach for floating points, decimals, or integers with wide @@ -222,40 +264,38 @@ struct SortModer { return static_cast(0); } - Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) { - const Datum& datum = batch[0]; - const int64_t in_length = datum.length() - datum.null_count(); - + 
template + Status ComputeMode(KernelContext* ctx, const Container& arr, int64_t length, + int64_t null_count, const DataType& type, ExecResult* out) { const ModeOptions& options = ModeState::Get(ctx); - if ((!options.skip_nulls && datum.null_count() > 0) || - (in_length < options.min_count)) { - return PrepareOutput(/*n=*/0, ctx, out).status(); + const int64_t in_length = length - null_count; + if ((!options.skip_nulls && null_count > 0) || (in_length < options.min_count)) { + return PrepareOutput(/*n=*/0, ctx, type, out).status(); } // copy all chunks to a buffer, ignore nulls and nans - std::vector in_buffer(Allocator(ctx->memory_pool())); + std::vector values(Allocator(ctx->memory_pool())); uint64_t nan_count = 0; - if (in_length > 0) { - in_buffer.resize(in_length); - CopyNonNullValues(datum, in_buffer.data()); + if (length > 0) { + values.resize(length - null_count); + CopyNonNullValues(arr, values.data()); // drop nan if (is_floating_type::value) { - const auto& it = std::remove_if(in_buffer.begin(), in_buffer.end(), - [](CType v) { return v != v; }); - nan_count = in_buffer.end() - it; - in_buffer.resize(it - in_buffer.begin()); + const auto& it = + std::remove_if(values.begin(), values.end(), [](CType v) { return v != v; }); + nan_count = values.end() - it; + values.resize(it - values.begin()); } } - // sort the input data to count same values - std::sort(in_buffer.begin(), in_buffer.end()); + std::sort(values.begin(), values.end()); // generator to emit next value:count pair - auto it = in_buffer.cbegin(); + auto it = values.cbegin(); auto gen = [&]() { - if (ARROW_PREDICT_FALSE(it == in_buffer.cend())) { + if (ARROW_PREDICT_FALSE(it == values.cend())) { // handle NAN at last if (nan_count > 0) { auto value_count = std::make_pair(GetNan(), nan_count); @@ -270,37 +310,68 @@ struct SortModer { do { ++it; ++count; - } while (it != in_buffer.cend() && *it == value); + } while (it != values.cend() && *it == value); return std::make_pair(value, count); }; - return Finalize(ctx, out, std::move(gen)); + return Finalize(ctx, type, out, std::move(gen)); + } + + Status Exec(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) { + const ArraySpan& values = batch[0].array; + return ComputeMode(ctx, values, values.length, values.GetNullCount(), *out->type(), + out); + } + + Status ExecChunked(KernelContext* ctx, const ExecBatch& batch, Datum* out) { + const ChunkedArray& values = *batch[0].chunked_array(); + ExecResult result; + RETURN_NOT_OK(ComputeMode(ctx, values, values.length(), values.null_count(), + *out->type(), &result)); + *out = result.array_data(); + return Status::OK(); } }; +template +bool ShouldUseCountMode(const Container& values, int64_t num_valid, CType* min, + CType* max) { + // cross point to benefit from counting approach + // about 2x improvement for int32/64 from micro-benchmarking + static constexpr int kMinArraySize = 8192; + static constexpr int kMaxValueRange = 32768; + + if (num_valid >= kMinArraySize) { + std::tie(*min, *max) = GetMinMax(values); + return static_cast(*max) - static_cast(*min) <= kMaxValueRange; + } + return false; +} + // pick counting or sorting approach per integers value range template struct CountOrSortModer { using CType = typename T::c_type; - Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) { - // cross point to benefit from counting approach - // about 2x improvement for int32/64 from micro-benchmarking - static constexpr int kMinArraySize = 8192; - static constexpr int kMaxValueRange = 32768; - - const Datum& datum 
= batch[0]; - if (datum.length() - datum.null_count() >= kMinArraySize) { - CType min, max; - std::tie(min, max) = GetMinMax(datum); - - if (static_cast(max) - static_cast(min) <= kMaxValueRange) { - return CountModer(min, max).Exec(ctx, batch, out); - } + Status Exec(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) { + const ArraySpan& values = batch[0].array; + CType min, max; + if (ShouldUseCountMode(values, values.length - values.GetNullCount(), &min, + &max)) { + return CountModer(min, max).Exec(ctx, batch, out); } - return SortModer().Exec(ctx, batch, out); } + + Status ExecChunked(KernelContext* ctx, const ExecBatch& batch, Datum* out) { + const ChunkedArray& values = *batch[0].chunked_array(); + CType min, max; + if (ShouldUseCountMode(values, values.length() - values.null_count(), &min, + &max)) { + return CountModer(min, max).ExecChunked(ctx, batch, out); + } + return SortModer().ExecChunked(ctx, batch, out); + } }; template @@ -340,18 +411,18 @@ struct Moder> { }; template -Status ScalarMode(KernelContext* ctx, const Scalar& scalar, Datum* out) { +Status ScalarMode(KernelContext* ctx, const Scalar& scalar, ExecResult* out) { using CType = typename TypeTraits::CType; const ModeOptions& options = ModeState::Get(ctx); if ((!options.skip_nulls && !scalar.is_valid) || (static_cast(scalar.is_valid) < options.min_count)) { - return PrepareOutput(/*n=*/0, ctx, out).status(); + return PrepareOutput(/*n=*/0, ctx, *out->type(), out).status(); } if (scalar.is_valid) { bool called = false; - return Finalize(ctx, out, [&]() { + return Finalize(ctx, *out->type(), out, [&]() { if (!called) { called = true; return std::pair(UnboxScalar::Unbox(scalar), 1); @@ -359,37 +430,48 @@ Status ScalarMode(KernelContext* ctx, const Scalar& scalar, Datum* out) { return std::pair(static_cast(0), kCountEOF); }); } - return Finalize(ctx, out, []() { + return Finalize(ctx, *out->type(), out, []() { return std::pair(static_cast(0), kCountEOF); }); } -template -struct ModeExecutor { - static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) { - if (ctx->state() == nullptr) { - return Status::Invalid("Mode requires ModeOptions"); - } - const ModeOptions& options = ModeState::Get(ctx); - if (options.n <= 0) { - return Status::Invalid("ModeOptions::n must be strictly positive"); - } +Status CheckOptions(KernelContext* ctx) { + if (ctx->state() == nullptr) { + return Status::Invalid("Mode requires ModeOptions"); + } + const ModeOptions& options = ModeState::Get(ctx); + if (options.n <= 0) { + return Status::Invalid("ModeOptions::n must be strictly positive"); + } + return Status::OK(); +} +template +struct ModeExecutor { + static Status Exec(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) { + RETURN_NOT_OK(CheckOptions(ctx)); if (batch[0].is_scalar()) { - return ScalarMode(ctx, *batch[0].scalar(), out); + return ScalarMode(ctx, *batch[0].scalar, out); } - return Moder().impl.Exec(ctx, batch, out); } }; +template +struct ModeExecutorChunked { + static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) { + RETURN_NOT_OK(CheckOptions(ctx)); + return Moder().impl.ExecChunked(ctx, batch, out); + } +}; + Result ModeType(KernelContext*, const std::vector& descrs) { return ValueDescr::Array( struct_({field(kModeFieldName, descrs[0].type), field(kCountFieldName, int64())})); } -VectorKernel NewModeKernel(const std::shared_ptr& in_type, - ArrayKernelExecOld exec) { +VectorKernel NewModeKernel(const std::shared_ptr& in_type, ArrayKernelExec exec, + 
VectorKernel::ChunkedExec exec_chunked) { VectorKernel kernel; kernel.init = ModeState::Init; kernel.can_execute_chunkwise = false; @@ -409,6 +491,7 @@ VectorKernel NewModeKernel(const std::shared_ptr& in_type, } } kernel.exec = std::move(exec); + kernel.exec_chunked = exec_chunked; return kernel; } @@ -431,17 +514,22 @@ void RegisterScalarAggregateMode(FunctionRegistry* registry) { auto func = std::make_shared("mode", Arity::Unary(), mode_doc, &default_options); DCHECK_OK(func->AddKernel( - NewModeKernel(boolean(), ModeExecutor::Exec))); + NewModeKernel(boolean(), ModeExecutor::Exec, + ModeExecutorChunked::Exec))); for (const auto& type : NumericTypes()) { // TODO(wesm): - DCHECK_OK(func->AddKernel( - NewModeKernel(type, GenerateNumericOld(*type)))); + DCHECK_OK(func->AddKernel(NewModeKernel( + type, GenerateNumeric(*type), + GenerateNumeric( + *type)))); } // Type parameters are ignored DCHECK_OK(func->AddKernel( - NewModeKernel(decimal128(1, 0), ModeExecutor::Exec))); + NewModeKernel(decimal128(1, 0), ModeExecutor::Exec, + ModeExecutorChunked::Exec))); DCHECK_OK(func->AddKernel( - NewModeKernel(decimal256(1, 0), ModeExecutor::Exec))); + NewModeKernel(decimal256(1, 0), ModeExecutor::Exec, + ModeExecutorChunked::Exec))); DCHECK_OK(registry->AddFunction(std::move(func))); } diff --git a/cpp/src/arrow/compute/kernels/aggregate_quantile.cc b/cpp/src/arrow/compute/kernels/aggregate_quantile.cc index d18d8425946..7b989bfe5f5 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_quantile.cc +++ b/cpp/src/arrow/compute/kernels/aggregate_quantile.cc @@ -89,39 +89,19 @@ struct SortQuantiler { using CType = typename TypeTraits::CType; using Allocator = arrow::stl::allocator; - Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) { - const QuantileOptions& options = QuantileState::Get(ctx); - const Datum& datum = batch[0]; - - // copy all chunks to a buffer, ignore nulls and nans - std::vector in_buffer(Allocator(ctx->memory_pool())); - int64_t in_length = 0; - if ((!options.skip_nulls && datum.null_count() > 0) || - (datum.length() - datum.null_count() < options.min_count)) { - in_length = 0; - } else { - in_length = datum.length() - datum.null_count(); - } - - if (in_length > 0) { - in_buffer.resize(in_length); - CopyNonNullValues(datum, in_buffer.data()); - - // drop nan - if (is_floating_type::value) { - const auto& it = std::remove_if(in_buffer.begin(), in_buffer.end(), - [](CType v) { return v != v; }); - in_buffer.resize(it - in_buffer.begin()); - } - } - + Status ComputeQuantile(KernelContext* ctx, const QuantileOptions& options, + const std::shared_ptr& type, + std::vector& in_buffer, ExecResult* out) { // prepare out array // out type depends on options const bool is_datapoint = IsDataPoint(options); - const std::shared_ptr out_type = is_datapoint ? datum.type() : float64(); + const std::shared_ptr out_type = is_datapoint ? 
type : float64(); int64_t out_length = options.q.size(); if (in_buffer.empty()) { - return MakeArrayOfNull(out_type, out_length, ctx->memory_pool()).Value(out); + ARROW_ASSIGN_OR_RAISE(std::shared_ptr result, + MakeArrayOfNull(out_type, out_length, ctx->memory_pool())); + out->value = result->data(); + return Status::OK(); } auto out_data = ArrayData::Make(out_type, out_length, 0); out_data->buffers.resize(2, nullptr); @@ -129,7 +109,7 @@ struct SortQuantiler { // calculate quantiles if (out_length > 0) { ARROW_ASSIGN_OR_RAISE(out_data->buffers[1], - ctx->Allocate(out_length * GetBitWidth(*out_type) / 8)); + ctx->Allocate(out_length * out_type->byte_width())); // find quantiles in descending order std::vector q_indices(out_length); @@ -153,14 +133,59 @@ struct SortQuantiler { double* out_buffer = out_data->template GetMutableValues(1); for (int64_t i = 0; i < out_length; ++i) { const int64_t q_index = q_indices[i]; - out_buffer[q_index] = - GetQuantileByInterp(in_buffer, &last_index, options.q[q_index], - options.interpolation, *datum.type()); + out_buffer[q_index] = GetQuantileByInterp( + in_buffer, &last_index, options.q[q_index], options.interpolation, *type); } } } - *out = Datum(std::move(out_data)); + out->value = std::move(out_data); + return Status::OK(); + } + + template + void FillBuffer(const QuantileOptions& options, const Container& container, + int64_t length, int64_t null_count, + std::vector* in_buffer) { + int64_t in_length = 0; + if ((!options.skip_nulls && null_count > 0) || + (length - null_count < options.min_count)) { + in_length = 0; + } else { + in_length = length - null_count; + } + + if (in_length > 0) { + in_buffer->resize(in_length); + CopyNonNullValues(container, in_buffer->data()); + + // drop nan + if (is_floating_type::value) { + const auto& it = std::remove_if(in_buffer->begin(), in_buffer->end(), + [](CType v) { return v != v; }); + in_buffer->resize(it - in_buffer->begin()); + } + } + } + + Status Exec(KernelContext* ctx, const ArraySpan& values, ExecResult* out) { + const QuantileOptions& options = QuantileState::Get(ctx); + + // copy all chunks to a buffer, ignore nulls and nans + std::vector in_buffer(Allocator(ctx->memory_pool())); + FillBuffer(options, values, values.length, values.GetNullCount(), &in_buffer); + return ComputeQuantile(ctx, options, values.type->Copy(), in_buffer, out); + } + + Status Exec(KernelContext* ctx, const ChunkedArray& values, Datum* out) { + const QuantileOptions& options = QuantileState::Get(ctx); + + // copy all chunks to a buffer, ignore nulls and nans + std::vector in_buffer(Allocator(ctx->memory_pool())); + FillBuffer(options, values, values.length(), values.null_count(), &in_buffer); + ExecResult result; + RETURN_NOT_OK(ComputeQuantile(ctx, options, values.type(), in_buffer, &result)); + *out = result.array_data(); return Status::OK(); } @@ -245,17 +270,8 @@ struct CountQuantiler { this->counts.resize(value_range, 0); } - Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) { - const QuantileOptions& options = QuantileState::Get(ctx); - - // count values in all chunks, ignore nulls - const Datum& datum = batch[0]; - int64_t in_length = 0; - if ((options.skip_nulls || (!options.skip_nulls && datum.null_count() == 0)) && - (datum.length() - datum.null_count() >= options.min_count)) { - in_length = CountValues(this->counts.data(), datum, this->min); - } - + Status ComputeQuantile(KernelContext* ctx, const QuantileOptions& options, + int64_t in_length, ExecResult* out) { // prepare out array // out type 
     // out type depends on options
     const bool is_datapoint = IsDataPoint(options);
@@ -263,7 +279,10 @@ struct CountQuantiler {
         is_datapoint ? TypeTraits::type_singleton() : float64();
     int64_t out_length = options.q.size();
     if (in_length == 0) {
-      return MakeArrayOfNull(out_type, out_length, ctx->memory_pool()).Value(out);
+      ARROW_ASSIGN_OR_RAISE(std::shared_ptr result,
+                            MakeArrayOfNull(out_type, out_length, ctx->memory_pool()));
+      out->value = std::move(result->data());
+      return Status::OK();
     }
     auto out_data = ArrayData::Make(out_type, out_length, 0);
     out_data->buffers.resize(2, nullptr);
@@ -271,7 +290,7 @@ struct CountQuantiler {
     // calculate quantiles
     if (out_length > 0) {
       ARROW_ASSIGN_OR_RAISE(out_data->buffers[1],
-                            ctx->Allocate(out_length * GetBitWidth(*out_type) / 8));
+                            ctx->Allocate(out_length * out_type->byte_width()));
 
       // find quantiles in ascending order
       std::vector q_indices(out_length);
@@ -298,8 +317,36 @@ struct CountQuantiler {
         }
       }
     }
+    out->value = std::move(out_data);
+    return Status::OK();
+  }
+
+  Status Exec(KernelContext* ctx, const ArraySpan& values, ExecResult* out) {
+    const QuantileOptions& options = QuantileState::Get(ctx);
+
+    // count values in all chunks, ignore nulls
+    int64_t in_length = 0;
+    if ((options.skip_nulls || (!options.skip_nulls && values.GetNullCount() == 0)) &&
+        (values.length - values.GetNullCount() >= options.min_count)) {
+      in_length = CountValues(values, this->min, this->counts.data());
+    }
+
+    return ComputeQuantile(ctx, options, in_length, out);
+  }
+
+  Status Exec(KernelContext* ctx, const ChunkedArray& values, Datum* out) {
+    const QuantileOptions& options = QuantileState::Get(ctx);
 
-    *out = Datum(std::move(out_data));
+    // count values in all chunks, ignore nulls
+    int64_t in_length = 0;
+    if ((options.skip_nulls || (!options.skip_nulls && values.null_count() == 0)) &&
+        (values.length() - values.null_count() >= options.min_count)) {
+      in_length = CountValues(values, this->min, this->counts.data());
+    }
+
+    ExecResult result;
+    RETURN_NOT_OK(ComputeQuantile(ctx, options, in_length, &result));
+    *out = result.array_data();
     return Status::OK();
   }
 
@@ -365,23 +412,31 @@ template
 struct CountOrSortQuantiler {
   using CType = typename InType::c_type;
 
-  Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
-    // cross point to benefit from histogram approach
-    // parameters estimated from ad-hoc benchmarks manually
-    static constexpr int kMinArraySize = 65536;
-    static constexpr int kMaxValueRange = 65536;
+  // cross point to benefit from histogram approach
+  // parameters estimated from ad-hoc benchmarks manually
+  static constexpr int kMinArraySize = 65536;
+  static constexpr int kMaxValueRange = 65536;
 
-    const Datum& datum = batch[0];
-    if (datum.length() - datum.null_count() >= kMinArraySize) {
+  Status Exec(KernelContext* ctx, const ArraySpan& values, ExecResult* out) {
+    if (values.length - values.GetNullCount() >= kMinArraySize) {
       CType min, max;
-      std::tie(min, max) = GetMinMax(datum);
-
+      std::tie(min, max) = GetMinMax(values);
       if (static_cast(max) - static_cast(min) <= kMaxValueRange) {
-        return CountQuantiler(min, max).Exec(ctx, batch, out);
+        return CountQuantiler(min, max).Exec(ctx, values, out);
       }
     }
+    return SortQuantiler().Exec(ctx, values, out);
+  }
 
-    return SortQuantiler().Exec(ctx, batch, out);
+  Status Exec(KernelContext* ctx, const ChunkedArray& values, Datum* out) {
+    if (values.length() - values.null_count() >= kMinArraySize) {
+      CType min, max;
+      std::tie(min, max) = GetMinMax(values);
+      if (static_cast(max) - static_cast(min) <= kMaxValueRange) {
+        return CountQuantiler(min, max).Exec(ctx, values, out);
+      }
+    }
+    return SortQuantiler().Exec(ctx, values, out);
   }
 };
 
@@ -417,15 +472,14 @@ struct ExactQuantiler::value>> {
 };
 
 template
-Status ScalarQuantile(KernelContext* ctx, const QuantileOptions& options,
-                      const Scalar& scalar, Datum* out) {
+Status ScalarQuantile(KernelContext* ctx, const Scalar& scalar, ExecResult* out) {
+  const QuantileOptions& options = QuantileState::Get(ctx);
   using CType = typename TypeTraits::CType;
-  ArrayData* output = out->mutable_array();
+  ArrayData* output = out->array_data().get();
   output->length = options.q.size();
   auto out_type = IsDataPoint(options) ? scalar.type : float64();
-  ARROW_ASSIGN_OR_RAISE(
-      output->buffers[1],
-      ctx->Allocate(output->length * bit_util::BytesForBits(GetBitWidth(*out_type))));
+  ARROW_ASSIGN_OR_RAISE(output->buffers[1],
+                        ctx->Allocate(output->length * out_type->byte_width()));
 
   if (!scalar.is_valid || options.min_count > 1) {
     output->null_count = output->length;
@@ -456,28 +510,39 @@ Status ScalarQuantile(KernelContext* ctx, const QuantileOptions& options,
   return Status::OK();
 }
 
-template
-struct QuantileExecutor {
-  static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
-    if (ctx->state() == nullptr) {
-      return Status::Invalid("Quantile requires QuantileOptions");
-    }
+Status CheckQuantileOptions(KernelContext* ctx) {
+  if (ctx->state() == nullptr) {
+    return Status::Invalid("Quantile requires QuantileOptions");
+  }
 
-    const QuantileOptions& options = QuantileState::Get(ctx);
-    if (options.q.empty()) {
-      return Status::Invalid("Requires quantile argument");
-    }
-    for (double q : options.q) {
-      if (q < 0 || q > 1) {
-        return Status::Invalid("Quantile must be between 0 and 1");
-      }
+  const QuantileOptions& options = QuantileState::Get(ctx);
+  if (options.q.empty()) {
+    return Status::Invalid("Requires quantile argument");
+  }
+  for (double q : options.q) {
+    if (q < 0 || q > 1) {
+      return Status::Invalid("Quantile must be between 0 and 1");
    }
+  }
+  return Status::OK();
+}
 
+template
+struct QuantileExecutor {
+  static Status Exec(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) {
+    RETURN_NOT_OK(CheckQuantileOptions(ctx));
     if (batch[0].is_scalar()) {
-      return ScalarQuantile(ctx, options, *batch[0].scalar(), out);
+      return ScalarQuantile(ctx, *batch[0].scalar, out);
     }
+    return ExactQuantiler().impl.Exec(ctx, batch[0].array, out);
+  }
+};
 
-    return ExactQuantiler().impl.Exec(ctx, batch, out);
+template
+struct QuantileExecutorChunked {
+  static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+    RETURN_NOT_OK(CheckQuantileOptions(ctx));
+    return ExactQuantiler().impl.Exec(ctx, *batch[0].chunked_array(), out);
   }
 };
 
@@ -500,19 +565,24 @@ void AddQuantileKernels(VectorFunction* func) {
   for (const auto& ty : NumericTypes()) {
     base.signature = KernelSignature::Make({InputType(ty)}, OutputType(ResolveOutput));
     // output type is determined at runtime, set template argument to nulltype
-    base.exec = GenerateNumericOld(*ty);
+    base.exec = GenerateNumeric(*ty);
+    base.exec_chunked =
+        GenerateNumeric(
+            *ty);
     DCHECK_OK(func->AddKernel(base));
   }
   {
     base.signature = KernelSignature::Make({InputType(Type::DECIMAL128)}, OutputType(ResolveOutput));
     base.exec = QuantileExecutor::Exec;
+    base.exec_chunked = QuantileExecutorChunked::Exec;
     DCHECK_OK(func->AddKernel(base));
   }
   {
     base.signature = KernelSignature::Make({InputType(Type::DECIMAL256)}, OutputType(ResolveOutput));
     base.exec = QuantileExecutor::Exec;
+    base.exec_chunked = QuantileExecutorChunked::Exec;
     DCHECK_OK(func->AddKernel(base));
   }
 }
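For context: the quantile changes above keep a single ExactQuantiler implementation but reach it through two entry points, QuantileExecutor for spans and QuantileExecutorChunked for whole chunked arrays. A small sketch of calling it over chunked input follows; RunQuantileExample and the sample data are invented for illustration, and it assumes Arrow's C++ compute library is linked in.

#include <arrow/api.h>
#include <arrow/compute/api.h>

#include <iostream>
#include <memory>
#include <vector>

arrow::Status RunQuantileExample() {
  arrow::DoubleBuilder builder;
  ARROW_RETURN_NOT_OK(builder.AppendValues({1.0, 2.0, 3.0, 4.0}));
  ARROW_ASSIGN_OR_RAISE(auto chunk, builder.Finish());
  auto chunked =
      std::make_shared<arrow::ChunkedArray>(arrow::ArrayVector{chunk, chunk});

  // Request the median and the 90th percentile with linear interpolation.
  arrow::compute::QuantileOptions options(std::vector<double>{0.5, 0.9},
                                          arrow::compute::QuantileOptions::LINEAR);
  ARROW_ASSIGN_OR_RAISE(arrow::Datum result,
                        arrow::compute::Quantile(arrow::Datum(chunked), options));
  std::cout << result.make_array()->ToString() << std::endl;  // float64 array, length 2
  return arrow::Status::OK();
}

With a data-point interpolation mode (lower, higher, nearest) the result would instead keep the input type, matching the out_type selection in the hunks above.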
diff --git a/cpp/src/arrow/compute/kernels/codegen_internal.cc b/cpp/src/arrow/compute/kernels/codegen_internal.cc
index c696e6376f7..9e32f9e7f6d 100644
--- a/cpp/src/arrow/compute/kernels/codegen_internal.cc
+++ b/cpp/src/arrow/compute/kernels/codegen_internal.cc
@@ -33,18 +33,6 @@ Status ExecFail(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) {
   return Status::NotImplemented("This kernel is malformed");
 }
 
-Status ExecFailOld(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
-  return Status::NotImplemented("This kernel is malformed");
-}
-
-ArrayKernelExec MakeFlippedBinaryExec(ArrayKernelExec exec) {
-  return [exec](KernelContext* ctx, const ExecSpan& span, ExecResult* out) {
-    ExecSpan flipped_span = span;
-    std::swap(flipped_span.values[0], flipped_span.values[1]);
-    return exec(ctx, flipped_span, out);
-  };
-}
-
 const std::vector>& ExampleParametricTypes() {
   static DataTypeVector example_parametric_types = {
       decimal128(12, 2),
diff --git a/cpp/src/arrow/compute/kernels/codegen_internal.h b/cpp/src/arrow/compute/kernels/codegen_internal.h
index 8c3c7e3d423..bc21c4efb6a 100644
--- a/cpp/src/arrow/compute/kernels/codegen_internal.h
+++ b/cpp/src/arrow/compute/kernels/codegen_internal.h
@@ -456,14 +456,6 @@ Result FirstType(KernelContext*, const std::vector& desc
 Result LastType(KernelContext*, const std::vector& descrs);
 Result ListValuesType(KernelContext*, const std::vector& args);
 
-// ----------------------------------------------------------------------
-// Generate an array kernel given template classes
-
-Status ExecFail(KernelContext* ctx, const ExecSpan& batch, ExecResult* out);
-Status ExecFailOld(KernelContext* ctx, const ExecBatch& batch, Datum* out);
-
-ArrayKernelExec MakeFlippedBinaryExec(ArrayKernelExec exec);
-
 // ----------------------------------------------------------------------
 // Helpers for iterating over common DataType instances for adding kernels to
 // functions
@@ -1032,41 +1024,29 @@ struct GetTypeId {
 
 }  // namespace detail
 
-// GD for numeric types (integer and floating point)
-template