From dcfb181f68e2518ee7d6a65ee29a17733ca4fda4 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Mon, 6 Jun 2022 21:51:52 -0500 Subject: [PATCH 01/15] Introduce ArraySpan, ExecSpan data structures and new lighter-weight method of scalar kernel evaluation Checkpoint, getting closer to compilation libarrow builds again Get everything compiling again, compute internals tests passing Get Bitmap test cases passing again Don't try filling validity bitmap if none was allocated Fix bit block visitors and a few arithmetic kernels Refactor int_util.h to use ArraySpan Some more tests passing Fix some more unit tests, compilation Another fix Fix some more tests Fix some more bugs Fixed some more tests All scalar_if_else.cc tests passing again Work partway through scalar_nested.cc Down to only a few failing scalar tests scalar kernels tests passing again fix scalar executor, tests all passing now --- cpp/src/arrow/array/array_dict.cc | 6 +- cpp/src/arrow/array/array_nested.cc | 7 +- cpp/src/arrow/array/array_nested.h | 2 +- cpp/src/arrow/array/array_test.cc | 29 +- cpp/src/arrow/array/builder_base.h | 2 +- cpp/src/arrow/array/builder_binary.h | 4 +- cpp/src/arrow/array/builder_dict.h | 9 +- cpp/src/arrow/array/builder_nested.h | 26 +- cpp/src/arrow/array/builder_primitive.h | 6 +- cpp/src/arrow/array/builder_union.cc | 8 +- cpp/src/arrow/array/builder_union.h | 4 +- cpp/src/arrow/array/data.cc | 140 ++ cpp/src/arrow/array/data.h | 124 ++ cpp/src/arrow/array/util.cc | 4 +- cpp/src/arrow/builder.cc | 2 +- cpp/src/arrow/chunked_array.h | 2 +- cpp/src/arrow/compute/cast.cc | 2 +- cpp/src/arrow/compute/cast.h | 2 +- cpp/src/arrow/compute/exec.cc | 591 +++++-- cpp/src/arrow/compute/exec.h | 188 ++ cpp/src/arrow/compute/exec_internal.h | 65 +- cpp/src/arrow/compute/exec_test.cc | 501 +++++- cpp/src/arrow/compute/function.cc | 4 +- cpp/src/arrow/compute/function.h | 4 +- cpp/src/arrow/compute/function_benchmark.cc | 22 +- cpp/src/arrow/compute/function_test.cc | 34 +- cpp/src/arrow/compute/kernel.h | 97 +- .../arrow/compute/kernels/aggregate_basic.cc | 4 +- .../arrow/compute/kernels/aggregate_mode.cc | 5 +- .../compute/kernels/aggregate_quantile.cc | 2 +- .../arrow/compute/kernels/codegen_internal.cc | 16 +- .../arrow/compute/kernels/codegen_internal.h | 660 +++++-- .../arrow/compute/kernels/hash_aggregate.cc | 4 +- .../compute/kernels/scalar_arithmetic.cc | 83 +- .../arrow/compute/kernels/scalar_boolean.cc | 304 ++-- .../compute/kernels/scalar_cast_boolean.cc | 7 +- .../compute/kernels/scalar_cast_dictionary.cc | 45 +- .../compute/kernels/scalar_cast_internal.cc | 118 +- .../compute/kernels/scalar_cast_internal.h | 18 +- .../compute/kernels/scalar_cast_nested.cc | 51 +- .../compute/kernels/scalar_cast_numeric.cc | 65 +- .../compute/kernels/scalar_cast_string.cc | 131 +- .../compute/kernels/scalar_cast_temporal.cc | 75 +- .../arrow/compute/kernels/scalar_compare.cc | 138 +- .../arrow/compute/kernels/scalar_if_else.cc | 1512 +++++++++-------- .../arrow/compute/kernels/scalar_nested.cc | 364 ++-- .../compute/kernels/scalar_nested_test.cc | 13 +- .../arrow/compute/kernels/scalar_random.cc | 20 +- .../compute/kernels/scalar_set_lookup.cc | 98 +- .../compute/kernels/scalar_set_lookup_test.cc | 12 +- .../compute/kernels/scalar_string_ascii.cc | 440 ++--- .../compute/kernels/scalar_string_internal.h | 66 +- .../compute/kernels/scalar_string_utf8.cc | 32 +- .../compute/kernels/scalar_temporal_binary.cc | 20 +- .../compute/kernels/scalar_temporal_unary.cc | 129 +- .../arrow/compute/kernels/scalar_validity.cc | 127 +- 
.../compute/kernels/scalar_validity_test.cc | 6 - .../arrow/compute/kernels/temporal_internal.h | 30 +- .../arrow/compute/kernels/util_internal.cc | 65 +- cpp/src/arrow/compute/kernels/util_internal.h | 18 +- .../compute/kernels/vector_array_sort.cc | 10 +- .../compute/kernels/vector_cumulative_ops.cc | 6 +- .../arrow/compute/kernels/vector_replace.cc | 6 +- .../arrow/compute/kernels/vector_selection.cc | 13 +- cpp/src/arrow/datum.h | 2 + cpp/src/arrow/pretty_print.cc | 24 +- cpp/src/arrow/python/arrow_to_pandas.cc | 9 +- cpp/src/arrow/python/udf.cc | 50 +- cpp/src/arrow/scalar.cc | 2 +- cpp/src/arrow/scalar.h | 24 +- cpp/src/arrow/type.h | 26 +- cpp/src/arrow/util/bit_block_counter.h | 105 +- cpp/src/arrow/util/bit_util_benchmark.cc | 15 +- cpp/src/arrow/util/bit_util_test.cc | 35 +- cpp/src/arrow/util/bitmap.cc | 24 +- cpp/src/arrow/util/bitmap.h | 61 +- cpp/src/arrow/util/formatting.h | 18 +- cpp/src/arrow/util/formatting_util_test.cc | 33 +- cpp/src/arrow/util/int_util.cc | 149 +- cpp/src/arrow/util/int_util.h | 13 +- cpp/src/arrow/util/int_util_benchmark.cc | 4 +- cpp/src/arrow/util/int_util_test.cc | 32 +- 82 files changed, 4517 insertions(+), 2707 deletions(-) diff --git a/cpp/src/arrow/array/array_dict.cc b/cpp/src/arrow/array/array_dict.cc index dbfc7bd7586..0a4d33e03da 100644 --- a/cpp/src/arrow/array/array_dict.cc +++ b/cpp/src/arrow/array/array_dict.cc @@ -125,7 +125,7 @@ Result> DictionaryArray::FromArrays( "Dictionary type's index type does not match " "indices array's type"); } - RETURN_NOT_OK(internal::CheckIndexBounds(*indices->data(), + RETURN_NOT_OK(internal::CheckIndexBounds(ArraySpan(*indices->data()), static_cast(dictionary->length()))); return std::make_shared(type, indices, dictionary); } @@ -290,8 +290,8 @@ class DictionaryUnifierImpl : public DictionaryUnifier { Status GetResultWithIndexType(const std::shared_ptr& index_type, std::shared_ptr* out_dict) override { - int64_t dict_length = memo_table_.size(); - if (!internal::IntegersCanFit(Datum(dict_length), *index_type).ok()) { + Int64Scalar dict_length(memo_table_.size()); + if (!internal::IntegersCanFit(dict_length, *index_type).ok()) { return Status::Invalid( "These dictionaries cannot be combined. 
The unified dictionary requires a " "larger index type."); diff --git a/cpp/src/arrow/array/array_nested.cc b/cpp/src/arrow/array/array_nested.cc index 757ab8e9c3f..64edec0c7aa 100644 --- a/cpp/src/arrow/array/array_nested.cc +++ b/cpp/src/arrow/array/array_nested.cc @@ -569,7 +569,7 @@ const ArrayVector& StructArray::fields() const { return boxed_fields_; } -std::shared_ptr StructArray::field(int i) const { +const std::shared_ptr& StructArray::field(int i) const { std::shared_ptr result = internal::atomic_load(&boxed_fields_[i]); if (!result) { std::shared_ptr field_data; @@ -578,10 +578,11 @@ std::shared_ptr StructArray::field(int i) const { } else { field_data = data_->child_data[i]; } - result = MakeArray(field_data); + std::shared_ptr result = MakeArray(field_data); internal::atomic_store(&boxed_fields_[i], result); + return boxed_fields_[i]; } - return result; + return boxed_fields_[i]; } std::shared_ptr StructArray::GetFieldByName(const std::string& name) const { diff --git a/cpp/src/arrow/array/array_nested.h b/cpp/src/arrow/array/array_nested.h index be6371941df..5d04bef4f9e 100644 --- a/cpp/src/arrow/array/array_nested.h +++ b/cpp/src/arrow/array/array_nested.h @@ -378,7 +378,7 @@ class ARROW_EXPORT StructArray : public Array { // Return a shared pointer in case the requestor desires to share ownership // with this array. The returned array has its offset, length and null // count adjusted. - std::shared_ptr field(int pos) const; + const std::shared_ptr& field(int pos) const; const ArrayVector& fields() const; diff --git a/cpp/src/arrow/array/array_test.cc b/cpp/src/arrow/array/array_test.cc index 929c4d3bf4d..6057796875c 100644 --- a/cpp/src/arrow/array/array_test.cc +++ b/cpp/src/arrow/array/array_test.cc @@ -696,6 +696,7 @@ TEST_F(TestArray, TestMakeEmptyArray) { TEST_F(TestArray, TestAppendArraySlice) { auto scalars = GetScalars(); + ArraySpan span; for (const auto& scalar : scalars) { ARROW_SCOPED_TRACE(*scalar->type); ASSERT_OK_AND_ASSIGN(auto array, MakeArrayFromScalar(*scalar, 16)); @@ -704,31 +705,33 @@ TEST_F(TestArray, TestAppendArraySlice) { std::unique_ptr builder; ASSERT_OK(MakeBuilder(pool_, scalar->type, &builder)); - ASSERT_OK(builder->AppendArraySlice(*array->data(), 0, 4)); + span.SetMembers(*array->data()); + ASSERT_OK(builder->AppendArraySlice(span, 0, 4)); ASSERT_EQ(4, builder->length()); - ASSERT_OK(builder->AppendArraySlice(*array->data(), 0, 0)); + ASSERT_OK(builder->AppendArraySlice(span, 0, 0)); ASSERT_EQ(4, builder->length()); - ASSERT_OK(builder->AppendArraySlice(*array->data(), 1, 0)); + ASSERT_OK(builder->AppendArraySlice(span, 1, 0)); ASSERT_EQ(4, builder->length()); - ASSERT_OK(builder->AppendArraySlice(*array->data(), 1, 4)); + ASSERT_OK(builder->AppendArraySlice(span, 1, 4)); ASSERT_EQ(8, builder->length()); - ASSERT_OK(builder->AppendArraySlice(*nulls->data(), 0, 4)); + span.SetMembers(*nulls->data()); + ASSERT_OK(builder->AppendArraySlice(span, 0, 4)); ASSERT_EQ(12, builder->length()); if (!is_union(scalar->type->id())) { ASSERT_EQ(4, builder->null_count()); } - ASSERT_OK(builder->AppendArraySlice(*nulls->data(), 0, 0)); + ASSERT_OK(builder->AppendArraySlice(span, 0, 0)); ASSERT_EQ(12, builder->length()); if (!is_union(scalar->type->id())) { ASSERT_EQ(4, builder->null_count()); } - ASSERT_OK(builder->AppendArraySlice(*nulls->data(), 1, 0)); + ASSERT_OK(builder->AppendArraySlice(span, 1, 0)); ASSERT_EQ(12, builder->length()); if (!is_union(scalar->type->id())) { ASSERT_EQ(4, builder->null_count()); } - 
ASSERT_OK(builder->AppendArraySlice(*nulls->data(), 1, 4)); + ASSERT_OK(builder->AppendArraySlice(span, 1, 4)); ASSERT_EQ(16, builder->length()); if (!is_union(scalar->type->id())) { ASSERT_EQ(8, builder->null_count()); @@ -746,13 +749,15 @@ TEST_F(TestArray, TestAppendArraySlice) { { ASSERT_OK_AND_ASSIGN(auto array, MakeArrayOfNull(null(), 16)); NullBuilder builder(pool_); - ASSERT_OK(builder.AppendArraySlice(*array->data(), 0, 4)); + + span.SetMembers(*array->data()); + ASSERT_OK(builder.AppendArraySlice(span, 0, 4)); ASSERT_EQ(4, builder.length()); - ASSERT_OK(builder.AppendArraySlice(*array->data(), 0, 0)); + ASSERT_OK(builder.AppendArraySlice(span, 0, 0)); ASSERT_EQ(4, builder.length()); - ASSERT_OK(builder.AppendArraySlice(*array->data(), 1, 0)); + ASSERT_OK(builder.AppendArraySlice(span, 1, 0)); ASSERT_EQ(4, builder.length()); - ASSERT_OK(builder.AppendArraySlice(*array->data(), 1, 4)); + ASSERT_OK(builder.AppendArraySlice(span, 1, 4)); ASSERT_EQ(8, builder.length()); std::shared_ptr result; ASSERT_OK(builder.Finish(&result)); diff --git a/cpp/src/arrow/array/builder_base.h b/cpp/src/arrow/array/builder_base.h index 4d0b477dcb8..bc4932a4b83 100644 --- a/cpp/src/arrow/array/builder_base.h +++ b/cpp/src/arrow/array/builder_base.h @@ -147,7 +147,7 @@ class ARROW_EXPORT ArrayBuilder { /// \brief Append a range of values from an array. /// /// The given array must be the same type as the builder. - virtual Status AppendArraySlice(const ArrayData& array, int64_t offset, + virtual Status AppendArraySlice(const ArraySpan& array, int64_t offset, int64_t length) { return Status::NotImplemented("AppendArraySlice for builder for ", *type()); } diff --git a/cpp/src/arrow/array/builder_binary.h b/cpp/src/arrow/array/builder_binary.h index 703355bf278..25cec5c1e25 100644 --- a/cpp/src/arrow/array/builder_binary.h +++ b/cpp/src/arrow/array/builder_binary.h @@ -278,7 +278,7 @@ class BaseBinaryBuilder : public ArrayBuilder { return Status::OK(); } - Status AppendArraySlice(const ArrayData& array, int64_t offset, + Status AppendArraySlice(const ArraySpan& array, int64_t offset, int64_t length) override { auto bitmap = array.GetValues(0, 0); auto offsets = array.GetValues(1); @@ -516,7 +516,7 @@ class ARROW_EXPORT FixedSizeBinaryBuilder : public ArrayBuilder { Status AppendEmptyValue() final; Status AppendEmptyValues(int64_t length) final; - Status AppendArraySlice(const ArrayData& array, int64_t offset, + Status AppendArraySlice(const ArraySpan& array, int64_t offset, int64_t length) override { return AppendValues( array.GetValues(1, 0) + ((array.offset + offset) * byte_width_), length, diff --git a/cpp/src/arrow/array/builder_dict.h b/cpp/src/arrow/array/builder_dict.h index 90a7c48d9f0..b720f73d7d2 100644 --- a/cpp/src/arrow/array/builder_dict.h +++ b/cpp/src/arrow/array/builder_dict.h @@ -366,10 +366,11 @@ class DictionaryBuilderBase : public ArrayBuilder { return Status::OK(); } - Status AppendArraySlice(const ArrayData& array, int64_t offset, int64_t length) final { + Status AppendArraySlice(const ArraySpan& array, int64_t offset, int64_t length) final { // Visit the indices and insert the unpacked values. 
const auto& dict_ty = internal::checked_cast(*array.type); - const typename TypeTraits::ArrayType dict(array.dictionary); + // See if possible to avoid using ToArrayData here + const typename TypeTraits::ArrayType dict(array.dictionary().ToArrayData()); ARROW_RETURN_NOT_OK(Reserve(length)); switch (dict_ty.index_type()->id()) { case Type::UINT8: @@ -490,10 +491,10 @@ class DictionaryBuilderBase : public ArrayBuilder { protected: template Status AppendArraySliceImpl(const typename TypeTraits::ArrayType& dict, - const ArrayData& array, int64_t offset, int64_t length) { + const ArraySpan& array, int64_t offset, int64_t length) { const c_type* values = array.GetValues(1) + offset; return VisitBitBlocks( - array.buffers[0], array.offset + offset, length, + array.buffers[0].data, array.offset + offset, length, [&](const int64_t position) { const int64_t index = static_cast(values[position]); if (dict.IsValid(index)) { diff --git a/cpp/src/arrow/array/builder_nested.h b/cpp/src/arrow/array/builder_nested.h index 4f39ce86294..3d36cb5f65e 100644 --- a/cpp/src/arrow/array/builder_nested.h +++ b/cpp/src/arrow/array/builder_nested.h @@ -126,15 +126,15 @@ class BaseListBuilder : public ArrayBuilder { return Status::OK(); } - Status AppendArraySlice(const ArrayData& array, int64_t offset, + Status AppendArraySlice(const ArraySpan& array, int64_t offset, int64_t length) override { const offset_type* offsets = array.GetValues(1); - const uint8_t* validity = array.MayHaveNulls() ? array.buffers[0]->data() : NULLPTR; + const uint8_t* validity = array.MayHaveNulls() ? array.buffers[0].data : NULLPTR; for (int64_t row = offset; row < offset + length; row++) { if (!validity || bit_util::GetBit(validity, array.offset + row)) { ARROW_RETURN_NOT_OK(Append()); int64_t slot_length = offsets[row + 1] - offsets[row]; - ARROW_RETURN_NOT_OK(value_builder_->AppendArraySlice(*array.child_data[0], + ARROW_RETURN_NOT_OK(value_builder_->AppendArraySlice(array.child_data[0], offsets[row], slot_length)); } else { ARROW_RETURN_NOT_OK(AppendNull()); @@ -296,18 +296,18 @@ class ARROW_EXPORT MapBuilder : public ArrayBuilder { Status AppendEmptyValues(int64_t length) final; - Status AppendArraySlice(const ArrayData& array, int64_t offset, + Status AppendArraySlice(const ArraySpan& array, int64_t offset, int64_t length) override { const int32_t* offsets = array.GetValues(1); - const uint8_t* validity = array.MayHaveNulls() ? array.buffers[0]->data() : NULLPTR; + const uint8_t* validity = array.MayHaveNulls() ? array.buffers[0].data : NULLPTR; for (int64_t row = offset; row < offset + length; row++) { if (!validity || bit_util::GetBit(validity, array.offset + row)) { ARROW_RETURN_NOT_OK(Append()); const int64_t slot_length = offsets[row + 1] - offsets[row]; ARROW_RETURN_NOT_OK(key_builder_->AppendArraySlice( - *array.child_data[0]->child_data[0], offsets[row], slot_length)); + array.child_data[0].child_data[0], offsets[row], slot_length)); ARROW_RETURN_NOT_OK(item_builder_->AppendArraySlice( - *array.child_data[0]->child_data[1], offsets[row], slot_length)); + array.child_data[0].child_data[1], offsets[row], slot_length)); } else { ARROW_RETURN_NOT_OK(AppendNull()); } @@ -425,12 +425,12 @@ class ARROW_EXPORT FixedSizeListBuilder : public ArrayBuilder { Status AppendEmptyValues(int64_t length) final; - Status AppendArraySlice(const ArrayData& array, int64_t offset, int64_t length) final { - const uint8_t* validity = array.MayHaveNulls() ? 
array.buffers[0]->data() : NULLPTR; + Status AppendArraySlice(const ArraySpan& array, int64_t offset, int64_t length) final { + const uint8_t* validity = array.MayHaveNulls() ? array.buffers[0].data : NULLPTR; for (int64_t row = offset; row < offset + length; row++) { if (!validity || bit_util::GetBit(validity, array.offset + row)) { ARROW_RETURN_NOT_OK(value_builder_->AppendArraySlice( - *array.child_data[0], list_size_ * (array.offset + row), list_size_)); + array.child_data[0], list_size_ * (array.offset + row), list_size_)); ARROW_RETURN_NOT_OK(Append()); } else { ARROW_RETURN_NOT_OK(AppendNull()); @@ -532,13 +532,13 @@ class ARROW_EXPORT StructBuilder : public ArrayBuilder { return Status::OK(); } - Status AppendArraySlice(const ArrayData& array, int64_t offset, + Status AppendArraySlice(const ArraySpan& array, int64_t offset, int64_t length) override { for (int i = 0; static_cast(i) < children_.size(); i++) { - ARROW_RETURN_NOT_OK(children_[i]->AppendArraySlice(*array.child_data[i], + ARROW_RETURN_NOT_OK(children_[i]->AppendArraySlice(array.child_data[i], array.offset + offset, length)); } - const uint8_t* validity = array.MayHaveNulls() ? array.buffers[0]->data() : NULLPTR; + const uint8_t* validity = array.MayHaveNulls() ? array.buffers[0].data : NULLPTR; ARROW_RETURN_NOT_OK(Reserve(length)); UnsafeAppendToBitmap(validity, array.offset + offset, length); return Status::OK(); diff --git a/cpp/src/arrow/array/builder_primitive.h b/cpp/src/arrow/array/builder_primitive.h index c72b48f0b32..8f2dcc8b09b 100644 --- a/cpp/src/arrow/array/builder_primitive.h +++ b/cpp/src/arrow/array/builder_primitive.h @@ -53,7 +53,7 @@ class ARROW_EXPORT NullBuilder : public ArrayBuilder { Status Append(std::nullptr_t) { return AppendNull(); } - Status AppendArraySlice(const ArrayData&, int64_t, int64_t length) override { + Status AppendArraySlice(const ArraySpan&, int64_t, int64_t length) override { return AppendNulls(length); } @@ -279,7 +279,7 @@ class NumericBuilder : public ArrayBuilder { return Status::OK(); } - Status AppendArraySlice(const ArrayData& array, int64_t offset, + Status AppendArraySlice(const ArraySpan& array, int64_t offset, int64_t length) override { return AppendValues(array.GetValues(1) + offset, length, array.GetValues(0, 0), array.offset + offset); @@ -513,7 +513,7 @@ class ARROW_EXPORT BooleanBuilder : public ArrayBuilder { Status AppendValues(int64_t length, bool value); - Status AppendArraySlice(const ArrayData& array, int64_t offset, + Status AppendArraySlice(const ArraySpan& array, int64_t offset, int64_t length) override { return AppendValues(array.GetValues(1, 0), length, array.GetValues(0, 0), array.offset + offset); diff --git a/cpp/src/arrow/array/builder_union.cc b/cpp/src/arrow/array/builder_union.cc index 6096b76ff21..883cda3d8b7 100644 --- a/cpp/src/arrow/array/builder_union.cc +++ b/cpp/src/arrow/array/builder_union.cc @@ -45,7 +45,7 @@ Status BasicUnionBuilder::FinishInternal(std::shared_ptr* out) { return Status::OK(); } -Status DenseUnionBuilder::AppendArraySlice(const ArrayData& array, const int64_t offset, +Status DenseUnionBuilder::AppendArraySlice(const ArraySpan& array, const int64_t offset, const int64_t length) { const int8_t* type_codes = array.GetValues(1); const int32_t* offsets = array.GetValues(2); @@ -55,7 +55,7 @@ Status DenseUnionBuilder::AppendArraySlice(const ArrayData& array, const int64_t const int32_t union_offset = offsets[row]; RETURN_NOT_OK(Append(type_code)); RETURN_NOT_OK(type_id_to_children_[type_code]->AppendArraySlice( - 
*array.child_data[child_id], union_offset, /*length=*/1)); + array.child_data[child_id], union_offset, /*length=*/1)); } return Status::OK(); } @@ -137,11 +137,11 @@ int8_t BasicUnionBuilder::NextTypeId() { return dense_type_id_++; } -Status SparseUnionBuilder::AppendArraySlice(const ArrayData& array, const int64_t offset, +Status SparseUnionBuilder::AppendArraySlice(const ArraySpan& array, const int64_t offset, const int64_t length) { for (size_t i = 0; i < type_codes_.size(); i++) { RETURN_NOT_OK(type_id_to_children_[type_codes_[i]]->AppendArraySlice( - *array.child_data[i], array.offset + offset, length)); + array.child_data[i], array.offset + offset, length)); } const int8_t* type_codes = array.GetValues(1); RETURN_NOT_OK(types_builder_.Append(type_codes + offset, length)); diff --git a/cpp/src/arrow/array/builder_union.h b/cpp/src/arrow/array/builder_union.h index f1629939ce3..eb8c5d3af0e 100644 --- a/cpp/src/arrow/array/builder_union.h +++ b/cpp/src/arrow/array/builder_union.h @@ -160,7 +160,7 @@ class ARROW_EXPORT DenseUnionBuilder : public BasicUnionBuilder { return offsets_builder_.Append(offset); } - Status AppendArraySlice(const ArrayData& array, int64_t offset, + Status AppendArraySlice(const ArraySpan& array, int64_t offset, int64_t length) override; Status FinishInternal(std::shared_ptr* out) override; @@ -239,7 +239,7 @@ class ARROW_EXPORT SparseUnionBuilder : public BasicUnionBuilder { /// is called, and all other child builders must have null or empty value appended. Status Append(int8_t next_type) { return types_builder_.Append(next_type); } - Status AppendArraySlice(const ArrayData& array, int64_t offset, + Status AppendArraySlice(const ArraySpan& array, int64_t offset, int64_t length) override; }; diff --git a/cpp/src/arrow/array/data.cc b/cpp/src/arrow/array/data.cc index 9dfc76c01ce..e1587695d6b 100644 --- a/cpp/src/arrow/array/data.cc +++ b/cpp/src/arrow/array/data.cc @@ -25,9 +25,12 @@ #include #include +#include "arrow/array/util.h" #include "arrow/buffer.h" +#include "arrow/scalar.h" #include "arrow/status.h" #include "arrow/type.h" +#include "arrow/type_traits.h" #include "arrow/util/bitmap_ops.h" #include "arrow/util/logging.h" #include "arrow/util/macros.h" @@ -128,6 +131,143 @@ int64_t ArrayData::GetNullCount() const { return precomputed; } +// ---------------------------------------------------------------------- +// Methods for ArraySpan + +void ArraySpan::SetMembers(const ArrayData& data) { + this->type = data.type.get(); + this->length = data.length; + this->null_count = data.null_count.load(); + this->offset = data.offset; + + for (size_t i = 0; i < data.buffers.size(); ++i) { + const std::shared_ptr& buffer = data.buffers[i]; + // It is the invoker-of-kernels's responsibility to ensure that + // const buffers are not written to accidentally. + if (buffer) { + SetBuffer(i, buffer); + } else { + ClearBuffer(i); + } + } + + // Makes sure any other buffers are seen as null / non-existent + for (size_t i = data.buffers.size(); i < 3; ++i) { + ClearBuffer(i); + } + + // TODO(wesm): what about extension arrays? 
+ + if (this->type->id() == Type::DICTIONARY) { + this->child_data.resize(1); + this->child_data[0].SetMembers(*data.dictionary); + } else { + this->child_data.resize(data.child_data.size()); + for (size_t child_index = 0; child_index < data.child_data.size(); ++child_index) { + this->child_data[child_index].SetMembers(*data.child_data[child_index]); + } + } +} + +void ArraySpan::FillFromScalar(const Scalar& value) { + static const uint8_t kValidByte = 0x01; + static const uint8_t kNullByte = 0x00; + + this->type = value.type.get(); + this->length = 1; + + // Populate null count and validity bitmap + this->null_count = value.is_valid ? 0 : 1; + this->buffers[0].data = const_cast(value.is_valid ? &kValidByte : &kNullByte); + this->buffers[0].size = 1; + + if (is_primitive(value.type->id())) { + const auto& scalar = + internal::checked_cast(value); + const uint8_t* scalar_data = reinterpret_cast(scalar.data()); + this->buffers[1].data = const_cast(scalar_data); + this->buffers[1].size = scalar.type->byte_width(); + } else { + // TODO(wesm): implement for other types + DCHECK(false) << "need to implement for other types"; + } +} + +int64_t ArraySpan::GetNullCount() const { + int64_t precomputed = this->null_count; + if (ARROW_PREDICT_FALSE(precomputed == kUnknownNullCount)) { + if (this->buffers[0].data != nullptr) { + precomputed = + this->length - CountSetBits(this->buffers[0].data, this->offset, this->length); + } else { + precomputed = 0; + } + this->null_count = precomputed; + } + return precomputed; +} + +int GetNumBuffers(const DataType& type) { + switch (type.id()) { + case Type::NA: + return 0; + case Type::STRUCT: + case Type::FIXED_SIZE_LIST: + return 1; + case Type::BINARY: + case Type::LARGE_BINARY: + case Type::STRING: + case Type::LARGE_STRING: + case Type::DENSE_UNION: + return 3; + case Type::EXTENSION: + // The number of buffers depends on the storage type + return GetNumBuffers( + *internal::checked_cast(type).storage_type()); + default: + // Everything else has 2 buffers + return 2; + } +} + +int ArraySpan::num_buffers() const { return GetNumBuffers(*this->type); } + +std::shared_ptr ArraySpan::ToArrayData() const { + auto result = std::make_shared(this->type->GetSharedPtr(), this->length, + kUnknownNullCount, this->offset); + + for (int i = 0; i < this->num_buffers(); ++i) { + if (this->buffers[i].owner) { + result->buffers.emplace_back(this->GetBuffer(i)); + } else { + result->buffers.push_back(nullptr); + } + } + + if (this->type->id() == Type::NA) { + result->null_count = this->length; + } else if (this->buffers[0].data == nullptr) { + // No validity bitmap, so the null count is 0 + result->null_count = 0; + } + + // TODO(wesm): what about extension arrays? 
+ + if (this->type->id() == Type::DICTIONARY) { + result->dictionary = this->dictionary().ToArrayData(); + } else { + // Emit children, too + for (size_t i = 0; i < this->child_data.size(); ++i) { + result->child_data.push_back(this->child_data[i].ToArrayData()); + } + } + return result; +} + +std::shared_ptr ArraySpan::ToArray() const { + return MakeArray(this->ToArrayData()); +} + // ---------------------------------------------------------------------- // Implement ArrayData::View diff --git a/cpp/src/arrow/array/data.h b/cpp/src/arrow/array/data.h index 418d09def6b..a6cbdcbe8ee 100644 --- a/cpp/src/arrow/array/data.h +++ b/cpp/src/arrow/array/data.h @@ -25,11 +25,14 @@ #include "arrow/buffer.h" #include "arrow/result.h" +#include "arrow/util/bit_util.h" #include "arrow/util/macros.h" #include "arrow/util/visibility.h" namespace arrow { +class Array; + // When slicing, we do not know the null count of the sliced range without // doing some computation. To avoid doing this eagerly, we set the null count // to -1 (any negative number will do). When Array::null_count is called the @@ -242,6 +245,127 @@ struct ARROW_EXPORT ArrayData { std::shared_ptr dictionary; }; +/// \brief A non-owning Buffer reference +struct ARROW_EXPORT BufferRef { + // It is the user of this class's responsibility to ensure that + // buffers that were const originally are not written to + // accidentally. + uint8_t* data = NULLPTR; + int64_t size = 0; + // Pointer back to buffer that owns this memory + const std::shared_ptr* owner = NULLPTR; +}; + +/// \brief EXPERIMENTAL: A non-owning ArrayData reference that is cheaply +/// copyable and does not contain any shared_ptr objects. Do not use in public +/// APIs aside from compute kernels for now +struct ARROW_EXPORT ArraySpan { + const DataType* type; + int64_t length = 0; + mutable int64_t null_count = kUnknownNullCount; + int64_t offset = 0; + BufferRef buffers[3]; + + ArraySpan() = default; + + explicit ArraySpan(const DataType* type, int64_t length) : type(type), length(length) {} + explicit ArraySpan(const ArrayData& data) { SetMembers(data); } + explicit ArraySpan(const Scalar& data) { FillFromScalar(data); } + + /// If dictionary-encoded, put dictionary in the first entry + // TODO(wesm): would a std::unique_ptr> be better? 
+ std::vector child_data; + + /// \brief Populate ArraySpan to look like an array of length 1 pointing at + /// the data members of a Scalar value + void FillFromScalar(const Scalar& value); + + void SetMembers(const ArrayData& data); + + void SetBuffer(int index, const std::shared_ptr& buffer) { + this->buffers[index].data = const_cast(buffer->data()); + this->buffers[index].size = buffer->size(); + this->buffers[index].owner = &buffer; + } + + void ClearBuffer(int index) { + this->buffers[index].data = NULLPTR; + this->buffers[index].size = 0; + this->buffers[index].owner = NULLPTR; + } + + const ArraySpan& dictionary() const { return child_data[0]; } + + /// \brief Return the number of buffers (out of 3) that are used to + /// constitute this array + int num_buffers() const; + + // Access a buffer's data as a typed C pointer + template + inline T* GetValues(int i, int64_t absolute_offset) { + return reinterpret_cast(buffers[i].data) + absolute_offset; + } + + template + inline T* GetValues(int i) { + return GetValues(i, this->offset); + } + + // Access a buffer's data as a typed C pointer + template + inline const T* GetValues(int i, int64_t absolute_offset) const { + return reinterpret_cast(buffers[i].data) + absolute_offset; + } + + template + inline const T* GetValues(int i) const { + return GetValues(i, this->offset); + } + + bool IsNull(int64_t i) const { + return ((this->buffers[0].data != NULLPTR) + ? !bit_util::GetBit(this->buffers[0].data, i + this->offset) + : this->null_count == this->length); + } + + bool IsValid(int64_t i) const { + return ((this->buffers[0].data != NULLPTR) + ? bit_util::GetBit(this->buffers[0].data, i + this->offset) + : this->null_count != this->length); + } + + std::shared_ptr ToArrayData() const; + + std::shared_ptr ToArray() const; + + std::shared_ptr GetBuffer(int index) const { + if (this->buffers[index].owner == NULLPTR) { + return NULLPTR; + } else { + return *this->buffers[index].owner; + } + } + + void AddOffset(int64_t offset) { + this->offset += offset; + this->null_count = kUnknownNullCount; + } + + void SetOffset(int64_t offset) { + this->offset = offset; + this->null_count = kUnknownNullCount; + } + + /// \brief Return null count, or compute and set it if it's not known + int64_t GetNullCount() const; + + bool MayHaveNulls() const { + // If an ArrayData is slightly malformed it may have kUnknownNullCount set + // but no buffer + return null_count != 0 && buffers[0].data != NULLPTR; + } +}; + namespace internal { /// Construct a zero-copy view of this ArrayData with the given type. 
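As a quick orientation on the ArraySpan/BufferRef members declared above, here is a hypothetical caller-side sketch (not part of this patch, and illustrative only since ArraySpan is documented above as experimental and intended for compute kernels): wrap an existing ArrayData in a non-owning ArraySpan, read typed values through GetValues, and only materialize an owning ArrayData again via ToArrayData when needed.

#include "arrow/api.h"

// Illustrative only: exercises the ArraySpan members declared in data.h above.
arrow::Status UseArraySpan() {
  arrow::Int32Builder builder;
  ARROW_RETURN_NOT_OK(builder.AppendValues({1, 2, 3}));
  std::shared_ptr<arrow::Array> arr;
  ARROW_RETURN_NOT_OK(builder.Finish(&arr));

  // Non-owning view: no shared_ptr copies; buffers are referenced through BufferRef
  arrow::ArraySpan span(*arr->data());

  // For primitive types buffer 1 holds the values; GetValues applies span.offset
  const int32_t* values = span.GetValues<int32_t>(1);
  const bool first_is_valid = span.IsValid(0);
  (void)values;
  (void)first_is_valid;

  // Re-acquire the owning buffers when an ArrayData/Array is actually required
  std::shared_ptr<arrow::ArrayData> owned = span.ToArrayData();
  (void)owned;
  return arrow::Status::OK();
}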
diff --git a/cpp/src/arrow/array/util.cc b/cpp/src/arrow/array/util.cc index 413182de0df..e5b4ab39493 100644 --- a/cpp/src/arrow/array/util.cc +++ b/cpp/src/arrow/array/util.cc @@ -432,7 +432,9 @@ class NullArrayFactory { RETURN_NOT_OK(CreateBuffer()); } std::vector> child_data(type_->num_fields()); - out_ = ArrayData::Make(type_, length_, {buffer_}, child_data, length_, 0); + out_ = ArrayData::Make(type_, length_, + {SliceBuffer(buffer_, 0, bit_util::BytesForBits(length_))}, + child_data, length_, 0); RETURN_NOT_OK(VisitTypeInline(*type_, this)); return out_; } diff --git a/cpp/src/arrow/builder.cc b/cpp/src/arrow/builder.cc index f3d12dbece9..7c027c9b1e8 100644 --- a/cpp/src/arrow/builder.cc +++ b/cpp/src/arrow/builder.cc @@ -119,7 +119,7 @@ class ARROW_EXPORT TypeErasedIntBuilder : public ArrayBuilder { Status AppendScalars(const ScalarVector& scalars) override { return builder_->AppendScalars(scalars); } - Status AppendArraySlice(const ArrayData& array, int64_t offset, + Status AppendArraySlice(const ArraySpan& array, int64_t offset, int64_t length) override { return builder_->AppendArraySlice(array, offset, length); } diff --git a/cpp/src/arrow/chunked_array.h b/cpp/src/arrow/chunked_array.h index 956595b117b..6ec7d11ac83 100644 --- a/cpp/src/arrow/chunked_array.h +++ b/cpp/src/arrow/chunked_array.h @@ -111,7 +111,7 @@ class ARROW_EXPORT ChunkedArray { int num_chunks() const { return static_cast(chunks_.size()); } /// \return chunk a particular chunk from the chunked array - std::shared_ptr chunk(int i) const { return chunks_[i]; } + const std::shared_ptr& chunk(int i) const { return chunks_[i]; } /// \return an ArrayVector of chunks const ArrayVector& chunks() const { return chunks_; } diff --git a/cpp/src/arrow/compute/cast.cc b/cpp/src/arrow/compute/cast.cc index bd49041b4f3..0acab32e447 100644 --- a/cpp/src/arrow/compute/cast.cc +++ b/cpp/src/arrow/compute/cast.cc @@ -165,7 +165,7 @@ Status CastFunction::AddKernel(Type::type in_type_id, ScalarKernel kernel) { } Status CastFunction::AddKernel(Type::type in_type_id, std::vector in_types, - OutputType out_type, ArrayKernelExec exec, + OutputType out_type, ScalarKernel::ExecFunc exec, NullHandling::type null_handling, MemAllocation::type mem_allocation) { ScalarKernel kernel; diff --git a/cpp/src/arrow/compute/cast.h b/cpp/src/arrow/compute/cast.h index e9c3cf55da9..a27dafe97c3 100644 --- a/cpp/src/arrow/compute/cast.h +++ b/cpp/src/arrow/compute/cast.h @@ -84,7 +84,7 @@ class CastFunction : public ScalarFunction { const std::vector& in_type_ids() const { return in_type_ids_; } Status AddKernel(Type::type in_type_id, std::vector in_types, - OutputType out_type, ArrayKernelExec exec, + OutputType out_type, ScalarKernel::ExecFunc exec, NullHandling::type = NullHandling::INTERSECTION, MemAllocation::type = MemAllocation::PREALLOCATE); diff --git a/cpp/src/arrow/compute/exec.cc b/cpp/src/arrow/compute/exec.cc index 186a3cdf3c1..04bb29e9bfa 100644 --- a/cpp/src/arrow/compute/exec.cc +++ b/cpp/src/arrow/compute/exec.cc @@ -229,18 +229,45 @@ Status CheckAllValues(const std::vector& values) { return Status::OK(); } -ExecBatchIterator::ExecBatchIterator(std::vector args, int64_t length, +Status GetBatchLength(const std::vector& values, int64_t* out_length) { + for (const auto& arg : values) { + if (!(arg.is_arraylike() || arg.is_scalar())) { + return Status::Invalid( + "Batch iteration only works with Scalar, Array, and " + "ChunkedArray arguments"); + } + } + + // If the arguments are all scalars, then the length is 1 + int64_t length = 1; + 
+ bool length_set = false; + for (auto& arg : values) { + if (arg.is_scalar()) { + continue; + } + if (!length_set) { + length = arg.length(); + length_set = true; + } else { + if (arg.length() != length) { + return Status::Invalid("Array arguments must all be the same length"); + } + } + } + *out_length = length; + return Status::OK(); +} + +ExecBatchIterator::ExecBatchIterator(const std::vector& args, int64_t length, int64_t max_chunksize) - : args_(std::move(args)), - position_(0), - length_(length), - max_chunksize_(max_chunksize) { + : args_(args), position_(0), length_(length), max_chunksize_(max_chunksize) { chunk_indexes_.resize(args_.size(), 0); chunk_positions_.resize(args_.size(), 0); } Result> ExecBatchIterator::Make( - std::vector args, int64_t max_chunksize) { + const std::vector& args, int64_t max_chunksize) { for (const auto& arg : args) { if (!(arg.is_arraylike() || arg.is_scalar())) { return Status::Invalid( @@ -270,7 +297,7 @@ Result> ExecBatchIterator::Make( max_chunksize = std::min(length, max_chunksize); return std::unique_ptr( - new ExecBatchIterator(std::move(args), length, max_chunksize)); + new ExecBatchIterator(args, length, max_chunksize)); } bool ExecBatchIterator::Next(ExecBatch* batch) { @@ -325,35 +352,158 @@ bool ExecBatchIterator::Next(ExecBatch* batch) { return true; } +// ---------------------------------------------------------------------- +// ExecSpanIterator; to eventually replace ExecBatchIterator + +ExecSpanIterator::ExecSpanIterator(const std::vector& args, int64_t length, + int64_t max_chunksize) + : args_(args), position_(0), length_(length), max_chunksize_(max_chunksize) { + chunk_indexes_.resize(args_.size(), 0); + value_positions_.resize(args_.size(), 0); + value_offsets_.resize(args_.size(), 0); +} + +Result> ExecSpanIterator::Make( + const std::vector& args, int64_t max_chunksize) { + int64_t length = 1; + RETURN_NOT_OK(GetBatchLength(args, &length)); + max_chunksize = std::min(length, max_chunksize); + return std::unique_ptr( + new ExecSpanIterator(args, length, max_chunksize)); +} + +int64_t ExecSpanIterator::GetNextChunkSpan(int64_t iteration_size, ExecSpan* span) { + for (size_t i = 0; i < args_.size() && iteration_size > 0; ++i) { + // If the argument is not a chunked array, it's either a Scalar or Array, + // in which case it doesn't influence the size of this span. Note that if + // the args are all scalars the span length is 1 + if (!args_[i].is_chunked_array()) { + continue; + } + const ChunkedArray* arg = args_[i].chunked_array().get(); + const Array* current_chunk; + while (true) { + current_chunk = arg->chunk(chunk_indexes_[i]).get(); + if (value_positions_[i] == current_chunk->length()) { + // Chunk is zero-length, or was exhausted in the previous + // iteration. 
Move to the next chunk
+        ++chunk_indexes_[i];
+        current_chunk = arg->chunk(chunk_indexes_[i]).get();
+        span->values[i].SetArray(*current_chunk->data());
+        value_positions_[i] = 0;
+        value_offsets_[i] = current_chunk->offset();
+        continue;
+      }
+      break;
+    }
+    iteration_size =
+        std::min(current_chunk->length() - value_positions_[i], iteration_size);
+  }
+  return iteration_size;
+}
+
+bool ExecSpanIterator::Next(ExecSpan* span) {
+  if (position_ == length_) {
+    // This also protects from degenerate cases like ChunkedArrays
+    // without any chunks
+    return false;
+  }
+
+  if (!initialized_) {
+    span->length = 0;
+
+    // The first time this is called, we populate the output span with
+    // any Scalar or Array arguments in the ExecValue struct, and then
+    // just increment array offsets below. If any arguments are
+    // ChunkedArray, then the internal ArraySpans will see their
+    // members updated during the iteration
+    span->values.resize(args_.size());
+    for (size_t i = 0; i < args_.size(); ++i) {
+      if (args_[i].is_scalar()) {
+        span->values[i].SetScalar(args_[i].scalar().get());
+      } else if (args_[i].is_array()) {
+        const ArrayData& arr = *args_[i].array();
+        span->values[i].SetArray(arr);
+        value_offsets_[i] = arr.offset;
+      } else {
+        // Populate members from the first chunk
+        const Array* first_chunk = args_[i].chunked_array()->chunk(0).get();
+        const ArrayData& arr = *first_chunk->data();
+        span->values[i].SetArray(arr);
+        value_offsets_[i] = arr.offset;
+        have_chunked_arrays_ = true;
+      }
+    }
+    initialized_ = true;
+  }
+
+  if (position_ == length_) {
+    return false;
+  }
+
+  // Determine how large the common contiguous "slice" of all the arguments is
+  int64_t iteration_size = std::min(length_ - position_, max_chunksize_);
+  if (have_chunked_arrays_) {
+    iteration_size = GetNextChunkSpan(iteration_size, span);
+  }
+
+  // Now, adjust the span
+  span->length = iteration_size;
+  for (size_t i = 0; i < args_.size(); ++i) {
+    const Datum& arg = args_[i];
+    if (!arg.is_scalar()) {
+      ArraySpan* arr = &span->values[i].array;
+      arr->length = iteration_size;
+      arr->SetOffset(value_positions_[i] + value_offsets_[i]);
+      value_positions_[i] += iteration_size;
+    }
+  }
+  position_ += iteration_size;
+  DCHECK_LE(position_, length_);
+  return true;
+}
+
 namespace {
 
 struct NullGeneralization {
   enum type { PERHAPS_NULL, ALL_VALID, ALL_NULL };
 
-  static type Get(const Datum& datum) {
-    const auto dtype_id = datum.type()->id();
+  static type Get(const ExecValue& value) {
+    const auto dtype_id = value.type()->id();
     if (dtype_id == Type::NA) {
       return ALL_NULL;
     }
     if (!arrow::internal::HasValidityBitmap(dtype_id)) {
       return ALL_VALID;
     }
-    if (datum.is_scalar()) {
-      return datum.scalar()->is_valid ? ALL_VALID : ALL_NULL;
-    }
-    if (datum.is_array()) {
-      const auto& arr = *datum.array();
+    if (value.is_scalar()) {
+      return value.scalar->is_valid ?
ALL_VALID : ALL_NULL; + } else { + const ArraySpan& arr = value.array; // Do not count the bits if they haven't been counted already - const int64_t known_null_count = arr.null_count.load(); - if ((known_null_count == 0) || (arr.buffers[0] == NULLPTR)) { + if ((arr.null_count == 0) || (arr.buffers[0].data == nullptr)) { return ALL_VALID; } - if (known_null_count == arr.length) { + if (arr.null_count == arr.length) { return ALL_NULL; } } return PERHAPS_NULL; } + + static type Get(const Datum& datum) { + // Temporary workaround to help with ARROW-16756 + ExecValue value; + if (datum.is_array()) { + value.SetArray(*datum.array()); + } else if (datum.is_scalar()) { + value.SetScalar(datum.scalar().get()); + } else { + // TODO(wesm): ChunkedArray, I think + return PERHAPS_NULL; + } + return Get(value); + } }; // Null propagation implementation that deals both with preallocated bitmaps @@ -369,35 +519,29 @@ struct NullGeneralization { // * Otherwise, we allocate the bitmap and populate it class NullPropagator { public: - NullPropagator(KernelContext* ctx, const ExecBatch& batch, ArrayData* output) + NullPropagator(KernelContext* ctx, const ExecSpan& batch, ArrayData* output) : ctx_(ctx), batch_(batch), output_(output) { - for (const Datum& datum : batch_.values) { - auto null_generalization = NullGeneralization::Get(datum); - + for (const ExecValue& value : batch_.values) { + auto null_generalization = NullGeneralization::Get(value); if (null_generalization == NullGeneralization::ALL_NULL) { is_all_null_ = true; } - - if (null_generalization != NullGeneralization::ALL_VALID && - datum.kind() == Datum::ARRAY) { - arrays_with_nulls_.push_back(datum.array().get()); + if (null_generalization != NullGeneralization::ALL_VALID && value.is_array()) { + arrays_with_nulls_.push_back(&value.array); } } - if (output->buffers[0] != nullptr) { bitmap_preallocated_ = true; - SetBitmap(output_->buffers[0].get()); + bitmap_ = output_->buffers[0]->mutable_data(); } } - void SetBitmap(Buffer* bitmap) { bitmap_ = bitmap->mutable_data(); } - Status EnsureAllocated() { if (bitmap_preallocated_) { return Status::OK(); } ARROW_ASSIGN_OR_RAISE(output_->buffers[0], ctx_->AllocateBitmap(output_->length)); - SetBitmap(output_->buffers[0].get()); + bitmap_ = output_->buffers[0]->mutable_data(); return Status::OK(); } @@ -412,10 +556,10 @@ class NullPropagator { // Walk all the values with nulls instead of breaking on the first in case // we find a bitmap that can be reused in the non-preallocated case - for (const ArrayData* arr : arrays_with_nulls_) { - if (arr->null_count.load() == arr->length && arr->buffers[0] != nullptr) { + for (const ArraySpan* arr : arrays_with_nulls_) { + if (arr->null_count == arr->length && arr->buffers[0].owner != nullptr) { // Reuse this all null bitmap - output_->buffers[0] = arr->buffers[0]; + output_->buffers[0] = arr->GetBuffer(0); return Status::OK(); } } @@ -427,14 +571,14 @@ class NullPropagator { Status PropagateSingle() { // One array - const ArrayData& arr = *arrays_with_nulls_[0]; - const std::shared_ptr& arr_bitmap = arr.buffers[0]; + const ArraySpan& arr = *arrays_with_nulls_[0]; + const uint8_t* arr_bitmap = arr.buffers[0].data; // Reuse the null count if it's known - output_->null_count = arr.null_count.load(); + output_->null_count = arr.null_count; if (bitmap_preallocated_) { - CopyBitmap(arr_bitmap->data(), arr.offset, arr.length, bitmap_, output_->offset); + CopyBitmap(arr_bitmap, arr.offset, arr.length, bitmap_, output_->offset); return Status::OK(); } @@ -448,14 +592,13 @@ 
class NullPropagator { // the bitmap is not preallocated, and that precondition is asserted // higher in the call stack. if (arr.offset == 0) { - output_->buffers[0] = arr_bitmap; + output_->buffers[0] = arr.GetBuffer(0); } else if (arr.offset % 8 == 0) { - output_->buffers[0] = - SliceBuffer(arr_bitmap, arr.offset / 8, bit_util::BytesForBits(arr.length)); + output_->buffers[0] = SliceBuffer(arr.GetBuffer(0), arr.offset / 8, + bit_util::BytesForBits(arr.length)); } else { RETURN_NOT_OK(EnsureAllocated()); - CopyBitmap(arr_bitmap->data(), arr.offset, arr.length, bitmap_, - /*dst_offset=*/0); + CopyBitmap(arr_bitmap, arr.offset, arr.length, bitmap_, /*dst_offset=*/0); } return Status::OK(); } @@ -466,22 +609,22 @@ class NullPropagator { // Do not compute the intersection null count until it's needed RETURN_NOT_OK(EnsureAllocated()); - auto Accumulate = [&](const ArrayData& left, const ArrayData& right) { - DCHECK(left.buffers[0]); - DCHECK(right.buffers[0]); - BitmapAnd(left.buffers[0]->data(), left.offset, right.buffers[0]->data(), - right.offset, output_->length, output_->offset, - output_->buffers[0]->mutable_data()); + auto Accumulate = [&](const uint8_t* left_data, int64_t left_offset, + const uint8_t* right_data, int64_t right_offset) { + BitmapAnd(left_data, left_offset, right_data, right_offset, output_->length, + output_->offset, bitmap_); }; DCHECK_GT(arrays_with_nulls_.size(), 1); // Seed the output bitmap with the & of the first two bitmaps - Accumulate(*arrays_with_nulls_[0], *arrays_with_nulls_[1]); + Accumulate(arrays_with_nulls_[0]->buffers[0].data, arrays_with_nulls_[0]->offset, + arrays_with_nulls_[1]->buffers[0].data, arrays_with_nulls_[1]->offset); // Accumulate the rest for (size_t i = 2; i < arrays_with_nulls_.size(); ++i) { - Accumulate(*output_, *arrays_with_nulls_[i]); + Accumulate(bitmap_, output_->offset, arrays_with_nulls_[i]->buffers[0].data, + arrays_with_nulls_[i]->offset); } return Status::OK(); } @@ -527,8 +670,8 @@ class NullPropagator { private: KernelContext* ctx_; - const ExecBatch& batch_; - std::vector arrays_with_nulls_; + const ExecSpan& batch_; + std::vector arrays_with_nulls_; bool is_all_null_ = false; ArrayData* output_; uint8_t* bitmap_; @@ -573,13 +716,9 @@ class KernelExecutorImpl : public KernelExecutor { } protected: - // This is overridden by the VectorExecutor - virtual Status SetupArgIteration(const std::vector& args) { - ARROW_ASSIGN_OR_RAISE( - batch_iterator_, ExecBatchIterator::Make(args, exec_context()->exec_chunksize())); - return Status::OK(); - } - + // Prepare an output ArrayData to be written to. 
If + // Kernel::mem_allocation is not MemAllocation::PREALLOCATE, then no + // data buffers will be set Result> PrepareOutput(int64_t length) { auto out = std::make_shared(output_descr_.type, length); out->buffers.resize(output_num_buffers_); @@ -619,7 +758,6 @@ class KernelExecutorImpl : public KernelExecutor { KernelContext* kernel_ctx_; const KernelType* kernel_; - std::unique_ptr batch_iterator_; ValueDescr output_descr_; int output_num_buffers_; @@ -636,22 +774,35 @@ class KernelExecutorImpl : public KernelExecutor { class ScalarExecutor : public KernelExecutorImpl { public: Status Execute(const std::vector& args, ExecListener* listener) override { - RETURN_NOT_OK(PrepareExecute(args)); - ExecBatch batch; - while (batch_iterator_->Next(&batch)) { - RETURN_NOT_OK(ExecuteBatch(batch, listener)); + ARROW_ASSIGN_OR_RAISE(span_iterator_, + ExecSpanIterator::Make(args, exec_context()->exec_chunksize())); + + // TODO(wesm): remove if with ARROW-16757 + if (output_descr_.shape != ValueDescr::SCALAR) { + // If the executor is configured to produce a single large Array output for + // kernels supporting preallocation, then we do so up front and then + // iterate over slices of that large array. Otherwise, we preallocate prior + // to processing each span emitted from the ExecSpanIterator + RETURN_NOT_OK(SetupPreallocation(span_iterator_->length(), args)); } - if (preallocate_contiguous_) { - // If we preallocated one big chunk, since the kernel execution is - // completed, we can now emit it - RETURN_NOT_OK(listener->OnResult(std::move(preallocated_))); + + // ARROW-16756: Here we have to accommodate the distinct cases + // + // * Fully-preallocated contiguous output + // * Fully-preallocated, non-contiguous kernel output + // * Not-fully-preallocated kernel output: we pass an empty or + // partially-filled ArrayData to the kernel + if (preallocating_all_buffers_) { + return ExecuteSpans(listener); + } else { + return ExecuteNonSpans(listener); } - return Status::OK(); } Datum WrapResults(const std::vector& inputs, const std::vector& outputs) override { if (output_descr_.shape == ValueDescr::SCALAR) { + // TODO(wesm): to remove, see ARROW-16757 DCHECK_EQ(outputs.size(), 1); // Return as SCALAR return outputs[0]; @@ -674,101 +825,112 @@ class ScalarExecutor : public KernelExecutorImpl { } protected: - Status ExecuteBatch(const ExecBatch& batch, ExecListener* listener) { - Datum out; - RETURN_NOT_OK(PrepareNextOutput(batch, &out)); - - if (output_descr_.shape == ValueDescr::ARRAY) { - ArrayData* out_arr = out.mutable_array(); - if (output_descr_.type->id() == Type::NA) { - out_arr->null_count = out_arr->length; - } else if (kernel_->null_handling == NullHandling::INTERSECTION) { - RETURN_NOT_OK(PropagateNulls(kernel_ctx_, batch, out_arr)); - } else if (kernel_->null_handling == NullHandling::OUTPUT_NOT_NULL) { - out_arr->null_count = 0; - } - } else { - if (kernel_->null_handling == NullHandling::INTERSECTION) { - // set scalar validity - out.scalar()->is_valid = - std::all_of(batch.values.begin(), batch.values.end(), - [](const Datum& input) { return input.scalar()->is_valid; }); - } else if (kernel_->null_handling == NullHandling::OUTPUT_NOT_NULL) { - out.scalar()->is_valid = true; - } - } - - RETURN_NOT_OK(kernel_->exec(kernel_ctx_, batch, &out)); + Status ExecuteSpans(ExecListener* listener) { + // We put the preallocation in an ArraySpan to be passed to the + // kernel which is expecting to receive that. More + // performance-critical code (e.g. 
expression evaluation) should + // eventually skip the creation of ArrayData altogether + std::shared_ptr preallocation; + ExecSpan input; + ExecResult output; + ArraySpan* output_span = output.array_span(); if (preallocate_contiguous_) { - // Some kernels may like to simply nullify the validity bitmap when - // they know the output will have 0 nulls. However, this is not compatible - // with writing into slices. - if (output_descr_.shape == ValueDescr::ARRAY) { - DCHECK(out.array()->buffers[0]) - << "Null bitmap deleted by kernel but can_write_into_slices = true"; + // Make one big output allocation + ARROW_ASSIGN_OR_RAISE(preallocation, PrepareOutput(span_iterator_->length())); + + // Populate and then reuse the ArraySpan inside + output_span->SetMembers(*preallocation); + output_span->offset = 0; + while (span_iterator_->Next(&input)) { + // Set absolute output span position and length + output_span->length = input.length; + RETURN_NOT_OK(ExecuteSingleSpan(input, &output)); + output_span->SetOffset(span_iterator_->position()); } + + // Kernel execution is complete; emit result + RETURN_NOT_OK(listener->OnResult(std::move(preallocation))); } else { - // If we are producing chunked output rather than one big array, then - // emit each chunk as soon as it's available - RETURN_NOT_OK(listener->OnResult(std::move(out))); + // Fully preallocating, but not contiguously + // We preallocate (maybe) only for the output of processing the current + // chunk + while (span_iterator_->Next(&input)) { + ARROW_ASSIGN_OR_RAISE(preallocation, PrepareOutput(input.length)); + output_span->SetMembers(*preallocation); + RETURN_NOT_OK(ExecuteSingleSpan(input, &output)); + // Emit the result for this chunk + RETURN_NOT_OK(listener->OnResult(std::move(preallocation))); + } } return Status::OK(); } - Status PrepareExecute(const std::vector& args) { - RETURN_NOT_OK(this->SetupArgIteration(args)); - - if (output_descr_.shape == ValueDescr::ARRAY) { - // If the executor is configured to produce a single large Array output for - // kernels supporting preallocation, then we do so up front and then - // iterate over slices of that large array. Otherwise, we preallocate prior - // to processing each batch emitted from the ExecBatchIterator - RETURN_NOT_OK(SetupPreallocation(batch_iterator_->length(), args)); + Status ExecuteSingleSpan(const ExecSpan& input, ExecResult* out) { + ArraySpan* result_span = out->array_span(); + if (output_descr_.type->id() == Type::NA) { + result_span->null_count = result_span->length; + } else if (kernel_->null_handling == NullHandling::INTERSECTION) { + if (!elide_validity_bitmap_) { + PropagateNullsSpans(input, result_span); + } + } else if (kernel_->null_handling == NullHandling::OUTPUT_NOT_NULL) { + result_span->null_count = 0; } - return Status::OK(); + return kernel_->exec(kernel_ctx_, input, out); } - // We must accommodate two different modes of execution for preallocated - // execution - // - // * A single large ("contiguous") allocation that we populate with results - // on a chunkwise basis according to the ExecBatchIterator. This permits - // parallelization even if the objective is to obtain a single Array or - // ChunkedArray at the end - // * A standalone buffer preallocation for each chunk emitted from the - // ExecBatchIterator - // - // When data buffer preallocation is not possible (e.g. with BINARY / STRING - // outputs), then contiguous results are only possible if the input is - // contiguous. 
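The comment above describes the two preallocation modes the span-based rewrite has to keep supporting. As a rough caller-side sketch (hypothetical, not part of this patch; it assumes ExecContext's exec_chunksize/preallocate_contiguous knobs referenced above and the "add" scalar function, with an arbitrary chunk size):

#include "arrow/compute/api.h"

// Illustrative only: how contiguous vs. per-chunk preallocation surfaces to a caller.
arrow::Result<arrow::Datum> AddWithChunkedExecution(const arrow::Datum& lhs,
                                                    const arrow::Datum& rhs) {
  arrow::compute::ExecContext ctx;
  ctx.set_exec_chunksize(16 * 1024);     // execute the kernel over 16K-row slices
  ctx.set_preallocate_contiguous(true);  // one output allocation, written slice by slice
  // When contiguous preallocation applies, the wrapped result is a single Array;
  // otherwise each executed slice is emitted separately and wrapped as a ChunkedArray.
  return arrow::compute::CallFunction("add", {lhs, rhs}, &ctx);
}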
- - Status PrepareNextOutput(const ExecBatch& batch, Datum* out) { - if (output_descr_.shape == ValueDescr::ARRAY) { - if (preallocate_contiguous_) { - // The output is already fully preallocated - const int64_t batch_start_position = batch_iterator_->position() - batch.length; - - if (batch.length < batch_iterator_->length()) { - // If this is a partial execution, then we write into a slice of - // preallocated_ - out->value = preallocated_->Slice(batch_start_position, batch.length); - } else { - // Otherwise write directly into preallocated_. The main difference - // computationally (versus the Slice approach) is that the null_count - // may not need to be recomputed in the result - out->value = preallocated_; + Status ExecuteNonSpans(ExecListener* listener) { + // ARROW-16756: Kernel is going to allocate some memory and so + // for the time being we pass in an empty or partially-filled + // shared_ptr or shared_ptr to be populated + // by the kernel. + // + // We will eventually delete the Scalar output path per + // ARROW-16757. + ExecSpan input; + ExecResult output; + while (span_iterator_->Next(&input)) { + if (output_descr_.shape == ValueDescr::ARRAY) { + ARROW_ASSIGN_OR_RAISE(output.value, PrepareOutput(input.length)); + DCHECK(output.is_array_data()); + } else { + // For scalar outputs, we set a null scalar of the correct type to + // communicate the output type to the kernel if needed + // + // XXX: Is there some way to avoid this step? + // TODO: Remove this path in ARROW-16757 + output.value = MakeNullScalar(output_descr_.type); + } + + if (output_descr_.shape == ValueDescr::ARRAY) { + ArrayData* out_arr = output.array_data().get(); + if (output_descr_.type->id() == Type::NA) { + out_arr->null_count = out_arr->length; + } else if (kernel_->null_handling == NullHandling::INTERSECTION) { + RETURN_NOT_OK(PropagateNulls(kernel_ctx_, input, out_arr)); + } else if (kernel_->null_handling == NullHandling::OUTPUT_NOT_NULL) { + out_arr->null_count = 0; } } else { - // We preallocate (maybe) only for the output of processing the current - // batch - ARROW_ASSIGN_OR_RAISE(out->value, PrepareOutput(batch.length)); + // TODO(wesm): to remove, see ARROW-16757 + if (kernel_->null_handling == NullHandling::INTERSECTION) { + // set scalar validity + output.scalar()->is_valid = + std::all_of(input.values.begin(), input.values.end(), + [](const ExecValue& input) { return input.scalar->is_valid; }); + } else if (kernel_->null_handling == NullHandling::OUTPUT_NOT_NULL) { + output.scalar()->is_valid = true; + } + } + + RETURN_NOT_OK(kernel_->exec(kernel_ctx_, input, &output)); + + // Emit a result for each chunk + if (output_descr_.shape == ValueDescr::ARRAY) { + RETURN_NOT_OK(listener->OnResult(output.array_data())); + } else { + RETURN_NOT_OK(listener->OnResult(output.scalar())); } - } else { - // For scalar outputs, we set a null scalar of the correct type to - // communicate the output type to the kernel if needed - // - // XXX: Is there some way to avoid this step? 
- out->value = MakeNullScalar(output_descr_.type); } return Status::OK(); } @@ -780,23 +942,42 @@ class ScalarExecutor : public KernelExecutorImpl { // - Output Array is NullArray // - kernel_->null_handling is COMPUTED_NO_PREALLOCATE or OUTPUT_NOT_NULL validity_preallocated_ = false; + if (out_type_id != Type::NA) { if (kernel_->null_handling == NullHandling::COMPUTED_PREALLOCATE) { // Override the flag if kernel asks for pre-allocation validity_preallocated_ = true; } else if (kernel_->null_handling == NullHandling::INTERSECTION) { - bool are_all_inputs_valid = true; + elide_validity_bitmap_ = true; for (const auto& arg : args) { auto null_gen = NullGeneralization::Get(arg) == NullGeneralization::ALL_VALID; - are_all_inputs_valid = are_all_inputs_valid && null_gen; + + // If not all valid, this becomes false + elide_validity_bitmap_ = elide_validity_bitmap_ && null_gen; } - validity_preallocated_ = !are_all_inputs_valid; + validity_preallocated_ = !elide_validity_bitmap_; + } else if (kernel_->null_handling == NullHandling::OUTPUT_NOT_NULL) { + elide_validity_bitmap_ = true; } } if (kernel_->mem_allocation == MemAllocation::PREALLOCATE) { ComputeDataPreallocate(*output_descr_.type, &data_preallocated_); } + // Validity bitmap either preallocated or elided, and all data + // buffers allocated. This is basically only true for primitive + // types that are not dictionary-encoded + preallocating_all_buffers_ = + ((validity_preallocated_ || elide_validity_bitmap_) && + data_preallocated_.size() == static_cast(output_num_buffers_ - 1) && + !is_nested(out_type_id) && !is_dictionary(out_type_id)); + + // TODO(wesm): why was this check ever here? Fixed width binary + // can be 0-width but anything else? + DCHECK(std::all_of( + data_preallocated_.begin(), data_preallocated_.end(), + [](const BufferPreallocation& prealloc) { return prealloc.bit_width >= 0; })); + // Contiguous preallocation only possible on non-nested types if all // buffers are preallocated. Otherwise, we must go chunk-by-chunk. // @@ -804,26 +985,25 @@ class ScalarExecutor : public KernelExecutorImpl { // kernel's attributes. preallocate_contiguous_ = (exec_context()->preallocate_contiguous() && kernel_->can_write_into_slices && - validity_preallocated_ && !is_nested(out_type_id) && - !is_dictionary(out_type_id) && - data_preallocated_.size() == static_cast(output_num_buffers_ - 1) && - std::all_of(data_preallocated_.begin(), data_preallocated_.end(), - [](const BufferPreallocation& prealloc) { - return prealloc.bit_width >= 0; - })); - if (preallocate_contiguous_) { - ARROW_ASSIGN_OR_RAISE(preallocated_, PrepareOutput(total_length)); - } + preallocating_all_buffers_); return Status::OK(); } + // Used to account for the case where we do not preallocate a + // validity bitmap because the inputs are all non-null and we're + // using NullHandling::INTERSECTION to compute the validity bitmap + bool elide_validity_bitmap_ = false; + + // All memory is preallocated for output, contiguous and + // non-contiguous + bool preallocating_all_buffers_ = false; + // If true, and the kernel and output type supports preallocation (for both // the validity and data buffers), then we allocate one big array and then // iterate through it while executing the kernel in chunks bool preallocate_contiguous_ = false; - // For storing a contiguous preallocation per above. 
Unused otherwise - std::shared_ptr preallocated_; + std::unique_ptr span_iterator_; }; Status PackBatchNoChunks(const std::vector& args, ExecBatch* out) { @@ -888,7 +1068,7 @@ class VectorExecutor : public KernelExecutorImpl { if (kernel_->null_handling == NullHandling::INTERSECTION && output_descr_.shape == ValueDescr::ARRAY) { - RETURN_NOT_OK(PropagateNulls(kernel_ctx_, batch, out.mutable_array())); + RETURN_NOT_OK(PropagateNulls(kernel_ctx_, ExecSpan(batch), out.mutable_array())); } RETURN_NOT_OK(kernel_->exec(kernel_ctx_, batch, &out)); if (!kernel_->finalize) { @@ -913,16 +1093,11 @@ class VectorExecutor : public KernelExecutorImpl { return Status::OK(); } - Status SetupArgIteration(const std::vector& args) override { + Status PrepareExecute(const std::vector& args) { if (kernel_->can_execute_chunkwise) { ARROW_ASSIGN_OR_RAISE(batch_iterator_, ExecBatchIterator::Make( args, exec_context()->exec_chunksize())); } - return Status::OK(); - } - - Status PrepareExecute(const std::vector& args) { - RETURN_NOT_OK(this->SetupArgIteration(args)); output_num_buffers_ = static_cast(output_descr_.type->layout().buffers.size()); // Decide if we need to preallocate memory for this kernel @@ -935,6 +1110,7 @@ class VectorExecutor : public KernelExecutorImpl { return Status::OK(); } + std::unique_ptr batch_iterator_; std::vector results_; }; @@ -947,7 +1123,8 @@ class ScalarAggExecutor : public KernelExecutorImpl { } Status Execute(const std::vector& args, ExecListener* listener) override { - RETURN_NOT_OK(this->SetupArgIteration(args)); + ARROW_ASSIGN_OR_RAISE( + batch_iterator_, ExecBatchIterator::Make(args, exec_context()->exec_chunksize())); ExecBatch batch; while (batch_iterator_->Next(&batch)) { @@ -971,7 +1148,7 @@ class ScalarAggExecutor : public KernelExecutorImpl { private: Status Consume(const ExecBatch& batch) { - // FIXME(ARROW-11840) don't merge *any* aggegates for every batch + // FIXME(ARROW-11840) don't merge *any* aggregates for every batch ARROW_ASSIGN_OR_RAISE( auto batch_state, kernel_->init(kernel_ctx_, {kernel_, *input_descrs_, options_})); @@ -988,6 +1165,7 @@ class ScalarAggExecutor : public KernelExecutorImpl { return Status::OK(); } + std::unique_ptr batch_iterator_; const std::vector* input_descrs_; const FunctionOptions* options_; }; @@ -1004,7 +1182,7 @@ Result> MakeExecutor(ExecContext* ctx, } // namespace -Status PropagateNulls(KernelContext* ctx, const ExecBatch& batch, ArrayData* output) { +Status PropagateNulls(KernelContext* ctx, const ExecSpan& batch, ArrayData* output) { DCHECK_NE(nullptr, output); DCHECK_GT(output->buffers.size(), 0); @@ -1026,6 +1204,67 @@ Status PropagateNulls(KernelContext* ctx, const ExecBatch& batch, ArrayData* out return propagator.Execute(); } +void PropagateNullsSpans(const ExecSpan& batch, ArraySpan* out) { + if (out->type->id() == Type::NA) { + // Null output type is a no-op (rare when this would happen but we at least + // will test for it) + return; + } + + std::vector arrays_with_nulls; + bool is_all_null = false; + for (const ExecValue& value : batch.values) { + auto null_generalization = NullGeneralization::Get(value); + if (null_generalization == NullGeneralization::ALL_NULL) { + is_all_null = true; + } + if (null_generalization != NullGeneralization::ALL_VALID && value.is_array()) { + arrays_with_nulls.push_back(&value.array); + } + } + uint8_t* out_bitmap = out->buffers[0].data; + if (is_all_null) { + // An all-null value (scalar null or all-null array) gives us a short + // circuit opportunity + // OK, the output should be all 
null + out->null_count = out->length; + bit_util::SetBitsTo(out_bitmap, out->offset, out->length, false); + return; + } + + out->null_count = kUnknownNullCount; + if (arrays_with_nulls.empty()) { + // No arrays with nulls case + out->null_count = 0; + if (out_bitmap != nullptr) { + // An output buffer was allocated, so we fill it with all valid + bit_util::SetBitsTo(out_bitmap, out->offset, out->length, true); + } + } else if (arrays_with_nulls.size() == 1) { + // One array + const ArraySpan& arr = *arrays_with_nulls[0]; + + // Reuse the null count if it's known + out->null_count = arr.null_count; + CopyBitmap(arr.buffers[0].data, arr.offset, arr.length, out_bitmap, out->offset); + } else { + // More than one array. We use BitmapAnd to intersect their bitmaps + auto Accumulate = [&](const ArraySpan& left, const ArraySpan& right) { + DCHECK(left.buffers[0].data != nullptr); + DCHECK(right.buffers[0].data != nullptr); + BitmapAnd(left.buffers[0].data, left.offset, right.buffers[0].data, right.offset, + out->length, out->offset, out_bitmap); + }; + // Seed the output bitmap with the & of the first two bitmaps + Accumulate(*arrays_with_nulls[0], *arrays_with_nulls[1]); + + // Accumulate the rest + for (size_t i = 2; i < arrays_with_nulls.size(); ++i) { + Accumulate(*out, *arrays_with_nulls[i]); + } + } +} + std::unique_ptr KernelExecutor::MakeScalar() { return ::arrow::internal::make_unique(); } diff --git a/cpp/src/arrow/compute/exec.h b/cpp/src/arrow/compute/exec.h index 742c3794416..994254ffb70 100644 --- a/cpp/src/arrow/compute/exec.h +++ b/cpp/src/arrow/compute/exec.h @@ -20,6 +20,8 @@ #pragma once +#include +#include #include #include #include @@ -252,6 +254,192 @@ struct ARROW_EXPORT ExecBatch { inline bool operator==(const ExecBatch& l, const ExecBatch& r) { return l.Equals(r); } inline bool operator!=(const ExecBatch& l, const ExecBatch& r) { return !l.Equals(r); } +struct ExecValue { + enum Kind { ARRAY, SCALAR }; + Kind kind = ARRAY; + ArraySpan array; + const Scalar* scalar; + + ExecValue(Scalar* scalar) // NOLINT implicit conversion + : kind(SCALAR), scalar(scalar) {} + + ExecValue(ArraySpan array) // NOLINT implicit conversion + : kind(ARRAY), array(std::move(array)) {} + + ExecValue(const ArrayData& array) // NOLINT implicit conversion + : kind(ARRAY) { + this->array.SetMembers(array); + } + + ExecValue() = default; + ExecValue(const ExecValue& other) = default; + ExecValue& operator=(const ExecValue& other) = default; + ExecValue(ExecValue&& other) = default; + ExecValue& operator=(ExecValue&& other) = default; + + int64_t length() const { return this->is_array() ? this->array.length : 1; } + + bool is_array() const { return this->kind == ARRAY; } + bool is_scalar() const { return this->kind == SCALAR; } + + void SetArray(const ArrayData& array) { + this->kind = ARRAY; + this->array.SetMembers(array); + } + + void SetScalar(const Scalar* scalar) { + this->kind = SCALAR; + this->scalar = scalar; + } + + template + const ExactType& scalar_as() const { + return ::arrow::internal::checked_cast(*this->scalar); + } + + /// XXX: here only temporarily until type resolution can be cleaned + /// up to not use ValueDescr + ValueDescr descr() const { + ValueDescr::Shape shape = this->is_array() ? ValueDescr::ARRAY : ValueDescr::SCALAR; + return ValueDescr(const_cast(this->type())->shared_from_this(), shape); + } + + /// XXX: here temporarily for compatibility with datum, see + /// e.g. 
MakeStructExec in scalar_nested.cc + int64_t null_count() const { + if (this->is_array()) { + return this->array.GetNullCount(); + } else { + return this->scalar->is_valid ? 0 : 1; + } + } + + const DataType* type() const { + if (this->kind == ARRAY) { + return array.type; + } else { + return scalar->type.get(); + } + } +}; + +struct ARROW_EXPORT ExecResult { + // The default value of the variant is ArraySpan + // TODO(wesm): remove Scalar output modality in ARROW-16577 + util::Variant, std::shared_ptr> value; + + int64_t length() const { + if (this->is_array_span()) { + return this->array_span()->length; + } else if (this->is_array_data()) { + return this->array_data()->length; + } else { + // Should not reach here + return 1; + } + } + + const DataType* type() const { + switch (this->value.index()) { + case 0: + return this->array_span()->type; + case 1: + return this->array_data()->type.get(); + default: + // scalar + return this->scalar()->type.get(); + }; + } + + ArraySpan* array_span() const { + return const_cast(&util::get(this->value)); + } + bool is_array_span() const { return this->value.index() == 0; } + + const std::shared_ptr& array_data() const { + return util::get>(this->value); + } + + bool is_array_data() const { return this->value.index() == 1; } + + const std::shared_ptr& scalar() const { + return util::get>(this->value); + } + + bool is_scalar() const { return this->value.index() == 2; } +}; + +/// \brief A "lightweight" column batch object which contains no +/// std::shared_ptr objects and does not have any memory ownership +/// semantics. Can represent a view onto an "owning" ExecBatch. +struct ARROW_EXPORT ExecSpan { + ExecSpan() = default; + ExecSpan(const ExecSpan& other) = default; + ExecSpan& operator=(const ExecSpan& other) = default; + ExecSpan(ExecSpan&& other) = default; + ExecSpan& operator=(ExecSpan&& other) = default; + + explicit ExecSpan(std::vector values, int64_t length) + : length(length), values(std::move(values)) {} + + explicit ExecSpan(const ExecBatch& batch) { + this->length = batch.length; + this->values.resize(batch.values.size()); + for (size_t i = 0; i < batch.values.size(); ++i) { + const Datum& in_value = batch[i]; + ExecValue* out_value = &this->values[i]; + if (in_value.is_array()) { + out_value->SetArray(*in_value.array()); + } else { + out_value->SetScalar(in_value.scalar().get()); + } + } + } + + bool is_all_scalar() const { + return std::all_of(this->values.begin(), this->values.end(), + [](const ExecValue& v) { return v.is_scalar(); }); + } + + /// \brief Return the value at the i-th index + template + inline const ExecValue& operator[](index_type i) const { + return values[i]; + } + + void AddOffset(int64_t offset) { + for (ExecValue& value : values) { + if (value.kind == ExecValue::ARRAY) { + value.array.AddOffset(offset); + } + } + } + + void SetOffset(int64_t offset) { + for (ExecValue& value : values) { + if (value.kind == ExecValue::ARRAY) { + value.array.SetOffset(offset); + } + } + } + + /// \brief A convenience for the number of values / arguments. 
+ int num_values() const { return static_cast(values.size()); } + + // XXX: eliminate the need for ValueDescr; copied temporarily from + // ExecBatch + std::vector GetDescriptors() const { + std::vector result; + for (const auto& value : this->values) { + result.emplace_back(value.descr()); + } + return result; + } + + int64_t length; + std::vector values; +}; + /// \defgroup compute-call-function One-shot calls to compute functions /// /// @{ diff --git a/cpp/src/arrow/compute/exec_internal.h b/cpp/src/arrow/compute/exec_internal.h index 74124f02267..1219c39a2df 100644 --- a/cpp/src/arrow/compute/exec_internal.h +++ b/cpp/src/arrow/compute/exec_internal.h @@ -39,8 +39,9 @@ static constexpr int64_t kDefaultMaxChunksize = std::numeric_limits::ma namespace detail { -/// \brief Break std::vector into a sequence of ExecBatch for kernel -/// execution +/// \brief Break std::vector into a sequence of ExecBatch for +/// kernel execution. The lifetime of the Datum vector must be longer +/// than the lifetime of this object class ARROW_EXPORT ExecBatchIterator { public: /// \brief Construct iterator and do basic argument validation @@ -49,7 +50,7 @@ class ARROW_EXPORT ExecBatchIterator { /// \param[in] max_chunksize the maximum length of each ExecBatch. Depending /// on the chunk layout of ChunkedArray. static Result> Make( - std::vector args, int64_t max_chunksize = kDefaultMaxChunksize); + const std::vector& args, int64_t max_chunksize = kDefaultMaxChunksize); /// \brief Compute the next batch. Always returns at least one batch. Return /// false if the iterator is exhausted @@ -62,9 +63,10 @@ class ARROW_EXPORT ExecBatchIterator { int64_t max_chunksize() const { return max_chunksize_; } private: - ExecBatchIterator(std::vector args, int64_t length, int64_t max_chunksize); + ExecBatchIterator(const std::vector& args, int64_t length, + int64_t max_chunksize); - std::vector args_; + const std::vector& args_; std::vector chunk_indexes_; std::vector chunk_positions_; int64_t position_; @@ -72,6 +74,54 @@ class ARROW_EXPORT ExecBatchIterator { int64_t max_chunksize_; }; +/// \brief Break std::vector into a sequence of non-owning +/// ExecSpan for kernel execution. The lifetime of the Datum vector +/// must be longer than the lifetime of this object +class ARROW_EXPORT ExecSpanIterator { + public: + /// \brief Construct iterator and do basic argument validation + /// + /// \param[in] args the Datum argument, must be all array-like or scalar + /// \param[in] max_chunksize the maximum length of each ExecSpan. Depending + /// on the chunk layout of ChunkedArray. + static Result> Make( + const std::vector& args, int64_t max_chunksize = kDefaultMaxChunksize); + + /// \brief Compute the next span by updating the state of the + /// previous span object. You must keep passing in the previous + /// value for the results to be consistent. If you need to process + /// in parallel, make a copy of the in-use ExecSpan while it's being + /// used by another thread and pass it into Next. This function + /// always populates at least one span. If you call this function + /// with a blank ExecSpan after the first iteration, it will not + /// work correctly (maybe we will change this later). 
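// Usage sketch (illustration only) of the iterator declared here: Next()
// updates the same ExecSpan in place, so the caller keeps passing the same
// span object back in, as the comment above explains. The helper name is
// invented; it assumes the arrow and arrow::compute::detail namespaces from
// this header are in scope.
inline Status CountRows(const std::vector<Datum>& args, int64_t* total_rows) {
  ARROW_ASSIGN_OR_RAISE(std::unique_ptr<ExecSpanIterator> it,
                        ExecSpanIterator::Make(args));
  ExecSpan span;
  *total_rows = 0;
  while (it->Next(&span)) {
    // span is a non-owning view: arrays are ArraySpans, scalars are raw
    // const Scalar* borrowed from the input Datums.
    *total_rows += span.length;
  }
  return Status::OK();
}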
Return false + /// if the iteration is exhausted + bool Next(ExecSpan* span); + + int64_t length() const { return length_; } + int64_t position() const { return position_; } + + private: + ExecSpanIterator(const std::vector& args, int64_t length, int64_t max_chunksize); + + int64_t GetNextChunkSpan(int64_t iteration_size, ExecSpan* span); + + bool initialized_ = false; + bool have_chunked_arrays_ = false; + const std::vector& args_; + std::vector chunk_indexes_; + std::vector value_positions_; + + // Keep track of the array offset in the "active" array (e.g. the + // array or the particular chunk of an array) in each slot, separate + // from the relative position within each chunk (which is in + // value_positions_) + std::vector value_offsets_; + int64_t position_; + int64_t length_; + int64_t max_chunksize_; +}; + // "Push" / listener API like IPC reader so that consumers can receive // processed chunks as soon as they're available. @@ -138,7 +188,10 @@ class ARROW_EXPORT KernelExecutor { /// \param[in] batch the data batch /// \param[in] out the output ArrayData, must not be null ARROW_EXPORT -Status PropagateNulls(KernelContext* ctx, const ExecBatch& batch, ArrayData* out); +Status PropagateNulls(KernelContext* ctx, const ExecSpan& batch, ArrayData* out); + +ARROW_EXPORT +void PropagateNullsSpans(const ExecSpan& batch, ArraySpan* out); } // namespace detail } // namespace compute diff --git a/cpp/src/arrow/compute/exec_test.cc b/cpp/src/arrow/compute/exec_test.cc index b00bcb319c3..c74d24d6ecf 100644 --- a/cpp/src/arrow/compute/exec_test.cc +++ b/cpp/src/arrow/compute/exec_test.cc @@ -90,13 +90,20 @@ TEST(SelectionVector, Basics) { ASSERT_EQ(3, sel_vector->indices()[1]); } +void AssertValidityZeroExtraBits(const uint8_t* data, int64_t length, int64_t offset) { + const int64_t bit_extent = ((offset + length + 7) / 8) * 8; + for (int64_t i = offset + length; i < bit_extent; ++i) { + EXPECT_FALSE(bit_util::GetBit(data, i)) << i; + } +} + +void AssertValidityZeroExtraBits(const ArraySpan& arr) { + return AssertValidityZeroExtraBits(arr.buffers[0].data, arr.length, arr.offset); +} + void AssertValidityZeroExtraBits(const ArrayData& arr) { const Buffer& buf = *arr.buffers[0]; - - const int64_t bit_extent = ((arr.offset + arr.length + 7) / 8) * 8; - for (int64_t i = arr.offset + arr.length; i < bit_extent; ++i) { - EXPECT_FALSE(bit_util::GetBit(buf.data(), i)) << i; - } + return AssertValidityZeroExtraBits(buf.data(), arr.length, arr.offset); } class TestComputeInternals : public ::testing::Test { @@ -137,6 +144,9 @@ class TestComputeInternals : public ::testing::Test { std::unique_ptr rng_; }; +// ---------------------------------------------------------------------- +// Test PropagateNulls + class TestPropagateNulls : public TestComputeInternals {}; TEST_F(TestPropagateNulls, UnknownNullCountWithNullsZeroCopies) { @@ -149,7 +159,7 @@ TEST_F(TestPropagateNulls, UnknownNullCountWithNullsZeroCopies) { ArrayData input(boolean(), length, {nulls, nullptr}, kUnknownNullCount); ExecBatch batch({input}, length); - ASSERT_OK(PropagateNulls(ctx_.get(), batch, &output)); + ASSERT_OK(PropagateNulls(ctx_.get(), ExecSpan(batch), &output)); ASSERT_EQ(nulls.get(), output.buffers[0].get()); ASSERT_EQ(kUnknownNullCount, output.null_count); ASSERT_EQ(9, output.GetNullCount()); @@ -164,7 +174,7 @@ TEST_F(TestPropagateNulls, UnknownNullCountWithoutNulls) { ArrayData input(boolean(), length, {nulls, nullptr}, kUnknownNullCount); ExecBatch batch({input}, length); - ASSERT_OK(PropagateNulls(ctx_.get(), batch, 
&output)); + ASSERT_OK(PropagateNulls(ctx_.get(), ExecSpan(batch), &output)); EXPECT_EQ(-1, output.null_count); EXPECT_EQ(nulls.get(), output.buffers[0].get()); } @@ -185,7 +195,7 @@ TEST_F(TestPropagateNulls, SetAllNulls) { ArrayData output(boolean(), length, buffers); ExecBatch batch(values, length); - ASSERT_OK(PropagateNulls(ctx_.get(), batch, &output)); + ASSERT_OK(PropagateNulls(ctx_.get(), ExecSpan(batch), &output)); if (preallocate) { // Ensure that buffer object the same when we pass in preallocated memory @@ -228,7 +238,7 @@ TEST_F(TestPropagateNulls, SetAllNulls) { // null-scalar earlier in the batch ArrayData output(boolean(), length, {nullptr, nullptr}); ExecBatch batch({MakeNullScalar(boolean()), arr_all_nulls}, length); - ASSERT_OK(PropagateNulls(ctx_.get(), batch, &output)); + ASSERT_OK(PropagateNulls(ctx_.get(), ExecSpan(batch), &output)); ASSERT_EQ(arr_all_nulls->data()->buffers[0].get(), output.buffers[0].get()); } } @@ -260,7 +270,7 @@ TEST_F(TestPropagateNulls, SingleValueWithNulls) { ASSERT_EQ(0, output.offset); } - ASSERT_OK(PropagateNulls(ctx_.get(), batch, &output)); + ASSERT_OK(PropagateNulls(ctx_.get(), ExecSpan(batch), &output)); if (!preallocate) { const Buffer* parent_buf = arr->data()->buffers[0].get(); @@ -308,14 +318,14 @@ TEST_F(TestPropagateNulls, ZeroCopyWhenZeroNullsOnOneInput) { ArrayData output(boolean(), length, {nullptr, nullptr}); ExecBatch batch({some_nulls, no_nulls}, length); - ASSERT_OK(PropagateNulls(ctx_.get(), batch, &output)); + ASSERT_OK(PropagateNulls(ctx_.get(), ExecSpan(batch), &output)); ASSERT_EQ(nulls.get(), output.buffers[0].get()); ASSERT_EQ(9, output.null_count); // Flip order of args output = ArrayData(boolean(), length, {nullptr, nullptr}); batch.values = {no_nulls, no_nulls, some_nulls}; - ASSERT_OK(PropagateNulls(ctx_.get(), batch, &output)); + ASSERT_OK(PropagateNulls(ctx_.get(), ExecSpan(batch), &output)); ASSERT_EQ(nulls.get(), output.buffers[0].get()); ASSERT_EQ(9, output.null_count); @@ -324,7 +334,7 @@ TEST_F(TestPropagateNulls, ZeroCopyWhenZeroNullsOnOneInput) { auto preallocated_mem = std::make_shared(bitmap_data, 2); output.null_count = kUnknownNullCount; output.buffers[0] = preallocated_mem; - ASSERT_OK(PropagateNulls(ctx_.get(), batch, &output)); + ASSERT_OK(PropagateNulls(ctx_.get(), ExecSpan(batch), &output)); ASSERT_EQ(preallocated_mem.get(), output.buffers[0].get()); ASSERT_EQ(9, output.null_count); @@ -366,7 +376,7 @@ TEST_F(TestPropagateNulls, IntersectsNulls) { ArrayData output(boolean(), length, {nulls, nullptr}); output.offset = output_offset; - ASSERT_OK(PropagateNulls(ctx_.get(), batch, &output)); + ASSERT_OK(PropagateNulls(ctx_.get(), ExecSpan(batch), &output)); // Preallocated memory used if (preallocate) { @@ -405,19 +415,257 @@ TEST_F(TestPropagateNulls, NullOutputTypeNoop) { ExecBatch batch({rng_->Boolean(100, 0.5, 0.5)}, length); ArrayData output(null(), length, {nullptr}); - ASSERT_OK(PropagateNulls(ctx_.get(), batch, &output)); + ASSERT_OK(PropagateNulls(ctx_.get(), ExecSpan(batch), &output)); ASSERT_EQ(nullptr, output.buffers[0]); } +// ---------------------------------------------------------------------- +// Test PropagateNullsSpans (new span-based implementation). 
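// Minimal sketch (mirroring the tests below) of calling PropagateNullsSpans
// directly: the caller owns the output validity buffer, wraps it in a
// non-owning ArraySpan, and the function intersects the input bitmaps into
// it. The helper name is invented for illustration.
Result<int64_t> IntersectedNullCount(const ExecSpan& batch,
                                     const std::shared_ptr<DataType>& out_type) {
  ARROW_ASSIGN_OR_RAISE(std::shared_ptr<Buffer> validity,
                        AllocateEmptyBitmap(batch.length));
  ArraySpan out(out_type.get(), batch.length);
  out.SetBuffer(0, validity);  // ArraySpan does not take ownership
  PropagateNullsSpans(batch, &out);
  return out.GetNullCount();
}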
Some of +// the tests above had to be rewritten because the span-based +// implementation does not deal with zero-copy optimizations right now + +class TestPropagateNullsSpans : public TestComputeInternals {}; + +TEST_F(TestPropagateNullsSpans, UnknownNullCountWithNullsZeroCopies) { + const int64_t length = 16; + + const uint8_t validity_bitmap[8] = {254, 0, 0, 0, 0, 0, 0, 0}; + auto nulls = std::make_shared(validity_bitmap, 8); + auto ty = boolean(); + ArrayData input(ty, length, {nulls, nullptr}, kUnknownNullCount); + + uint8_t validity_bitmap2[8] = {0, 0, 0, 0, 0, 0, 0, 0}; + auto nulls2 = std::make_shared(validity_bitmap2, 8); + ArraySpan output(ty.get(), length); + output.buffers[0].data = validity_bitmap2; + output.buffers[0].size = 0; + + ExecSpan span(ExecBatch({input}, length)); + PropagateNullsSpans(span, &output); + ASSERT_EQ(kUnknownNullCount, output.null_count); + ASSERT_EQ(9, output.GetNullCount()); +} + +TEST_F(TestPropagateNullsSpans, UnknownNullCountWithoutNulls) { + const int64_t length = 16; + constexpr uint8_t validity_bitmap[8] = {255, 255, 0, 0, 0, 0, 0, 0}; + auto nulls = std::make_shared(validity_bitmap, 8); + + auto ty = boolean(); + ArrayData input(ty, length, {nulls, nullptr}, kUnknownNullCount); + + uint8_t validity_bitmap2[8] = {0, 0, 0, 0, 0, 0, 0, 0}; + auto nulls2 = std::make_shared(validity_bitmap2, 8); + ArraySpan output(ty.get(), length); + output.buffers[0].data = validity_bitmap2; + output.buffers[0].size = 0; + + ExecSpan span(ExecBatch({input}, length)); + PropagateNullsSpans(span, &output); + ASSERT_EQ(kUnknownNullCount, output.null_count); + ASSERT_EQ(0, output.GetNullCount()); +} + +TEST_F(TestPropagateNullsSpans, SetAllNulls) { + const int64_t length = 16; + + auto CheckSetAllNull = [&](std::vector values) { + // Make fresh bitmap with all 1's + uint8_t bitmap_data[2] = {255, 255}; + auto buf = std::make_shared(bitmap_data, 2); + + auto ty = boolean(); + ArraySpan output(ty.get(), length); + output.SetBuffer(0, buf); + + ExecSpan span(ExecBatch(values, length)); + PropagateNullsSpans(span, &output); + + uint8_t expected[2] = {0, 0}; + ASSERT_EQ(0, std::memcmp(output.buffers[0].data, expected, output.buffers[0].size)); + }; + + // There is a null scalar + std::shared_ptr i32_val = std::make_shared(3); + std::vector vals = {i32_val, MakeNullScalar(boolean())}; + CheckSetAllNull(vals); + + const double true_prob = 0.5; + vals[0] = rng_->Boolean(length, true_prob); + CheckSetAllNull(vals); + + auto arr_all_nulls = rng_->Boolean(length, true_prob, /*null_probability=*/1); + + // One value is all null + vals = {rng_->Boolean(length, true_prob, /*null_probability=*/0.5), arr_all_nulls}; + CheckSetAllNull(vals); + + // A value is NullType + std::shared_ptr null_arr = std::make_shared(length); + vals = {rng_->Boolean(length, true_prob), null_arr}; + CheckSetAllNull(vals); +} + +TEST_F(TestPropagateNullsSpans, SingleValueWithNulls) { + // Input offset is non-zero (0 mod 8 and nonzero mod 8 cases) + const int64_t length = 100; + auto arr = rng_->Boolean(length, 0.5, /*null_probability=*/0.5); + + auto CheckSliced = [&](int64_t offset, int64_t out_offset = 0) { + // Unaligned bitmap, zero copy not possible + auto sliced = arr->Slice(offset); + std::vector vals = {sliced}; + + auto ty = boolean(); + ArraySpan output(ty.get(), vals[0].length()); + output.offset = out_offset; + + std::shared_ptr preallocated_bitmap; + ASSERT_OK_AND_ASSIGN( + preallocated_bitmap, + AllocateBuffer(bit_util::BytesForBits(sliced->length() + out_offset))); + 
std::memset(preallocated_bitmap->mutable_data(), 0, preallocated_bitmap->size()); + output.SetBuffer(0, preallocated_bitmap); + + ExecBatch batch(vals, vals[0].length()); + PropagateNullsSpans(ExecSpan(batch), &output); + ASSERT_EQ(arr->Slice(offset)->null_count(), output.GetNullCount()); + ASSERT_TRUE(BitmapEquals(output.buffers[0].data, output.offset, + sliced->null_bitmap_data(), sliced->offset(), + output.length)); + AssertValidityZeroExtraBits(output); + }; + + CheckSliced(8); + CheckSliced(7); + CheckSliced(8, /*offset=*/4); + CheckSliced(7, /*offset=*/4); +} + +TEST_F(TestPropagateNullsSpans, CasesThatUsedToBeZeroCopy) { + // ARROW-16576: testing behaviors that used to be zero copy but are + // not anymore + const int64_t length = 16; + + auto ty = boolean(); + constexpr uint8_t validity_bitmap[8] = {254, 0, 0, 0, 0, 0, 0, 0}; + auto nulls = std::make_shared(validity_bitmap, 8); + + ArraySpan some_nulls(ty.get(), length); + some_nulls.SetBuffer(0, nulls); + some_nulls.null_count = 9; + + ArraySpan no_nulls(ty.get(), length); + no_nulls.null_count = 0; + + { + uint8_t bitmap_data[2] = {0, 0}; + auto preallocated_mem = std::make_shared(bitmap_data, 2); + + ArraySpan output(ty.get(), length); + output.SetBuffer(0, preallocated_mem); + PropagateNullsSpans(ExecSpan({some_nulls, no_nulls}, length), &output); + ASSERT_EQ( + 0, std::memcmp(output.buffers[0].data, validity_bitmap, output.buffers[0].size)); + ASSERT_EQ(output.buffers[0].owner, &preallocated_mem); + ASSERT_EQ(9, output.GetNullCount()); + } + + // Flip order of args + { + uint8_t bitmap_data[2] = {0, 0}; + auto preallocated_mem = std::make_shared(bitmap_data, 2); + + ArraySpan output(ty.get(), length); + output.SetBuffer(0, preallocated_mem); + PropagateNullsSpans(ExecSpan({no_nulls, no_nulls, some_nulls}, length), &output); + ASSERT_EQ( + 0, std::memcmp(output.buffers[0].data, validity_bitmap, output.buffers[0].size)); + ASSERT_EQ(output.buffers[0].owner, &preallocated_mem); + ASSERT_EQ(9, output.GetNullCount()); + } +} + +TEST_F(TestPropagateNullsSpans, IntersectsNulls) { + const int64_t length = 16; + + // 0b01111111 0b11001111 + constexpr uint8_t bitmap1[8] = {127, 207, 0, 0, 0, 0, 0, 0}; + auto buffer1 = std::make_shared(bitmap1, 8); + + // 0b11111110 0b01111111 + constexpr uint8_t bitmap2[8] = {254, 127, 0, 0, 0, 0, 0, 0}; + auto buffer2 = std::make_shared(bitmap2, 8); + + // 0b11101111 0b11111110 + constexpr uint8_t bitmap3[8] = {239, 254, 0, 0, 0, 0, 0, 0}; + auto buffer3 = std::make_shared(bitmap3, 8); + + auto ty = boolean(); + + ArraySpan arr1(ty.get(), length); + arr1.SetBuffer(0, buffer1); + + ArraySpan arr2(ty.get(), length); + arr2.SetBuffer(0, buffer2); + + ArraySpan arr3(ty.get(), length); + arr3.SetBuffer(0, buffer3); + + auto CheckCase = [&](std::vector values, int64_t ex_null_count, + const uint8_t* ex_bitmap, int64_t output_offset = 0) { + ExecSpan batch(values, length); + + std::shared_ptr nulls; + // Make the buffer one byte bigger so we can have non-zero offsets + ASSERT_OK_AND_ASSIGN(nulls, AllocateBuffer(3)); + std::memset(nulls->mutable_data(), 0, nulls->size()); + + ArraySpan output(ty.get(), length); + output.SetBuffer(0, nulls); + output.offset = output_offset; + + PropagateNullsSpans(batch, &output); + ASSERT_EQ(&nulls, output.buffers[0].owner); + EXPECT_EQ(kUnknownNullCount, output.null_count); + EXPECT_EQ(ex_null_count, output.GetNullCount()); + ASSERT_TRUE(BitmapEquals(output.buffers[0].data, output_offset, ex_bitmap, + /*ex_offset=*/0, length)); + + // Now check that the rest of the bits in 
out_buffer are still 0 + AssertValidityZeroExtraBits(output); + }; + + // 0b01101110 0b01001110 + uint8_t expected1[2] = {110, 78}; + CheckCase({arr1, arr2, arr3}, 7, expected1); + CheckCase({arr1, arr2, arr3}, 7, expected1, /*output_offset=*/4); + + // 0b01111110 0b01001111 + uint8_t expected2[2] = {126, 79}; + CheckCase({arr1, arr2}, 5, expected2, /*output_offset=*/4); +} + +TEST_F(TestPropagateNullsSpans, NullOutputTypeNoop) { + // Ensure we leave the buffers alone when the output type is null() + // TODO(wesm): is this test useful? Can probably delete + const int64_t length = 100; + ExecBatch batch({rng_->Boolean(100, 0.5, 0.5)}, length); + + auto ty = null(); + ArraySpan result(ty.get(), length); + PropagateNullsSpans(ExecSpan(batch), &result); + ASSERT_EQ(nullptr, result.buffers[0].data); +} + // ---------------------------------------------------------------------- // ExecBatchIterator class TestExecBatchIterator : public TestComputeInternals { public: - void SetupIterator(std::vector args, + void SetupIterator(const std::vector& args, int64_t max_chunksize = kDefaultMaxChunksize) { - ASSERT_OK_AND_ASSIGN(iterator_, - ExecBatchIterator::Make(std::move(args), max_chunksize)); + ASSERT_OK_AND_ASSIGN(iterator_, ExecBatchIterator::Make(args, max_chunksize)); } void CheckIteration(const std::vector& args, int chunksize, const std::vector& ex_batch_sizes) { @@ -540,59 +788,192 @@ TEST_F(TestExecBatchIterator, ZeroLengthInputs) { CheckArgs(args); } +// ---------------------------------------------------------------------- +// ExecSpanIterator tests + +class TestExecSpanIterator : public TestComputeInternals { + public: + void SetupIterator(const std::vector& args, + int64_t max_chunksize = kDefaultMaxChunksize) { + ASSERT_OK_AND_ASSIGN(iterator_, ExecSpanIterator::Make(args, max_chunksize)); + } + void CheckIteration(const std::vector& args, int chunksize, + const std::vector& ex_batch_sizes) { + SetupIterator(args, chunksize); + ExecSpan batch; + int64_t position = 0; + for (size_t i = 0; i < ex_batch_sizes.size(); ++i) { + ASSERT_EQ(position, iterator_->position()); + ASSERT_TRUE(iterator_->Next(&batch)); + ASSERT_EQ(ex_batch_sizes[i], batch.length); + + for (size_t j = 0; j < args.size(); ++j) { + switch (args[j].kind()) { + case Datum::SCALAR: + ASSERT_TRUE(args[j].scalar()->Equals(*batch[j].scalar)); + break; + case Datum::ARRAY: + AssertArraysEqual(*args[j].make_array()->Slice(position, batch.length), + *batch[j].array.ToArray()); + break; + case Datum::CHUNKED_ARRAY: { + const ChunkedArray& carr = *args[j].chunked_array(); + if (batch.length == 0) { + ASSERT_EQ(0, carr.length()); + } else { + auto arg_slice = carr.Slice(position, batch.length); + // The sliced ChunkedArrays should only ever be 1 chunk + ASSERT_EQ(1, arg_slice->num_chunks()); + AssertArraysEqual(*arg_slice->chunk(0), *batch[j].array.ToArray()); + } + } break; + default: + break; + } + } + position += ex_batch_sizes[i]; + } + // Ensure that the iterator is exhausted + ASSERT_FALSE(iterator_->Next(&batch)); + + ASSERT_EQ(iterator_->length(), iterator_->position()); + } + + protected: + std::unique_ptr iterator_; +}; + +TEST_F(TestExecSpanIterator, Basics) { + const int64_t length = 100; + + // Simple case with a single chunk + std::vector args = {Datum(GetInt32Array(length)), Datum(GetFloat64Array(length)), + Datum(std::make_shared(3))}; + SetupIterator(args); + + ExecSpan batch; + ASSERT_TRUE(iterator_->Next(&batch)); + ASSERT_EQ(3, batch.values.size()); + ASSERT_EQ(3, batch.num_values()); + ASSERT_EQ(length, 
batch.length); + + AssertArraysEqual(*args[0].make_array(), *batch[0].array.ToArray()); + AssertArraysEqual(*args[1].make_array(), *batch[1].array.ToArray()); + ASSERT_TRUE(args[2].scalar()->Equals(*batch[2].scalar)); + + ASSERT_EQ(length, iterator_->position()); + ASSERT_FALSE(iterator_->Next(&batch)); + + // Split into chunks of size 16 + CheckIteration(args, /*chunksize=*/16, {16, 16, 16, 16, 16, 16, 4}); +} + +TEST_F(TestExecSpanIterator, InputValidation) { + std::vector args = {Datum(GetInt32Array(10)), Datum(GetInt32Array(9))}; + ASSERT_RAISES(Invalid, ExecSpanIterator::Make(args)); + + args = {Datum(GetInt32Array(9)), Datum(GetInt32Array(10))}; + ASSERT_RAISES(Invalid, ExecSpanIterator::Make(args)); + + args = {Datum(GetInt32Array(10))}; + ASSERT_OK_AND_ASSIGN(auto iterator, ExecSpanIterator::Make(args)); +} + +TEST_F(TestExecSpanIterator, ChunkedArrays) { + std::vector args = {Datum(GetInt32Chunked({0, 20, 10})), + Datum(GetInt32Chunked({15, 15})), Datum(GetInt32Array(30)), + Datum(std::make_shared(5)), + Datum(MakeNullScalar(boolean()))}; + + CheckIteration(args, /*chunksize=*/10, {10, 5, 5, 10}); + CheckIteration(args, /*chunksize=*/20, {15, 5, 10}); + CheckIteration(args, /*chunksize=*/30, {15, 5, 10}); +} + +TEST_F(TestExecSpanIterator, ZeroLengthInputs) { + auto carr = std::shared_ptr(new ChunkedArray({}, int32())); + + auto CheckArgs = [&](const std::vector& args) { + auto iterator = ExecSpanIterator::Make(args).ValueOrDie(); + ExecSpan batch; + ASSERT_FALSE(iterator->Next(&batch)); + }; + + // Zero-length ChunkedArray with zero chunks + std::vector args = {Datum(carr)}; + CheckArgs(args); + + // Zero-length array + args = {Datum(GetInt32Array(0))}; + CheckArgs(args); + + // ChunkedArray with single empty chunk + args = {Datum(GetInt32Chunked({0}))}; + CheckArgs(args); +} + // ---------------------------------------------------------------------- // Scalar function execution -Status ExecCopy(KernelContext*, const ExecBatch& batch, Datum* out) { +Status ExecCopyArrayData(KernelContext*, const ExecSpan& batch, ExecResult* out) { DCHECK_EQ(1, batch.num_values()); - const auto& type = checked_cast(*batch[0].type()); - int value_size = type.bit_width() / 8; + int value_size = batch[0].type()->byte_width(); - const ArrayData& arg0 = *batch[0].array(); - ArrayData* out_arr = out->mutable_array(); + const ArraySpan& arg0 = batch[0].array; + ArrayData* out_arr = out->array_data().get(); uint8_t* dst = out_arr->buffers[1]->mutable_data() + out_arr->offset * value_size; - const uint8_t* src = arg0.buffers[1]->data() + arg0.offset * value_size; + const uint8_t* src = arg0.buffers[1].data + arg0.offset * value_size; std::memcpy(dst, src, batch.length * value_size); return Status::OK(); } -Status ExecComputedBitmap(KernelContext* ctx, const ExecBatch& batch, Datum* out) { +Status ExecCopyArraySpan(KernelContext*, const ExecSpan& batch, ExecResult* out) { + DCHECK_EQ(1, batch.num_values()); + int value_size = batch[0].type()->byte_width(); + const ArraySpan& arg0 = batch[0].array; + ArraySpan* out_arr = out->array_span(); + uint8_t* dst = out_arr->buffers[1].data + out_arr->offset * value_size; + const uint8_t* src = arg0.buffers[1].data + arg0.offset * value_size; + std::memcpy(dst, src, batch.length * value_size); + return Status::OK(); +} + +Status ExecComputedBitmap(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) { // Propagate nulls not used. 
Check that the out bitmap isn't the same already // as the input bitmap - const ArrayData& arg0 = *batch[0].array(); - ArrayData* out_arr = out->mutable_array(); - - if (CountSetBits(arg0.buffers[0]->data(), arg0.offset, batch.length) > 0) { + const ArraySpan& arg0 = batch[0].array; + ArraySpan* out_arr = out->array_span(); + if (CountSetBits(arg0.buffers[0].data, arg0.offset, batch.length) > 0) { // Check that the bitmap has not been already copied over - DCHECK(!BitmapEquals(arg0.buffers[0]->data(), arg0.offset, - out_arr->buffers[0]->data(), out_arr->offset, batch.length)); + DCHECK(!BitmapEquals(arg0.buffers[0].data, arg0.offset, out_arr->buffers[0].data, + out_arr->offset, batch.length)); } - CopyBitmap(arg0.buffers[0]->data(), arg0.offset, batch.length, - out_arr->buffers[0]->mutable_data(), out_arr->offset); - return ExecCopy(ctx, batch, out); + CopyBitmap(arg0.buffers[0].data, arg0.offset, batch.length, out_arr->buffers[0].data, + out_arr->offset); + return ExecCopyArraySpan(ctx, batch, out); } -Status ExecNoPreallocatedData(KernelContext* ctx, const ExecBatch& batch, Datum* out) { +Status ExecNoPreallocatedData(KernelContext* ctx, const ExecSpan& batch, + ExecResult* out) { // Validity preallocated, but not the data - ArrayData* out_arr = out->mutable_array(); + ArrayData* out_arr = out->array_data().get(); DCHECK_EQ(0, out_arr->offset); - const auto& type = checked_cast(*batch[0].type()); - int value_size = type.bit_width() / 8; + int value_size = batch[0].type()->byte_width(); Status s = (ctx->Allocate(out_arr->length * value_size).Value(&out_arr->buffers[1])); DCHECK_OK(s); - return ExecCopy(ctx, batch, out); + return ExecCopyArrayData(ctx, batch, out); } -Status ExecNoPreallocatedAnything(KernelContext* ctx, const ExecBatch& batch, - Datum* out) { +Status ExecNoPreallocatedAnything(KernelContext* ctx, const ExecSpan& batch, + ExecResult* out) { // Neither validity nor data preallocated - ArrayData* out_arr = out->mutable_array(); + ArrayData* out_arr = out->array_data().get(); DCHECK_EQ(0, out_arr->offset); Status s = (ctx->AllocateBitmap(out_arr->length).Value(&out_arr->buffers[0])); DCHECK_OK(s); - const ArrayData& arg0 = *batch[0].array(); - CopyBitmap(arg0.buffers[0]->data(), arg0.offset, batch.length, + const ArraySpan& arg0 = batch[0].array; + CopyBitmap(arg0.buffers[0].data, arg0.offset, batch.length, out_arr->buffers[0]->mutable_data(), /*offset=*/0); // Reuse the kernel that allocates the data @@ -638,22 +1019,23 @@ Result> InitStateful(KernelContext*, return std::unique_ptr(new ExampleState{func_options->value}); } -Status ExecStateful(KernelContext* ctx, const ExecBatch& batch, Datum* out) { +Status ExecStateful(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) { // We take the value from the state and multiply the data in batch[0] with it ExampleState* state = static_cast(ctx->state()); int32_t multiplier = checked_cast(*state->value).value; - const ArrayData& arg0 = *batch[0].array(); - ArrayData* out_arr = out->mutable_array(); + const ArraySpan& arg0 = batch[0].array; + ArraySpan* out_arr = out->array_span(); const int32_t* arg0_data = arg0.GetValues(1); - int32_t* dst = out_arr->GetMutableValues(1); + int32_t* dst = out_arr->GetValues(1); for (int64_t i = 0; i < arg0.length; ++i) { dst[i] = arg0_data[i] * multiplier; } return Status::OK(); } -Status ExecAddInt32(KernelContext* ctx, const ExecBatch& batch, Datum* out) { +// TODO: remove this / refactor it in ARROW-16577 +Status ExecAddInt32(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) { 
const Int32Scalar& arg0 = batch[0].scalar_as(); const Int32Scalar& arg1 = batch[1].scalar_as(); out->value = std::make_shared(arg0.value + arg1.value); @@ -685,9 +1067,10 @@ class TestCallScalarFunction : public TestComputeInternals { /*doc=*/FunctionDoc::Empty()); // Add a few kernels. Our implementation only accepts arrays - ASSERT_OK(func->AddKernel({InputType::Array(uint8())}, uint8(), ExecCopy)); - ASSERT_OK(func->AddKernel({InputType::Array(int32())}, int32(), ExecCopy)); - ASSERT_OK(func->AddKernel({InputType::Array(float64())}, float64(), ExecCopy)); + ASSERT_OK(func->AddKernel({InputType::Array(uint8())}, uint8(), ExecCopyArraySpan)); + ASSERT_OK(func->AddKernel({InputType::Array(int32())}, int32(), ExecCopyArraySpan)); + ASSERT_OK( + func->AddKernel({InputType::Array(float64())}, float64(), ExecCopyArraySpan)); ASSERT_OK(registry->AddFunction(func)); // A version which doesn't want the executor to call PropagateNulls @@ -767,7 +1150,7 @@ TEST_F(TestCallScalarFunction, ArgumentValidation) { TEST_F(TestCallScalarFunction, PreallocationCases) { double null_prob = 0.2; - auto arr = GetUInt8Array(1000, null_prob); + auto arr = GetUInt8Array(100, null_prob); auto CheckFunction = [&](std::string func_name) { ResetContexts(); @@ -792,7 +1175,7 @@ TEST_F(TestCallScalarFunction, PreallocationCases) { { // Chunksize not multiple of 8 std::vector args = {Datum(arr)}; - exec_ctx_->set_exec_chunksize(111); + exec_ctx_->set_exec_chunksize(11); ASSERT_OK_AND_ASSIGN(Datum result, CallFunction(func_name, args, exec_ctx_.get())); AssertArraysEqual(*arr, *result.make_array()); } @@ -800,7 +1183,7 @@ TEST_F(TestCallScalarFunction, PreallocationCases) { // Input is chunked, output has one big chunk { auto carr = std::shared_ptr( - new ChunkedArray({arr->Slice(0, 100), arr->Slice(100)})); + new ChunkedArray({arr->Slice(0, 10), arr->Slice(10)})); std::vector args = {Datum(carr)}; ASSERT_OK_AND_ASSIGN(Datum result, CallFunction(func_name, args, exec_ctx_.get())); std::shared_ptr actual = result.chunked_array(); @@ -812,14 +1195,14 @@ TEST_F(TestCallScalarFunction, PreallocationCases) { { std::vector args = {Datum(arr)}; exec_ctx_->set_preallocate_contiguous(false); - exec_ctx_->set_exec_chunksize(400); + exec_ctx_->set_exec_chunksize(40); ASSERT_OK_AND_ASSIGN(Datum result, CallFunction(func_name, args, exec_ctx_.get())); ASSERT_EQ(Datum::CHUNKED_ARRAY, result.kind()); const ChunkedArray& carr = *result.chunked_array(); ASSERT_EQ(3, carr.num_chunks()); - AssertArraysEqual(*arr->Slice(0, 400), *carr.chunk(0)); - AssertArraysEqual(*arr->Slice(400, 400), *carr.chunk(1)); - AssertArraysEqual(*arr->Slice(800), *carr.chunk(2)); + AssertArraysEqual(*arr->Slice(0, 40), *carr.chunk(0)); + AssertArraysEqual(*arr->Slice(40, 40), *carr.chunk(1)); + AssertArraysEqual(*arr->Slice(80), *carr.chunk(2)); } }; diff --git a/cpp/src/arrow/compute/function.cc b/cpp/src/arrow/compute/function.cc index 2b3d4e6feb9..d2b36f0080d 100644 --- a/cpp/src/arrow/compute/function.cc +++ b/cpp/src/arrow/compute/function.cc @@ -314,7 +314,7 @@ Status Function::Validate() const { } Status ScalarFunction::AddKernel(std::vector in_types, OutputType out_type, - ArrayKernelExec exec, KernelInit init) { + ScalarKernel::ExecFunc exec, KernelInit init) { RETURN_NOT_OK(CheckArity(in_types)); if (arity_.is_varargs && in_types.size() != 1) { @@ -336,7 +336,7 @@ Status ScalarFunction::AddKernel(ScalarKernel kernel) { } Status VectorFunction::AddKernel(std::vector in_types, OutputType out_type, - ArrayKernelExec exec, KernelInit init) { + 
KernelBatchExec exec, KernelInit init) { RETURN_NOT_OK(CheckArity(in_types)); if (arity_.is_varargs && in_types.size() != 1) { diff --git a/cpp/src/arrow/compute/function.h b/cpp/src/arrow/compute/function.h index face491690f..91696b84fa2 100644 --- a/cpp/src/arrow/compute/function.h +++ b/cpp/src/arrow/compute/function.h @@ -314,7 +314,7 @@ class ARROW_EXPORT ScalarFunction : public detail::FunctionImpl { /// initialization, preallocation for fixed-width types, and default null /// handling (intersect validity bitmaps of inputs). Status AddKernel(std::vector in_types, OutputType out_type, - ArrayKernelExec exec, KernelInit init = NULLPTR); + ScalarKernel::ExecFunc exec, KernelInit init = NULLPTR); /// \brief Add a kernel (function implementation). Returns error if the /// kernel's signature does not match the function's arity. @@ -338,7 +338,7 @@ class ARROW_EXPORT VectorFunction : public detail::FunctionImpl { /// state initialization, no data preallocation, and no preallocation of the /// validity bitmap. Status AddKernel(std::vector in_types, OutputType out_type, - ArrayKernelExec exec, KernelInit init = NULLPTR); + KernelBatchExec exec, KernelInit init = NULLPTR); /// \brief Add a kernel (function implementation). Returns error if the /// kernel's signature does not match the function's arity. diff --git a/cpp/src/arrow/compute/function_benchmark.cc b/cpp/src/arrow/compute/function_benchmark.cc index a29a766be79..b508ad047fb 100644 --- a/cpp/src/arrow/compute/function_benchmark.cc +++ b/cpp/src/arrow/compute/function_benchmark.cc @@ -85,13 +85,15 @@ void BM_CastDispatchBaseline(benchmark::State& state) { .ValueOrDie(); kernel_context.SetState(cast_state.get()); + ExecSpan input; + input.length = 1; for (auto _ : state) { - Datum timestamp_scalar = MakeNullScalar(double_type); - for (Datum int_scalar : int_scalars) { - ABORT_NOT_OK( - exec(&kernel_context, {{std::move(int_scalar)}, 1}, ×tamp_scalar)); + ExecResult result; + result.value = MakeNullScalar(double_type); + for (const std::shared_ptr& int_scalar : int_scalars) { + input.values = {ExecValue(int_scalar.get())}; + ABORT_NOT_OK(exec(&kernel_context, input, &result)); } - benchmark::DoNotOptimize(timestamp_scalar); } state.SetItemsProcessed(state.iterations() * kScalarCount); @@ -162,11 +164,15 @@ void BM_ExecuteScalarKernelOnScalar(benchmark::State& state) { ExecContext exec_context; KernelContext kernel_context(&exec_context); + ExecSpan input; + input.length = 1; for (auto _ : state) { int64_t total = 0; - for (const auto& scalar : scalars) { - Datum result{MakeNullScalar(int64())}; - ABORT_NOT_OK(exec(&kernel_context, ExecBatch{{scalar}, /*length=*/1}, &result)); + for (const std::shared_ptr& scalar : scalars) { + ExecResult result; + result.value = MakeNullScalar(int64()); + input.values = {scalar.get()}; + ABORT_NOT_OK(exec(&kernel_context, input, &result)); total += result.scalar()->is_valid; } benchmark::DoNotOptimize(total); diff --git a/cpp/src/arrow/compute/function_test.cc b/cpp/src/arrow/compute/function_test.cc index 94e86c7bd57..ec5f3bc170c 100644 --- a/cpp/src/arrow/compute/function_test.cc +++ b/cpp/src/arrow/compute/function_test.cc @@ -210,12 +210,16 @@ TEST(VectorFunction, Basics) { ASSERT_EQ(Function::VECTOR, varargs_func.kind()); } -auto ExecNYI = [](KernelContext* ctx, const ExecBatch& args, Datum* out) { +auto ExecNYI = [](KernelContext* ctx, const ExecSpan& args, ExecResult* out) { return Status::NotImplemented("NYI"); }; -template -void CheckAddDispatch(FunctionType* func) { +auto ExecNYIOld = 
[](KernelContext* ctx, const ExecBatch& args, Datum* out) { + return Status::NotImplemented("NYI"); +}; + +template +void CheckAddDispatch(FunctionType* func, ExecType exec) { using KernelType = typename FunctionType::KernelType; ASSERT_EQ(0, func->num_kernels()); @@ -224,29 +228,29 @@ void CheckAddDispatch(FunctionType* func) { std::vector in_types1 = {int32(), int32()}; OutputType out_type1 = int32(); - ASSERT_OK(func->AddKernel(in_types1, out_type1, ExecNYI)); - ASSERT_OK(func->AddKernel({int32(), int8()}, int32(), ExecNYI)); + ASSERT_OK(func->AddKernel(in_types1, out_type1, exec)); + ASSERT_OK(func->AddKernel({int32(), int8()}, int32(), exec)); // Duplicate sig is okay - ASSERT_OK(func->AddKernel(in_types1, out_type1, ExecNYI)); + ASSERT_OK(func->AddKernel(in_types1, out_type1, exec)); // Add given a descr - KernelType descr({float64(), float64()}, float64(), ExecNYI); + KernelType descr({float64(), float64()}, float64(), exec); ASSERT_OK(func->AddKernel(descr)); ASSERT_EQ(4, func->num_kernels()); ASSERT_EQ(4, func->kernels().size()); // Try adding some invalid kernels - ASSERT_RAISES(Invalid, func->AddKernel({}, int32(), ExecNYI)); - ASSERT_RAISES(Invalid, func->AddKernel({int32()}, int32(), ExecNYI)); - ASSERT_RAISES(Invalid, func->AddKernel({int8(), int8(), int8()}, int32(), ExecNYI)); + ASSERT_RAISES(Invalid, func->AddKernel({}, int32(), exec)); + ASSERT_RAISES(Invalid, func->AddKernel({int32()}, int32(), exec)); + ASSERT_RAISES(Invalid, func->AddKernel({int8(), int8(), int8()}, int32(), exec)); // Add valid and invalid kernel using kernel struct directly - KernelType valid_kernel({boolean(), boolean()}, boolean(), ExecNYI); + KernelType valid_kernel({boolean(), boolean()}, boolean(), exec); ASSERT_OK(func->AddKernel(valid_kernel)); - KernelType invalid_kernel({boolean()}, boolean(), ExecNYI); + KernelType invalid_kernel({boolean()}, boolean(), exec); ASSERT_RAISES(Invalid, func->AddKernel(invalid_kernel)); ASSERT_OK_AND_ASSIGN(const Kernel* kernel, func->DispatchExact({int32(), int32()})); @@ -265,8 +269,10 @@ TEST(ScalarVectorFunction, DispatchExact) { ScalarFunction func1("scalar_test", Arity::Binary(), /*doc=*/FunctionDoc::Empty()); VectorFunction func2("vector_test", Arity::Binary(), /*doc=*/FunctionDoc::Empty()); - CheckAddDispatch(&func1); - CheckAddDispatch(&func2); + CheckAddDispatch(&func1, ExecNYI); + + // ARROW-16576: will migrate later to new span-based kernel exec API + CheckAddDispatch(&func2, ExecNYIOld); } TEST(ArrayFunction, VarArgs) { diff --git a/cpp/src/arrow/compute/kernel.h b/cpp/src/arrow/compute/kernel.h index 6b1f23e78df..3a0fc2ccd64 100644 --- a/cpp/src/arrow/compute/kernel.h +++ b/cpp/src/arrow/compute/kernel.h @@ -83,16 +83,6 @@ class ARROW_EXPORT KernelContext { KernelState* state_ = NULLPTR; }; -/// \brief The standard kernel execution API that must be implemented for -/// SCALAR and VECTOR kernel types. This includes both stateless and stateful -/// kernels. Kernels depending on some execution state access that state via -/// subclasses of KernelState set on the KernelContext object. May be used for -/// SCALAR and VECTOR kernel kinds. Implementations should endeavor to write -/// into pre-allocated memory if they are able, though for some kernels -/// (e.g. in cases when a builder like StringBuilder) must be employed this may -/// not be possible. -using ArrayKernelExec = std::function; - /// \brief An type-checking interface to permit customizable validation rules /// for use with InputType and KernelSignature. 
This is for scenarios where the /// acceptance is not an exact type instance, such as a TIMESTAMP type for a @@ -486,7 +476,7 @@ struct MemAllocation { struct Kernel; -/// \brief Arguments to pass to a KernelInit function. A struct is used to help +/// \brief Arguments to pass to an KernelInit function. A struct is used to help /// avoid API breakage should the arguments passed need to be expanded. struct KernelInitArgs { /// \brief A pointer to the kernel being initialized. The init function may @@ -548,19 +538,26 @@ struct Kernel { SimdLevel::type simd_level = SimdLevel::NONE; }; -/// \brief Common kernel base data structure for ScalarKernel and -/// VectorKernel. It is called "ArrayKernel" in that the functions generally -/// output array values (as opposed to scalar values in the case of aggregate -/// functions). -struct ArrayKernel : public Kernel { - ArrayKernel() = default; - - ArrayKernel(std::shared_ptr sig, ArrayKernelExec exec, - KernelInit init = NULLPTR) +/// \brief Kernel data structure for implementations of ScalarFunction. In +/// addition to the members found in Kernel, contains the null handling +/// and memory pre-allocation preferences. +struct ScalarKernel : public Kernel { + /// \brief The scalar kernel execution API that must be implemented for SCALAR + /// kernel types. This includes both stateless and stateful kernels. Kernels + /// depending on some execution state access that state via subclasses of + /// KernelState set on the KernelContext object. Implementations should + /// endeavor to write into pre-allocated memory if they are able, though for + /// some kernels (e.g. in cases when a builder like StringBuilder) must be + /// employed this may not be possible. + using ExecFunc = std::function; + ScalarKernel() = default; + + ScalarKernel(std::shared_ptr sig, ExecFunc exec, + KernelInit init = NULLPTR) : Kernel(std::move(sig), init), exec(std::move(exec)) {} - ArrayKernel(std::vector in_types, OutputType out_type, ArrayKernelExec exec, - KernelInit init = NULLPTR) + ScalarKernel(std::vector in_types, OutputType out_type, ExecFunc exec, + KernelInit init = NULLPTR) : Kernel(std::move(in_types), std::move(out_type), std::move(init)), exec(std::move(exec)) {} @@ -568,7 +565,7 @@ struct ArrayKernel : public Kernel { /// implementation, it may only write into preallocated memory, while in some /// cases it will allocate its own memory. Any required state is managed /// through the KernelContext. - ArrayKernelExec exec; + ExecFunc exec; /// \brief Writing execution results into larger contiguous allocations /// requires that the kernel be able to write into sliced output ArrayData*, @@ -576,13 +573,6 @@ struct ArrayKernel : public Kernel { /// not be able to do this, so setting this to false disables this /// functionality. bool can_write_into_slices = true; -}; - -/// \brief Kernel data structure for implementations of ScalarFunction. In -/// addition to the members found in ArrayKernel, contains the null handling -/// and memory pre-allocation preferences. 
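// Illustration (not part of the patch) of the new ScalarKernel::ExecFunc
// signature documented above: kernels receive a non-owning ExecSpan and
// write through an ExecResult instead of the old ExecBatch/Datum pair. The
// identity-copy body is hypothetical and assumes the executor preallocated
// the output data buffer (the MemAllocation::PREALLOCATE default).
inline ScalarKernel MakeIdentityInt32Kernel() {
  ScalarKernel::ExecFunc exec = [](KernelContext*, const ExecSpan& batch,
                                   ExecResult* out) {
    const int32_t* in = batch[0].array.GetValues<int32_t>(1);
    int32_t* out_values = out->array_span()->GetValues<int32_t>(1);
    for (int64_t i = 0; i < batch.length; ++i) {
      out_values[i] = in[i];
    }
    return Status::OK();
  };
  return ScalarKernel({InputType::Array(int32())}, int32(), std::move(exec));
}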
-struct ScalarKernel : public ArrayKernel { - using ArrayKernel::ArrayKernel; // For scalar functions preallocated data and intersecting arg validity // bitmaps is a reasonable default @@ -593,34 +583,44 @@ struct ScalarKernel : public ArrayKernel { // ---------------------------------------------------------------------- // VectorKernel (for VectorFunction) -/// \brief See VectorKernel::finalize member for usage -using VectorFinalize = std::function*)>; +/// \brief scalar kernel execution API that must be implemented for VECTOR +/// kernel types. This includes both stateless and stateful kernels. Kernels +/// depending on some execution state access that state via subclasses of +/// KernelState set on the KernelContext object. +using KernelBatchExec = std::function; /// \brief Kernel data structure for implementations of VectorFunction. In -/// addition to the members found in ArrayKernel, contains an optional -/// finalizer function, the null handling and memory pre-allocation preferences -/// (which have different defaults from ScalarKernel), and some other -/// execution-related options. -struct VectorKernel : public ArrayKernel { +/// contains an optional finalizer function, the null handling and memory +/// pre-allocation preferences (which have different defaults from +/// ScalarKernel), and some other execution-related options. +struct VectorKernel : public Kernel { + /// \brief See VectorKernel::finalize member for usage + using FinalizeFunc = std::function*)>; + VectorKernel() = default; - VectorKernel(std::vector in_types, OutputType out_type, ArrayKernelExec exec, - KernelInit init = NULLPTR, VectorFinalize finalize = NULLPTR) - : ArrayKernel(std::move(in_types), std::move(out_type), std::move(exec), - std::move(init)), + VectorKernel(std::vector in_types, OutputType out_type, KernelBatchExec exec, + KernelInit init = NULLPTR, FinalizeFunc finalize = NULLPTR) + : Kernel(std::move(in_types), std::move(out_type), std::move(init)), + exec(std::move(exec)), finalize(std::move(finalize)) {} - VectorKernel(std::shared_ptr sig, ArrayKernelExec exec, - KernelInit init = NULLPTR, VectorFinalize finalize = NULLPTR) - : ArrayKernel(std::move(sig), std::move(exec), std::move(init)), + VectorKernel(std::shared_ptr sig, KernelBatchExec exec, + KernelInit init = NULLPTR, FinalizeFunc finalize = NULLPTR) + : Kernel(std::move(sig), std::move(init)), + exec(std::move(exec)), finalize(std::move(finalize)) {} + /// \brief Perform a single invocation of this kernel. Any required state is + /// managed through the KernelContext. + KernelBatchExec exec; + /// \brief For VectorKernel, convert intermediate results into finalized /// results. Mutates input argument. Some kernels may accumulate state /// (example: hashing-related functions) through processing chunked inputs, and /// then need to attach some accumulated state to each of the outputs of /// processing each chunk of data. - VectorFinalize finalize; + FinalizeFunc finalize; /// Since vector kernels generally are implemented rather differently from /// scalar/elementwise kernels (and they may not even yield arrays of the same @@ -629,6 +629,13 @@ struct VectorKernel : public ArrayKernel { NullHandling::type null_handling = NullHandling::COMPUTED_NO_PREALLOCATE; MemAllocation::type mem_allocation = MemAllocation::NO_PREALLOCATE; + /// \brief Writing execution results into larger contiguous allocations + /// requires that the kernel be able to write into sliced output ArrayData*, + /// including sliced output validity bitmaps. 
Some kernel implementations may + /// not be able to do this, so setting this to false disables this + /// functionality. + bool can_write_into_slices = true; + /// Some vector kernels can do chunkwise execution using ExecBatchIterator, /// in some cases accumulating some state. Other kernels (like Take) need to /// be passed whole arrays and don't work on ChunkedArray inputs diff --git a/cpp/src/arrow/compute/kernels/aggregate_basic.cc b/cpp/src/arrow/compute/kernels/aggregate_basic.cc index 16495bc8030..8acdce323ed 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_basic.cc +++ b/cpp/src/arrow/compute/kernels/aggregate_basic.cc @@ -297,7 +297,7 @@ struct ProductImpl : public ScalarAggregator { } internal::VisitArrayValuesInline( - *data, + ArraySpan(*data), [&](typename TypeTraits::CType value) { this->product = MultiplyTraits::Multiply(*out_type, this->product, value); @@ -630,7 +630,7 @@ struct IndexImpl : public ScalarAggregator { int64_t i = 0; ARROW_UNUSED(internal::VisitArrayValuesInline( - *input, + ArraySpan(*input), [&](ArgValue v) -> Status { if (v == desired) { index = i; diff --git a/cpp/src/arrow/compute/kernels/aggregate_mode.cc b/cpp/src/arrow/compute/kernels/aggregate_mode.cc index 7d3440cbef3..d54ed12a1f7 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_mode.cc +++ b/cpp/src/arrow/compute/kernels/aggregate_mode.cc @@ -389,7 +389,7 @@ Result ModeType(KernelContext*, const std::vector& descr } VectorKernel NewModeKernel(const std::shared_ptr& in_type, - ArrayKernelExec exec) { + KernelBatchExec exec) { VectorKernel kernel; kernel.init = ModeState::Init; kernel.can_execute_chunkwise = false; @@ -433,8 +433,9 @@ void RegisterScalarAggregateMode(FunctionRegistry* registry) { DCHECK_OK(func->AddKernel( NewModeKernel(boolean(), ModeExecutor::Exec))); for (const auto& type : NumericTypes()) { + // TODO(wesm): DCHECK_OK(func->AddKernel( - NewModeKernel(type, GenerateNumeric(*type)))); + NewModeKernel(type, GenerateNumericOld(*type)))); } // Type parameters are ignored DCHECK_OK(func->AddKernel( diff --git a/cpp/src/arrow/compute/kernels/aggregate_quantile.cc b/cpp/src/arrow/compute/kernels/aggregate_quantile.cc index 810fb539913..d18d8425946 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_quantile.cc +++ b/cpp/src/arrow/compute/kernels/aggregate_quantile.cc @@ -500,7 +500,7 @@ void AddQuantileKernels(VectorFunction* func) { for (const auto& ty : NumericTypes()) { base.signature = KernelSignature::Make({InputType(ty)}, OutputType(ResolveOutput)); // output type is determined at runtime, set template argument to nulltype - base.exec = GenerateNumeric(*ty); + base.exec = GenerateNumericOld(*ty); DCHECK_OK(func->AddKernel(base)); } { diff --git a/cpp/src/arrow/compute/kernels/codegen_internal.cc b/cpp/src/arrow/compute/kernels/codegen_internal.cc index b31ef408b10..cf2c2b9c195 100644 --- a/cpp/src/arrow/compute/kernels/codegen_internal.cc +++ b/cpp/src/arrow/compute/kernels/codegen_internal.cc @@ -29,15 +29,19 @@ namespace arrow { namespace compute { namespace internal { -Status ExecFail(KernelContext* ctx, const ExecBatch& batch, Datum* out) { +Status ExecFail(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) { return Status::NotImplemented("This kernel is malformed"); } -ArrayKernelExec MakeFlippedBinaryExec(ArrayKernelExec exec) { - return [exec](KernelContext* ctx, const ExecBatch& batch, Datum* out) { - ExecBatch flipped_batch = batch; - std::swap(flipped_batch.values[0], flipped_batch.values[1]); - return exec(ctx, flipped_batch, out); +Status 
ExecFailOld(KernelContext* ctx, const ExecBatch& batch, Datum* out) { + return Status::NotImplemented("This kernel is malformed"); +} + +ScalarKernel::ExecFunc MakeFlippedBinaryExec(ScalarKernel::ExecFunc exec) { + return [exec](KernelContext* ctx, const ExecSpan& span, ExecResult* out) { + ExecSpan flipped_span = span; + std::swap(flipped_span.values[0], flipped_span.values[1]); + return exec(ctx, flipped_span, out); }; } diff --git a/cpp/src/arrow/compute/kernels/codegen_internal.h b/cpp/src/arrow/compute/kernels/codegen_internal.h index 6d31c1fe246..b3d989ec781 100644 --- a/cpp/src/arrow/compute/kernels/codegen_internal.h +++ b/cpp/src/arrow/compute/kernels/codegen_internal.h @@ -58,6 +58,7 @@ using internal::BitmapReader; using internal::checked_cast; using internal::FirstTimeBitmapWriter; using internal::GenerateBitsUnrolled; +using internal::VisitBitBlocks; using internal::VisitBitBlocksVoid; using internal::VisitTwoBitBlocksVoid; @@ -237,7 +238,7 @@ struct ArrayIterator> { using T = typename TypeTraits::ScalarType::ValueType; const T* values; - explicit ArrayIterator(const ArrayData& data) : values(data.GetValues(1)) {} + explicit ArrayIterator(const ArraySpan& arr) : values(arr.GetValues(1)) {} T operator()() { return *values++; } }; @@ -245,8 +246,8 @@ template struct ArrayIterator> { BitmapReader reader; - explicit ArrayIterator(const ArrayData& data) - : reader(data.buffers[1]->data(), data.offset, data.length) {} + explicit ArrayIterator(const ArraySpan& arr) + : reader(arr.buffers[1].data, arr.offset, arr.length) {} bool operator()() { bool out = reader.IsSet(); reader.Next(); @@ -257,18 +258,17 @@ struct ArrayIterator> { template struct ArrayIterator> { using offset_type = typename Type::offset_type; - const ArrayData& arr; + const ArraySpan& arr; const offset_type* offsets; offset_type cur_offset; const char* data; int64_t position; - explicit ArrayIterator(const ArrayData& arr) + explicit ArrayIterator(const ArraySpan& arr) : arr(arr), - offsets(reinterpret_cast(arr.buffers[1]->data()) + - arr.offset), + offsets(reinterpret_cast(arr.buffers[1].data) + arr.offset), cur_offset(offsets[0]), - data(reinterpret_cast(arr.buffers[2]->data())), + data(reinterpret_cast(arr.buffers[2].data)), position(0) {} util::string_view operator()() { @@ -281,15 +281,15 @@ struct ArrayIterator> { template <> struct ArrayIterator { - const ArrayData& arr; + const ArraySpan& arr; const char* data; const int32_t width; int64_t position; - explicit ArrayIterator(const ArrayData& arr) + explicit ArrayIterator(const ArraySpan& arr) : arr(arr), - data(reinterpret_cast(arr.buffers[1]->data())), - width(checked_cast(*arr.type).byte_width()), + data(reinterpret_cast(arr.buffers[1].data)), + width(arr.type->byte_width()), position(arr.offset) {} util::string_view operator()() { @@ -309,7 +309,7 @@ struct OutputArrayWriter> { using T = typename TypeTraits::ScalarType::ValueType; T* values; - explicit OutputArrayWriter(ArrayData* data) : values(data->GetMutableValues(1)) {} + explicit OutputArrayWriter(ArraySpan* data) : values(data->GetValues(1)) {} void Write(T value) { *values++ = value; } @@ -340,7 +340,8 @@ struct UnboxScalar> { template struct UnboxScalar> { - static util::string_view Unbox(const Scalar& val) { + using T = util::string_view; + static T Unbox(const Scalar& val) { if (!val.is_valid) return util::string_view(); return util::string_view(*checked_cast(val).value); } @@ -348,14 +349,16 @@ struct UnboxScalar> { template <> struct UnboxScalar { - static const Decimal128& Unbox(const Scalar& 
val) { + using T = Decimal128; + static const T& Unbox(const Scalar& val) { return checked_cast(val).value; } }; template <> struct UnboxScalar { - static const Decimal256& Unbox(const Scalar& val) { + using T = Decimal256; + static const T& Unbox(const Scalar& val) { return checked_cast(val).value; } }; @@ -397,14 +400,198 @@ struct BoxScalar { static void Box(T val, Scalar* out) { checked_cast(out)->value = val; } }; -// A VisitArrayDataInline variant that calls its visitor function with logical +// ---------------------------------------------------------------------- +// Like VisitArrayDataInline, but for ArraySpans + +template +struct ArraySpanInlineVisitor {}; + +// Numeric and primitive C-compatible types +template +struct ArraySpanInlineVisitor> { + using c_type = typename T::c_type; + + template + static Status VisitStatus(const ArraySpan& arr, ValidFunc&& valid_func, + NullFunc&& null_func) { + const c_type* data = arr.GetValues(1); + auto visit_valid = [&](int64_t i) { return valid_func(data[i]); }; + return VisitBitBlocks(arr.buffers[0].data, arr.offset, arr.length, + std::move(visit_valid), std::forward(null_func)); + } + + template + static void VisitVoid(const ArraySpan& arr, ValidFunc&& valid_func, + NullFunc&& null_func) { + using c_type = typename T::c_type; + const c_type* data = arr.GetValues(1); + auto visit_valid = [&](int64_t i) { valid_func(data[i]); }; + VisitBitBlocksVoid(arr.buffers[0].data, arr.offset, arr.length, + std::move(visit_valid), std::forward(null_func)); + } +}; + +// Boolean +template <> +struct ArraySpanInlineVisitor { + using c_type = bool; + + template + static Status VisitStatus(const ArraySpan& arr, ValidFunc&& valid_func, + NullFunc&& null_func) { + int64_t offset = arr.offset; + const uint8_t* data = arr.buffers[1].data; + return VisitBitBlocks( + arr.buffers[0].data, offset, arr.length, + [&](int64_t i) { return valid_func(bit_util::GetBit(data, offset + i)); }, + std::forward(null_func)); + } + + template + static void VisitVoid(const ArraySpan& arr, ValidFunc&& valid_func, + NullFunc&& null_func) { + int64_t offset = arr.offset; + const uint8_t* data = arr.buffers[1].data; + VisitBitBlocksVoid( + arr.buffers[0].data, offset, arr.length, + [&](int64_t i) { valid_func(bit_util::GetBit(data, offset + i)); }, + std::forward(null_func)); + } +}; + +// Binary, String... +template +struct ArraySpanInlineVisitor> { + using c_type = util::string_view; + + template + static Status VisitStatus(const ArraySpan& arr, ValidFunc&& valid_func, + NullFunc&& null_func) { + using offset_type = typename T::offset_type; + constexpr char empty_value = 0; + + if (arr.length == 0) { + return Status::OK(); + } + const offset_type* offsets = arr.GetValues(1); + const char* data; + if (arr.buffers[2].data == NULLPTR) { + data = &empty_value; + } else { + // Do not apply the array offset to the values array; the value_offsets + // index the non-sliced values array. 
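+      // (The offsets obtained from GetValues(1) above are already adjusted by
+      // arr.offset and hold absolute positions into the unsliced data buffer,
+      // which is why the data pointer below is taken with absolute_offset=0.)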
+ data = arr.GetValues(2, /*absolute_offset=*/0); + } + offset_type cur_offset = *offsets++; + return VisitBitBlocks( + arr.buffers[0].data, arr.offset, arr.length, + [&](int64_t i) { + ARROW_UNUSED(i); + auto value = util::string_view(data + cur_offset, *offsets - cur_offset); + cur_offset = *offsets++; + return valid_func(value); + }, + [&]() { + cur_offset = *offsets++; + return null_func(); + }); + } + + template + static void VisitVoid(const ArraySpan& arr, ValidFunc&& valid_func, + NullFunc&& null_func) { + using offset_type = typename T::offset_type; + constexpr uint8_t empty_value = 0; + + if (arr.length == 0) { + return; + } + const offset_type* offsets = arr.GetValues(1); + const uint8_t* data; + if (arr.buffers[2].data == NULLPTR) { + data = &empty_value; + } else { + // Do not apply the array offset to the values array; the value_offsets + // index the non-sliced values array. + data = arr.GetValues(2, /*absolute_offset=*/0); + } + + VisitBitBlocksVoid( + arr.buffers[0].data, arr.offset, arr.length, + [&](int64_t i) { + auto value = util::string_view(reinterpret_cast(data + offsets[i]), + offsets[i + 1] - offsets[i]); + valid_func(value); + }, + std::forward(null_func)); + } +}; + +// FixedSizeBinary, Decimal128 +template +struct ArraySpanInlineVisitor> { + using c_type = util::string_view; + + template + static Status VisitStatus(const ArraySpan& arr, ValidFunc&& valid_func, + NullFunc&& null_func) { + const int32_t byte_width = arr.type->byte_width(); + const char* data = arr.GetValues(1, + /*absolute_offset=*/arr.offset * byte_width); + return VisitBitBlocks( + arr.buffers[0].data, arr.offset, arr.length, + [&](int64_t i) { + auto value = util::string_view(data, byte_width); + data += byte_width; + return valid_func(value); + }, + [&]() { + data += byte_width; + return null_func(); + }); + } + + template + static void VisitVoid(const ArraySpan& arr, ValidFunc&& valid_func, + NullFunc&& null_func) { + const int32_t byte_width = arr.type->byte_width(); + const char* data = arr.GetValues(1, + /*absolute_offset=*/arr.offset * byte_width); + VisitBitBlocksVoid( + arr.buffers[0].data, arr.offset, arr.length, + [&](int64_t i) { + valid_func(util::string_view(data, byte_width)); + data += byte_width; + }, + [&]() { + data += byte_width; + null_func(); + }); + } +}; + +template +typename ::arrow::internal::call_traits::enable_if_return::type +VisitArraySpanInline(const ArraySpan& arr, ValidFunc&& valid_func, NullFunc&& null_func) { + return internal::ArraySpanInlineVisitor::VisitStatus( + arr, std::forward(valid_func), std::forward(null_func)); +} + +template +typename ::arrow::internal::call_traits::enable_if_return::type +VisitArraySpanInline(const ArraySpan& arr, ValidFunc&& valid_func, NullFunc&& null_func) { + return internal::ArraySpanInlineVisitor::VisitVoid( + arr, std::forward(valid_func), std::forward(null_func)); +} + +// A VisitArraySpanInline variant that calls its visitor function with logical // values, such as Decimal128 rather than util::string_view. 
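//
// A minimal usage sketch, not part of this patch ("SumDecimals" is a
// hypothetical helper): accumulating the logical Decimal128 values of an
// ArraySpan while skipping nulls.
//
//   Status SumDecimals(const ArraySpan& span, Decimal128* total) {
//     VisitArrayValuesInline<Decimal128Type>(
//         span, [&](Decimal128 v) { *total += v; }, [] {});
//     return Status::OK();
//   }
//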
template static typename arrow::internal::call_traits::enable_if_return::type -VisitArrayValuesInline(const ArrayData& arr, VisitFunc&& valid_func, +VisitArrayValuesInline(const ArraySpan& arr, VisitFunc&& valid_func, NullFunc&& null_func) { - VisitArrayDataInline( + VisitArraySpanInline( arr, [&](typename GetViewType::PhysicalType v) { valid_func(GetViewType::LogicalValue(std::move(v))); @@ -414,9 +601,9 @@ VisitArrayValuesInline(const ArrayData& arr, VisitFunc&& valid_func, template static typename arrow::internal::call_traits::enable_if_return::type -VisitArrayValuesInline(const ArrayData& arr, VisitFunc&& valid_func, +VisitArrayValuesInline(const ArraySpan& arr, VisitFunc&& valid_func, NullFunc&& null_func) { - return VisitArrayDataInline( + return VisitArraySpanInline( arr, [&](typename GetViewType::PhysicalType v) { return valid_func(GetViewType::LogicalValue(std::move(v))); @@ -427,7 +614,7 @@ VisitArrayValuesInline(const ArrayData& arr, VisitFunc&& valid_func, // Like VisitArrayValuesInline, but for binary functions. template -static void VisitTwoArrayValuesInline(const ArrayData& arr0, const ArrayData& arr1, +static void VisitTwoArrayValuesInline(const ArraySpan& arr0, const ArraySpan& arr1, VisitFunc&& valid_func, NullFunc&& null_func) { ArrayIterator arr0_it(arr0); ArrayIterator arr1_it(arr1); @@ -441,9 +628,24 @@ static void VisitTwoArrayValuesInline(const ArrayData& arr0, const ArrayData& ar arr1_it(); null_func(); }; - VisitTwoBitBlocksVoid(arr0.buffers[0], arr0.offset, arr1.buffers[0], arr1.offset, - arr0.length, std::move(visit_valid), std::move(visit_null)); + VisitTwoBitBlocksVoid(arr0.buffers[0].data, arr0.offset, arr1.buffers[0].data, + arr1.offset, arr0.length, std::move(visit_valid), + std::move(visit_null)); } +// Like ArrayDataVisitor (see visit_data_inline.h), but for ArraySpans + +template +struct ArraySpanVisitor { + using InlineVisitorType = ArraySpanInlineVisitor; + using c_type = typename InlineVisitorType::c_type; + + template + static Status Visit(const ArraySpan& arr, Visitor* visitor) { + return InlineVisitorType::VisitStatus( + arr, [visitor](c_type v) { return visitor->VisitValue(v); }, + [visitor]() { return visitor->VisitNull(); }); + } +}; // ---------------------------------------------------------------------- // Reusable type resolvers @@ -455,9 +657,10 @@ Result ListValuesType(KernelContext*, const std::vector& // ---------------------------------------------------------------------- // Generate an array kernel given template classes -Status ExecFail(KernelContext* ctx, const ExecBatch& batch, Datum* out); +Status ExecFail(KernelContext* ctx, const ExecSpan& batch, ExecResult* out); +Status ExecFailOld(KernelContext* ctx, const ExecBatch& batch, Datum* out); -ArrayKernelExec MakeFlippedBinaryExec(ArrayKernelExec exec); +ScalarKernel::ExecFunc MakeFlippedBinaryExec(ScalarKernel::ExecFunc exec); // ---------------------------------------------------------------------- // Helpers for iterating over common DataType instances for adding kernels to @@ -483,60 +686,56 @@ const std::vector>& ExampleParametricTypes(); // ---------------------------------------------------------------------- // "Applicators" take an operator definition (which may be scalar-valued or -// array-valued) and creates an ArrayKernelExec which can be used to add an +// array-valued) and creates an ScalarKernel::ExecFunc which can be used to add an // ArrayKernel to a Function. 
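//
// A rough sketch, not part of this patch ("Negate" and the Int32Type choice
// are illustrative): an elementwise Op functor and the exec function an
// applicator derives from it.
//
//   struct Negate {
//     template <typename OutValue, typename Arg0Value>
//     static OutValue Call(KernelContext*, Arg0Value v, Status*) {
//       return static_cast<OutValue>(-v);
//     }
//   };
//
//   // ScalarUnary<OutType, Arg0Type, Op>::Exec has the
//   // Status(KernelContext*, const ExecSpan&, ExecResult*) shape expected by
//   // ScalarKernel::ExecFunc.
//   ScalarKernel::ExecFunc exec =
//       applicator::ScalarUnary<Int32Type, Int32Type, Negate>::Exec;
//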
namespace applicator { -// Generate an ArrayKernelExec given a functor that handles all of its own +// Generate an ScalarKernel::ExecFunc given a functor that handles all of its own // iteration, etc. // // Operator must implement // -// static Status Call(KernelContext*, const ArrayData& in, ArrayData* out) -// static Status Call(KernelContext*, const Scalar& in, Scalar* out) +// static Status Call(KernelContext*, const ArraySpan& in, ExecResult* out) +// static Status Call(KernelContext*, const Scalar& in, ExecResult* out) template -static Status SimpleUnary(KernelContext* ctx, const ExecBatch& batch, Datum* out) { - if (batch[0].kind() == Datum::SCALAR) { - return Operator::Call(ctx, *batch[0].scalar(), out->scalar().get()); +static Status SimpleUnary(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) { + if (batch[0].is_scalar()) { + return Operator::Call(ctx, *batch[0].scalar, out); } else if (batch.length > 0) { - return Operator::Call(ctx, *batch[0].array(), out->mutable_array()); + return Operator::Call(ctx, batch[0].array, out); } return Status::OK(); } -// Generate an ArrayKernelExec given a functor that handles all of its own +// Generate an ScalarKernel::ExecFunc given a functor that handles all of its own // iteration, etc. // // Operator must implement // -// static Status Call(KernelContext*, const ArrayData& arg0, const ArrayData& arg1, -// ArrayData* out) -// static Status Call(KernelContext*, const ArrayData& arg0, const Scalar& arg1, -// ArrayData* out) -// static Status Call(KernelContext*, const Scalar& arg0, const ArrayData& arg1, -// ArrayData* out) +// static Status Call(KernelContext*, const ArraySpan& arg0, const ArraySpan& arg1, +// * out) +// static Status Call(KernelContext*, const ArraySpan& arg0, const Scalar& arg1, +// ExecResult* out) +// static Status Call(KernelContext*, const Scalar& arg0, const ArraySpan& arg1, +// ExecResult* out) // static Status Call(KernelContext*, const Scalar& arg0, const Scalar& arg1, -// Scalar* out) +// ExecResult* out) template -static Status SimpleBinary(KernelContext* ctx, const ExecBatch& batch, Datum* out) { +static Status SimpleBinary(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) { if (batch.length == 0) return Status::OK(); - if (batch[0].kind() == Datum::ARRAY) { - if (batch[1].kind() == Datum::ARRAY) { - return Operator::Call(ctx, *batch[0].array(), *batch[1].array(), - out->mutable_array()); + if (batch[0].is_array()) { + if (batch[1].is_array()) { + return Operator::Call(ctx, batch[0].array, batch[1].array, out); } else { - return Operator::Call(ctx, *batch[0].array(), *batch[1].scalar(), - out->mutable_array()); + return Operator::Call(ctx, batch[0].array, *batch[1].scalar, out); } } else { - if (batch[1].kind() == Datum::ARRAY) { - return Operator::Call(ctx, *batch[0].scalar(), *batch[1].array(), - out->mutable_array()); + if (batch[1].is_array()) { + return Operator::Call(ctx, *batch[0].scalar, batch[1].array, out); } else { - return Operator::Call(ctx, *batch[0].scalar(), *batch[1].scalar(), - out->scalar().get()); + return Operator::Call(ctx, *batch[0].scalar, *batch[1].scalar, out); } } } @@ -551,10 +750,8 @@ struct OutputAdapter; template struct OutputAdapter> { template - static Status Write(KernelContext*, Datum* out, Generator&& generator) { - ArrayData* out_arr = out->mutable_array(); - auto out_bitmap = out_arr->buffers[1]->mutable_data(); - GenerateBitsUnrolled(out_bitmap, out_arr->offset, out_arr->length, + static Status Write(KernelContext*, ArraySpan* out, Generator&& generator) 
{ + GenerateBitsUnrolled(out->buffers[1].data, out->offset, out->length, std::forward(generator)); return Status::OK(); } @@ -565,11 +762,10 @@ struct OutputAdapter> { using T = typename TypeTraits::ScalarType::ValueType; template - static Status Write(KernelContext*, Datum* out, Generator&& generator) { - ArrayData* out_arr = out->mutable_array(); - auto out_data = out_arr->GetMutableValues(1); + static Status Write(KernelContext*, ArraySpan* out, Generator&& generator) { + T* out_data = out->GetValues(1); // TODO: Is this as fast as a more explicitly inlined function? - for (int64_t i = 0; i < out_arr->length; ++i) { + for (int64_t i = 0; i < out->length; ++i) { *out_data++ = generator(); } return Status::OK(); @@ -579,7 +775,7 @@ struct OutputAdapter> { template struct OutputAdapter> { template - static Status Write(KernelContext* ctx, Datum* out, Generator&& generator) { + static Status Write(KernelContext* ctx, ArraySpan* out, Generator&& generator) { return Status::NotImplemented("NYI"); } }; @@ -607,16 +803,17 @@ struct ScalarUnary { using OutValue = typename GetOutputType::T; using Arg0Value = typename GetViewType::T; - static Status ExecArray(KernelContext* ctx, const ArrayData& arg0, Datum* out) { + static Status ExecArray(KernelContext* ctx, const ArraySpan& arg0, ExecResult* out) { Status st = Status::OK(); ArrayIterator arg0_it(arg0); - RETURN_NOT_OK(OutputAdapter::Write(ctx, out, [&]() -> OutValue { - return Op::template Call(ctx, arg0_it(), &st); - })); + RETURN_NOT_OK( + OutputAdapter::Write(ctx, out->array_span(), [&]() -> OutValue { + return Op::template Call(ctx, arg0_it(), &st); + })); return st; } - static Status ExecScalar(KernelContext* ctx, const Scalar& arg0, Datum* out) { + static Status ExecScalar(KernelContext* ctx, const Scalar& arg0, ExecResult* out) { Status st = Status::OK(); Scalar* out_scalar = out->scalar().get(); if (arg0.is_valid) { @@ -630,11 +827,11 @@ struct ScalarUnary { return st; } - static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) { - if (batch[0].kind() == Datum::ARRAY) { - return ExecArray(ctx, *batch[0].array(), out); + static Status Exec(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) { + if (batch[0].is_array()) { + return ExecArray(ctx, batch[0].array, out); } else { - return ExecScalar(ctx, *batch[0].scalar(), out); + return ExecScalar(ctx, *batch[0].scalar, out); } } }; @@ -654,8 +851,8 @@ struct ScalarUnaryNotNullStateful { template struct ArrayExec { - static Status Exec(const ThisType& functor, KernelContext* ctx, - const ExecBatch& batch, Datum* out) { + static Status Exec(const ThisType& functor, KernelContext* ctx, const ExecSpan& batch, + ExecResult* out) { ARROW_LOG(FATAL) << "Missing ArrayExec specialization for output type " << out->type(); return Status::NotImplemented("NYI"); @@ -664,11 +861,10 @@ struct ScalarUnaryNotNullStateful { template struct ArrayExec> { - static Status Exec(const ThisType& functor, KernelContext* ctx, const ArrayData& arg0, - Datum* out) { + static Status Exec(const ThisType& functor, KernelContext* ctx, const ArraySpan& arg0, + ExecResult* out) { Status st = Status::OK(); - ArrayData* out_arr = out->mutable_array(); - auto out_data = out_arr->GetMutableValues(1); + auto out_data = out->array_span()->GetValues(1); VisitArrayValuesInline( arg0, [&](Arg0Value v) { @@ -684,8 +880,8 @@ struct ScalarUnaryNotNullStateful { template struct ArrayExec> { - static Status Exec(const ThisType& functor, KernelContext* ctx, const ArrayData& arg0, - Datum* out) { + static Status 
Exec(const ThisType& functor, KernelContext* ctx, const ArraySpan& arg0, + ExecResult* out) { // NOTE: This code is not currently used by any kernels and has // suboptimal performance because it's recomputing the validity bitmap // that is already computed by the kernel execution layer. Consider @@ -706,12 +902,12 @@ struct ScalarUnaryNotNullStateful { template struct ArrayExec::value>> { - static Status Exec(const ThisType& functor, KernelContext* ctx, const ArrayData& arg0, - Datum* out) { + static Status Exec(const ThisType& functor, KernelContext* ctx, const ArraySpan& arg0, + ExecResult* out) { Status st = Status::OK(); - ArrayData* out_arr = out->mutable_array(); - FirstTimeBitmapWriter out_writer(out_arr->buffers[1]->mutable_data(), - out_arr->offset, out_arr->length); + ArraySpan* out_arr = out->array_span(); + FirstTimeBitmapWriter out_writer(out_arr->buffers[1].data, out_arr->offset, + out_arr->length); VisitArrayValuesInline( arg0, [&](Arg0Value v) { @@ -730,7 +926,7 @@ struct ScalarUnaryNotNullStateful { } }; - Status Scalar(KernelContext* ctx, const Scalar& arg0, Datum* out) { + Status Scalar(KernelContext* ctx, const Scalar& arg0, ExecResult* out) { Status st = Status::OK(); if (arg0.is_valid) { Arg0Value arg0_val = UnboxScalar::Unbox(arg0); @@ -741,11 +937,11 @@ struct ScalarUnaryNotNullStateful { return st; } - Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) { - if (batch[0].kind() == Datum::ARRAY) { - return ArrayExec::Exec(*this, ctx, *batch[0].array(), out); + Status Exec(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) { + if (batch[0].is_array()) { + return ArrayExec::Exec(*this, ctx, batch[0].array, out); } else { - return Scalar(ctx, *batch[0].scalar(), out); + return Scalar(ctx, *batch[0].scalar, out); } } }; @@ -758,7 +954,7 @@ struct ScalarUnaryNotNull { using OutValue = typename GetOutputType::T; using Arg0Value = typename GetViewType::T; - static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) { + static Status Exec(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) { // Seed kernel with dummy state ScalarUnaryNotNullStateful kernel({}); return kernel.Exec(ctx, batch, out); @@ -790,44 +986,47 @@ struct ScalarBinary { using Arg0Value = typename GetViewType::T; using Arg1Value = typename GetViewType::T; - static Status ArrayArray(KernelContext* ctx, const ArrayData& arg0, - const ArrayData& arg1, Datum* out) { + static Status ArrayArray(KernelContext* ctx, const ArraySpan& arg0, + const ArraySpan& arg1, ExecResult* out) { Status st = Status::OK(); ArrayIterator arg0_it(arg0); ArrayIterator arg1_it(arg1); - RETURN_NOT_OK(OutputAdapter::Write(ctx, out, [&]() -> OutValue { - return Op::template Call(ctx, arg0_it(), arg1_it(), - &st); - })); + RETURN_NOT_OK( + OutputAdapter::Write(ctx, out->array_span(), [&]() -> OutValue { + return Op::template Call(ctx, arg0_it(), + arg1_it(), &st); + })); return st; } - static Status ArrayScalar(KernelContext* ctx, const ArrayData& arg0, const Scalar& arg1, - Datum* out) { + static Status ArrayScalar(KernelContext* ctx, const ArraySpan& arg0, const Scalar& arg1, + ExecResult* out) { Status st = Status::OK(); ArrayIterator arg0_it(arg0); auto arg1_val = UnboxScalar::Unbox(arg1); - RETURN_NOT_OK(OutputAdapter::Write(ctx, out, [&]() -> OutValue { - return Op::template Call(ctx, arg0_it(), arg1_val, - &st); - })); + RETURN_NOT_OK( + OutputAdapter::Write(ctx, out->array_span(), [&]() -> OutValue { + return Op::template Call(ctx, arg0_it(), + arg1_val, &st); + })); return st; } 
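  // In the mixed Array/Scalar paths the scalar operand is unboxed exactly once,
  // outside the loop; the generator handed to OutputAdapter then only advances
  // the ArrayIterator over the array operand.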
- static Status ScalarArray(KernelContext* ctx, const Scalar& arg0, const ArrayData& arg1, - Datum* out) { + static Status ScalarArray(KernelContext* ctx, const Scalar& arg0, const ArraySpan& arg1, + ExecResult* out) { Status st = Status::OK(); auto arg0_val = UnboxScalar::Unbox(arg0); ArrayIterator arg1_it(arg1); - RETURN_NOT_OK(OutputAdapter::Write(ctx, out, [&]() -> OutValue { - return Op::template Call(ctx, arg0_val, arg1_it(), - &st); - })); + RETURN_NOT_OK( + OutputAdapter::Write(ctx, out->array_span(), [&]() -> OutValue { + return Op::template Call(ctx, arg0_val, + arg1_it(), &st); + })); return st; } static Status ScalarScalar(KernelContext* ctx, const Scalar& arg0, const Scalar& arg1, - Datum* out) { + ExecResult* out) { Status st = Status::OK(); if (out->scalar()->is_valid) { auto arg0_val = UnboxScalar::Unbox(arg0); @@ -839,18 +1038,18 @@ struct ScalarBinary { return st; } - static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) { - if (batch[0].kind() == Datum::ARRAY) { - if (batch[1].kind() == Datum::ARRAY) { - return ArrayArray(ctx, *batch[0].array(), *batch[1].array(), out); + static Status Exec(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) { + if (batch[0].is_array()) { + if (batch[1].is_array()) { + return ArrayArray(ctx, batch[0].array, batch[1].array, out); } else { - return ArrayScalar(ctx, *batch[0].array(), *batch[1].scalar(), out); + return ArrayScalar(ctx, batch[0].array, *batch[1].scalar, out); } } else { - if (batch[1].kind() == Datum::ARRAY) { - return ScalarArray(ctx, *batch[0].scalar(), *batch[1].array(), out); + if (batch[1].is_array()) { + return ScalarArray(ctx, *batch[0].scalar, batch[1].array, out); } else { - return ScalarScalar(ctx, *batch[0].scalar(), *batch[1].scalar(), out); + return ScalarScalar(ctx, *batch[0].scalar, *batch[1].scalar, out); } } } @@ -870,10 +1069,10 @@ struct ScalarBinaryNotNullStateful { // NOTE: In ArrayExec, Type is really OutputType - Status ArrayArray(KernelContext* ctx, const ArrayData& arg0, const ArrayData& arg1, - Datum* out) { + Status ArrayArray(KernelContext* ctx, const ArraySpan& arg0, const ArraySpan& arg1, + ExecResult* out) { Status st = Status::OK(); - OutputArrayWriter writer(out->mutable_array()); + OutputArrayWriter writer(out->array_span()); VisitTwoArrayValuesInline( arg0, arg1, [&](Arg0Value u, Arg1Value v) { @@ -883,10 +1082,11 @@ struct ScalarBinaryNotNullStateful { return st; } - Status ArrayScalar(KernelContext* ctx, const ArrayData& arg0, const Scalar& arg1, - Datum* out) { + Status ArrayScalar(KernelContext* ctx, const ArraySpan& arg0, const Scalar& arg1, + ExecResult* out) { Status st = Status::OK(); - OutputArrayWriter writer(out->mutable_array()); + ArraySpan* out_span = out->array_span(); + OutputArrayWriter writer(out_span); if (arg1.is_valid) { const auto arg1_val = UnboxScalar::Unbox(arg1); VisitArrayValuesInline( @@ -897,15 +1097,16 @@ struct ScalarBinaryNotNullStateful { }, [&]() { writer.WriteNull(); }); } else { - writer.WriteAllNull(out->mutable_array()->length); + writer.WriteAllNull(out_span->length); } return st; } - Status ScalarArray(KernelContext* ctx, const Scalar& arg0, const ArrayData& arg1, - Datum* out) { + Status ScalarArray(KernelContext* ctx, const Scalar& arg0, const ArraySpan& arg1, + ExecResult* out) { Status st = Status::OK(); - OutputArrayWriter writer(out->mutable_array()); + ArraySpan* out_span = out->array_span(); + OutputArrayWriter writer(out_span); if (arg0.is_valid) { const auto arg0_val = UnboxScalar::Unbox(arg0); 
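      // Only value slots are written below; the output validity bitmap is the
      // executor's job (scalar kernels default to NullHandling::INTERSECTION),
      // so the null callback merely keeps the writer's position in step and
      // never touches a bitmap.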
VisitArrayValuesInline( @@ -916,13 +1117,13 @@ struct ScalarBinaryNotNullStateful { }, [&]() { writer.WriteNull(); }); } else { - writer.WriteAllNull(out->mutable_array()->length); + writer.WriteAllNull(out_span->length); } return st; } Status ScalarScalar(KernelContext* ctx, const Scalar& arg0, const Scalar& arg1, - Datum* out) { + ExecResult* out) { Status st = Status::OK(); if (arg0.is_valid && arg1.is_valid) { const auto arg0_val = UnboxScalar::Unbox(arg0); @@ -934,18 +1135,18 @@ struct ScalarBinaryNotNullStateful { return st; } - Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) { - if (batch[0].kind() == Datum::ARRAY) { - if (batch[1].kind() == Datum::ARRAY) { - return ArrayArray(ctx, *batch[0].array(), *batch[1].array(), out); + Status Exec(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) { + if (batch[0].is_array()) { + if (batch[1].is_array()) { + return ArrayArray(ctx, batch[0].array, batch[1].array, out); } else { - return ArrayScalar(ctx, *batch[0].array(), *batch[1].scalar(), out); + return ArrayScalar(ctx, batch[0].array, *batch[1].scalar, out); } } else { - if (batch[1].kind() == Datum::ARRAY) { - return ScalarArray(ctx, *batch[0].scalar(), *batch[1].array(), out); + if (batch[1].is_array()) { + return ScalarArray(ctx, *batch[0].scalar, batch[1].array, out); } else { - return ScalarScalar(ctx, *batch[0].scalar(), *batch[1].scalar(), out); + return ScalarScalar(ctx, *batch[0].scalar, *batch[1].scalar, out); } } } @@ -961,7 +1162,7 @@ struct ScalarBinaryNotNull { using Arg0Value = typename GetViewType::T; using Arg1Value = typename GetViewType::T; - static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) { + static Status Exec(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) { // Seed kernel with dummy state ScalarBinaryNotNullStateful kernel({}); return kernel.Exec(ctx, batch, out); @@ -997,7 +1198,7 @@ using ScalarBinaryNotNullStatefulEqualTypes = // // template // struct FUNCTOR { -// static void Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) { +// static void Exec(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) { // // IMPLEMENTATION // } // }; @@ -1031,7 +1232,7 @@ struct GetTypeId { // GD for numeric types (integer and floating point) template