diff --git a/cpp/src/arrow/compute/kernels/gather_internal.h b/cpp/src/arrow/compute/kernels/gather_internal.h index 4c161533a72..dfc893a4da2 100644 --- a/cpp/src/arrow/compute/kernels/gather_internal.h +++ b/cpp/src/arrow/compute/kernels/gather_internal.h @@ -20,8 +20,14 @@ #include #include #include +#include +#include "arrow/array/array_base.h" #include "arrow/array/data.h" +#include "arrow/chunk_resolver.h" +#include "arrow/chunked_array.h" +#include "arrow/type_fwd.h" +#include "arrow/type_traits.h" #include "arrow/util/bit_block_counter.h" #include "arrow/util/bit_run_reader.h" #include "arrow/util/bit_util.h" @@ -52,6 +58,15 @@ class GatherBaseCRTP { ARROW_DEFAULT_MOVE_AND_ASSIGN(GatherBaseCRTP); protected: + template + bool IsSrcValid(const ArraySpan& src_validity, const IndexCType* idx, + int64_t position) const { + // Translate position into index on the source + const int64_t index = idx[position]; + ARROW_COMPILER_ASSUME(src_validity.buffers[0].data != nullptr); + return src_validity.IsValid(index); + } + ARROW_FORCE_INLINE int64_t ExecuteNoNulls(int64_t idx_length) { auto* self = static_cast(this); for (int64_t position = 0; position < idx_length; position++) { @@ -76,8 +91,12 @@ class GatherBaseCRTP { // doesn't have to be called for resulting null positions. A position is // considered null if either the index or the source value is null at that // position. - template - ARROW_FORCE_INLINE int64_t ExecuteWithNulls(const ArraySpan& src_validity, + // + // ValiditySpan is any class that `GatherImpl::IsSrcValid(src_validity, idx, position)` + // can be called with. + template + ARROW_FORCE_INLINE int64_t ExecuteWithNulls(const ValiditySpan& src_validity, int64_t idx_length, const IndexCType* idx, const ArraySpan& idx_validity, uint8_t* out_is_valid) { @@ -116,12 +135,11 @@ class GatherBaseCRTP { position += block.length; } } else { - // Source values may be null, so we must do random access into src_validity + // Source values may be null, so we must do random access with IsSrcValid() if (block.popcount == block.length) { // Faster path: indices are not null but source values may be for (int64_t i = 0; i < block.length; ++i) { - ARROW_COMPILER_ASSUME(src_validity.buffers[0].data != nullptr); - if (src_validity.IsValid(idx[position])) { + if (self->IsSrcValid(src_validity, idx, position)) { // value is not null self->WriteValue(position); bit_util::SetBit(out_is_valid, position); @@ -136,9 +154,9 @@ class GatherBaseCRTP { // random access in general we have to check the value nullness one by // one. 
for (int64_t i = 0; i < block.length; ++i) { - ARROW_COMPILER_ASSUME(src_validity.buffers[0].data != nullptr); ARROW_COMPILER_ASSUME(idx_validity.buffers[0].data != nullptr); - if (idx_validity.IsValid(position) && src_validity.IsValid(idx[position])) { + if (idx_validity.IsValid(position) && + self->IsSrcValid(src_validity, idx, position)) { // index is not null && value is not null self->WriteValue(position); bit_util::SetBit(out_is_valid, position); @@ -303,4 +321,139 @@ class Gather } }; +template +struct ChunkedValiditySpan { + const ChunkedArray& chunks_validity; + const TypedChunkLocation* chunk_location_vec; + const bool may_have_nulls; + + ChunkedValiditySpan(const ChunkedArray& chunks_validity, + const TypedChunkLocation* chunk_location_vec) + : chunks_validity(chunks_validity), + chunk_location_vec(chunk_location_vec), + may_have_nulls(chunks_validity.null_count() > 0) {} + + bool MayHaveNulls() const { return may_have_nulls; } + + bool IsSrcValid(const IndexCType* idx, int64_t position) const { + // idx is unused because all the indices have been pre-resolved into + // `chunk_location_vec` by ChunkResolver::ResolveMany. + ARROW_UNUSED(idx); + auto loc = chunk_location_vec[position]; + return chunks_validity.chunk(static_cast(loc.chunk_index)) + ->IsValid(loc.index_in_chunk); + } +}; + +template +class GatherFromChunks + : public GatherBaseCRTP< + GatherFromChunks> { + private: + static_assert(!kWithFactor || kValueWidthInBits == 8, + "kWithFactor is only supported for kValueWidthInBits == 8"); + static_assert(kValueWidthInBits == 1 || kValueWidthInBits % 8 == 0); + // kValueWidth should not be used if kValueWidthInBits == 1. + static constexpr int kValueWidth = kValueWidthInBits / 8; + + // src_residual_bit_offsets_[i] is used to store the bit offset of the first byte (0-7) + // in src_chunks_[i] iff kValueWidthInBits == 1. + const int* src_residual_bit_offsets_ = NULLPTR; + // Pre-computed pointers to the start of the values in each chunk. + const uint8_t* const* src_chunks_; + // Number indices resolved in chunk_location_vec_. 
+ const int64_t idx_length_; + const TypedChunkLocation* chunk_location_vec_; + + uint8_t* out_; + int64_t factor_; + + public: + void WriteValue(int64_t position) { + auto loc = chunk_location_vec_[position]; + auto* chunk = src_chunks_[loc.chunk_index]; + if constexpr (kValueWidthInBits == 1) { + auto src_offset = src_residual_bit_offsets_[loc.chunk_index]; + bit_util::SetBitTo(out_, position, + bit_util::GetBit(chunk, src_offset + loc.index_in_chunk)); + } else if constexpr (kWithFactor) { + const int64_t scaled_factor = kValueWidth * factor_; + memcpy(out_ + position * scaled_factor, chunk + loc.index_in_chunk * scaled_factor, + scaled_factor); + } else { + memcpy(out_ + position * kValueWidth, chunk + loc.index_in_chunk * kValueWidth, + kValueWidth); + } + } + + void WriteZero(int64_t position) { + if constexpr (kValueWidthInBits == 1) { + bit_util::ClearBit(out_, position); + } else if constexpr (kWithFactor) { + const int64_t scaled_factor = kValueWidth * factor_; + memset(out_ + position * scaled_factor, 0, scaled_factor); + } else { + memset(out_ + position * kValueWidth, 0, kValueWidth); + } + } + + void WriteZeroSegment(int64_t position, int64_t block_length) { + if constexpr (kValueWidthInBits == 1) { + bit_util::SetBitsTo(out_, position, block_length, false); + } else if constexpr (kWithFactor) { + const int64_t scaled_factor = kValueWidth * factor_; + memset(out_ + position * scaled_factor, 0, block_length * scaled_factor); + } else { + memset(out_ + position * kValueWidth, 0, block_length * kValueWidth); + } + } + + bool IsSrcValid(const ChunkedValiditySpan& src_validity, + const IndexCType* idx, int64_t position) const { + return src_validity.IsSrcValid(idx, position); + } + + public: + GatherFromChunks(const int* src_residual_bit_offsets, const uint8_t* const* src_chunks, + const int64_t idx_length, + const TypedChunkLocation* chunk_location_vec, uint8_t* out, + int64_t factor = 1) + : src_residual_bit_offsets_(src_residual_bit_offsets), + src_chunks_(src_chunks), + idx_length_(idx_length), + chunk_location_vec_(chunk_location_vec), + out_(out), + factor_(factor) { + assert(src_chunks && chunk_location_vec_ && out); + if constexpr (kValueWidthInBits == 1) { + assert(src_residual_bit_offsets); + } + assert((kWithFactor || factor == 1) && + "When kWithFactor is false, the factor is assumed to be 1 at compile time"); + } + + ARROW_FORCE_INLINE int64_t Execute() { return this->ExecuteNoNulls(idx_length_); } + + /// \pre If kOutputIsZeroInitialized, then this->out_ has to be zero initialized. + /// \pre Bits in out_is_valid have to always be zero initialized. + /// \post The bits for the valid elements (and only those) are set in out_is_valid. + /// \post If !kOutputIsZeroInitialized, then positions in this->_out containing null + /// elements have 0s written to them. This might be less efficient than + /// zero-initializing first and calling this->Execute() afterwards. + /// \return The number of valid elements in out. + template + ARROW_FORCE_INLINE int64_t Execute(const ChunkedArray& src_validity, + const ArraySpan& idx_validity, + uint8_t* out_is_valid) { + assert(idx_length_ == idx_validity.length); + assert(out_is_valid); + assert(idx_validity.type->byte_width() == sizeof(IndexCType)); + ChunkedValiditySpan src_validity_span{src_validity, chunk_location_vec_}; + assert(src_validity_span.MayHaveNulls() || idx_validity.MayHaveNulls()); + // idx=NULLPTR because when it's passed to IsSrcValid() defined above, it's not used. 
+ return this->template ExecuteWithNulls( + src_validity_span, idx_length_, /*idx=*/NULLPTR, idx_validity, out_is_valid); + } +}; + } // namespace arrow::internal diff --git a/cpp/src/arrow/compute/kernels/vector_selection.cc b/cpp/src/arrow/compute/kernels/vector_selection.cc index 6c6f1b36b84..ddc96485239 100644 --- a/cpp/src/arrow/compute/kernels/vector_selection.cc +++ b/cpp/src/arrow/compute/kernels/vector_selection.cc @@ -294,6 +294,8 @@ std::shared_ptr MakeIndicesNonZeroFunction(std::string name, VectorKernel kernel; kernel.null_handling = NullHandling::OUTPUT_NOT_NULL; kernel.mem_allocation = MemAllocation::NO_PREALLOCATE; + // "array_take" ensures that the output will be be chunked when at least one + // input is chunked, so we need to set this to false. kernel.output_chunked = false; kernel.exec = IndicesNonZeroExec; kernel.exec_chunked = IndicesNonZeroExecChunked; @@ -339,6 +341,7 @@ void RegisterVectorSelection(FunctionRegistry* registry) { VectorKernel take_base; take_base.init = TakeState::Init; take_base.can_execute_chunkwise = false; + take_base.output_chunked = false; RegisterSelectionFunction("array_take", array_take_doc, take_base, std::move(take_kernels), GetDefaultTakeOptions(), registry); diff --git a/cpp/src/arrow/compute/kernels/vector_selection_filter_internal.cc b/cpp/src/arrow/compute/kernels/vector_selection_filter_internal.cc index 194c3591337..1cecbd3f2fd 100644 --- a/cpp/src/arrow/compute/kernels/vector_selection_filter_internal.cc +++ b/cpp/src/arrow/compute/kernels/vector_selection_filter_internal.cc @@ -895,18 +895,23 @@ Status ExtensionFilterExec(KernelContext* ctx, const ExecSpan& batch, ExecResult } // Transform filter to selection indices and then use Take. -Status FilterWithTakeExec(const ArrayKernelExec& take_exec, KernelContext* ctx, +Status FilterWithTakeExec(TakeKernelExec take_aaa_exec, KernelContext* ctx, const ExecSpan& batch, ExecResult* out) { - std::shared_ptr indices; + std::shared_ptr indices_data; RETURN_NOT_OK(GetTakeIndices(batch[1].array, FilterState::Get(ctx).null_selection_behavior, ctx->memory_pool()) - .Value(&indices)); + .Value(&indices_data)); + KernelContext take_ctx(*ctx); TakeState state{TakeOptions::NoBoundsCheck()}; take_ctx.SetState(&state); - ExecSpan take_batch({batch[0], ArraySpan(*indices)}, batch.length); - return take_exec(&take_ctx, take_batch, out); + + ValuesSpan values(batch[0].array); + std::shared_ptr out_data = out->array_data(); + RETURN_NOT_OK(take_aaa_exec(&take_ctx, values, *indices_data, &out_data)); + out->value = std::move(out_data); + return Status::OK(); } // Due to the special treatment with their Take kernels, we filter Struct and SparseUnion diff --git a/cpp/src/arrow/compute/kernels/vector_selection_internal.cc b/cpp/src/arrow/compute/kernels/vector_selection_internal.cc index 7fe8d9b8866..acc6c5a2fc4 100644 --- a/cpp/src/arrow/compute/kernels/vector_selection_internal.cc +++ b/cpp/src/arrow/compute/kernels/vector_selection_internal.cc @@ -19,6 +19,7 @@ #include #include #include +#include #include #include "arrow/array/array_binary.h" @@ -60,6 +61,7 @@ void RegisterSelectionFunction(const std::string& name, FunctionDoc doc, {std::move(kernel_data.value_type), std::move(kernel_data.selection_type)}, OutputType(FirstType)); base_kernel.exec = kernel_data.exec; + base_kernel.exec_chunked = kernel_data.chunked_exec; DCHECK_OK(func->AddKernel(base_kernel)); } kernels.clear(); @@ -192,20 +194,34 @@ struct Selection { }; KernelContext* ctx; - const ArraySpan& values; - const ArraySpan& selection; + 
const ArraySpan values; + const ArraySpan selection; int64_t output_length; ArrayData* out; TypedBufferBuilder validity_builder; - Selection(KernelContext* ctx, const ExecSpan& batch, int64_t output_length, - ExecResult* out) + Selection(KernelContext* ctx, ArraySpan values, ArraySpan selection, + int64_t output_length, ArrayData* out) : ctx(ctx), - values(batch[0].array), - selection(batch[1].array), + values(std::move(values)), + selection(std::move(selection)), output_length(output_length), - out(out->array_data().get()), - validity_builder(ctx->memory_pool()) {} + out(out), + validity_builder(ctx->memory_pool()) { + // If the selection is an array of indices, the output length should + // match the number of indices in the selection array. + DCHECK(!is_integer(selection.type->id()) || output_length == selection.length); + } + + Selection(KernelContext* ctx, const ExecSpan& batch, int64_t output_length, + ExecResult* out) + : Selection(ctx, batch[0].array, batch[1].array, output_length, + out->array_data().get()) {} + + Selection(KernelContext* ctx, const ValuesSpan& values, const ArraySpan& indices, + std::shared_ptr* out) + : Selection(ctx, values.array(), indices, /*output_length=*/indices.length, + out->get()) {} virtual ~Selection() = default; @@ -483,9 +499,9 @@ struct VarBinarySelectionImpl : public Selection, T static constexpr int64_t kOffsetLimit = std::numeric_limits::max() - 1; - VarBinarySelectionImpl(KernelContext* ctx, const ExecSpan& batch, int64_t output_length, - ExecResult* out) - : Base(ctx, batch, output_length, out), + template + explicit VarBinarySelectionImpl(KernelContext* ctx, Args... args) + : Base(ctx, std::forward(args)...), offset_builder(ctx->memory_pool()), data_builder(ctx->memory_pool()) {} @@ -557,9 +573,9 @@ struct ListSelectionImpl : public Selection, Type> { TypedBufferBuilder offset_builder; typename TypeTraits::OffsetBuilderType child_index_builder; - ListSelectionImpl(KernelContext* ctx, const ExecSpan& batch, int64_t output_length, - ExecResult* out) - : Base(ctx, batch, output_length, out), + template + explicit ListSelectionImpl(KernelContext* ctx, Args... args) + : Base(ctx, std::forward(args)...), offset_builder(ctx->memory_pool()), child_index_builder(ctx->memory_pool()) {} @@ -622,9 +638,9 @@ struct ListViewSelectionImpl : public Selection, Typ TypedBufferBuilder offsets_builder; TypedBufferBuilder sizes_builder; - ListViewSelectionImpl(KernelContext* ctx, const ExecSpan& batch, int64_t output_length, - ExecResult* out) - : Base(ctx, batch, output_length, out), + template + explicit ListViewSelectionImpl(KernelContext* ctx, Args... args) + : Base(ctx, std::forward(args)...), offsets_builder(ctx->memory_pool()), sizes_builder(ctx->memory_pool()) {} @@ -679,9 +695,9 @@ struct DenseUnionSelectionImpl std::vector type_codes_; std::vector child_indices_builders_; - DenseUnionSelectionImpl(KernelContext* ctx, const ExecSpan& batch, - int64_t output_length, ExecResult* out) - : Base(ctx, batch, output_length, out), + template + explicit DenseUnionSelectionImpl(KernelContext* ctx, Args... 
args) + : Base(ctx, std::forward(args)...), value_offset_buffer_builder_(ctx->memory_pool()), child_id_buffer_builder_(ctx->memory_pool()), type_codes_(checked_cast(*this->values.type).type_codes()), @@ -760,9 +776,9 @@ struct SparseUnionSelectionImpl TypedBufferBuilder child_id_buffer_builder_; const int8_t type_code_for_null_; - SparseUnionSelectionImpl(KernelContext* ctx, const ExecSpan& batch, - int64_t output_length, ExecResult* out) - : Base(ctx, batch, output_length, out), + template + explicit SparseUnionSelectionImpl(KernelContext* ctx, Args... args) + : Base(ctx, std::forward(args)...), child_id_buffer_builder_(ctx->memory_pool()), type_code_for_null_( checked_cast(*this->values.type).type_codes()[0]) {} @@ -811,9 +827,9 @@ struct FSLSelectionImpl : public Selection using Base = Selection; LIFT_BASE_MEMBERS(); - FSLSelectionImpl(KernelContext* ctx, const ExecSpan& batch, int64_t output_length, - ExecResult* out) - : Base(ctx, batch, output_length, out), child_index_builder(ctx->memory_pool()) {} + template + explicit FSLSelectionImpl(KernelContext* ctx, Args... args) + : Base(ctx, std::forward(args)...), child_index_builder(ctx->memory_pool()) {} template Status GenerateOutput() { @@ -952,69 +968,80 @@ Status MapFilterExec(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) namespace { -template -Status TakeExec(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) { +template +Status TakeAAAExec(KernelContext* ctx, const ValuesSpan& values, const ArraySpan& indices, + std::shared_ptr* out) { + DCHECK(!values.is_chunked()) + << "TakeAAAExec kernels can't be called with chunked array values"; if (TakeState::Get(ctx).boundscheck) { - RETURN_NOT_OK(CheckIndexBounds(batch[1].array, batch[0].length())); + RETURN_NOT_OK(CheckIndexBounds(indices, values.length())); } - Impl kernel(ctx, batch, /*output_length=*/batch[1].length(), out); + SelectionImpl kernel(ctx, values, indices, out); return kernel.ExecTake(); } } // namespace -Status VarBinaryTakeExec(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) { - return TakeExec>(ctx, batch, out); +Status VarBinaryTakeExec(KernelContext* ctx, const ValuesSpan& values, + const ArraySpan& indices, std::shared_ptr* out) { + return TakeAAAExec>(ctx, values, indices, out); } -Status LargeVarBinaryTakeExec(KernelContext* ctx, const ExecSpan& batch, - ExecResult* out) { - return TakeExec>(ctx, batch, out); +Status LargeVarBinaryTakeExec(KernelContext* ctx, const ValuesSpan& values, + const ArraySpan& indices, std::shared_ptr* out) { + return TakeAAAExec>(ctx, values, indices, out); } -Status ListTakeExec(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) { - return TakeExec>(ctx, batch, out); +Status ListTakeExec(KernelContext* ctx, const ValuesSpan& values, + const ArraySpan& indices, std::shared_ptr* out) { + return TakeAAAExec>(ctx, values, indices, out); } -Status LargeListTakeExec(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) { - return TakeExec>(ctx, batch, out); +Status LargeListTakeExec(KernelContext* ctx, const ValuesSpan& values, + const ArraySpan& indices, std::shared_ptr* out) { + return TakeAAAExec>(ctx, values, indices, out); } -Status ListViewTakeExec(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) { - return TakeExec>(ctx, batch, out); +Status ListViewTakeExec(KernelContext* ctx, const ValuesSpan& values, + const ArraySpan& indices, std::shared_ptr* out) { + return TakeAAAExec>(ctx, values, indices, out); } -Status LargeListViewTakeExec(KernelContext* ctx, const ExecSpan& 
batch, ExecResult* out) { - return TakeExec>(ctx, batch, out); +Status LargeListViewTakeExec(KernelContext* ctx, const ValuesSpan& values, + const ArraySpan& indices, std::shared_ptr* out) { + return TakeAAAExec>(ctx, values, indices, out); } -Status FSLTakeExec(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) { - const ArraySpan& values = batch[0].array; - +Status FSLTakeExec(KernelContext* ctx, const ValuesSpan& values, const ArraySpan& indices, + std::shared_ptr* out) { // If a FixedSizeList wraps a fixed-width type we can, in some cases, use // FixedWidthTakeExec for a fixed-size list array. - if (util::IsFixedWidthLike(values, + if (util::IsFixedWidthLike(values.array(), /*force_null_count=*/true, /*exclude_bool_and_dictionary=*/true)) { - return FixedWidthTakeExec(ctx, batch, out); + return FixedWidthTakeExec(ctx, values, indices, out); } - return TakeExec(ctx, batch, out); + return TakeAAAExec(ctx, values, indices, out); } -Status DenseUnionTakeExec(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) { - return TakeExec(ctx, batch, out); +Status DenseUnionTakeExec(KernelContext* ctx, const ValuesSpan& values, + const ArraySpan& indices, std::shared_ptr* out) { + return TakeAAAExec(ctx, values, indices, out); } -Status SparseUnionTakeExec(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) { - return TakeExec(ctx, batch, out); +Status SparseUnionTakeExec(KernelContext* ctx, const ValuesSpan& values, + const ArraySpan& indices, std::shared_ptr* out) { + return TakeAAAExec(ctx, values, indices, out); } -Status StructTakeExec(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) { - return TakeExec(ctx, batch, out); +Status StructTakeExec(KernelContext* ctx, const ValuesSpan& values, + const ArraySpan& indices, std::shared_ptr* out) { + return TakeAAAExec(ctx, values, indices, out); } -Status MapTakeExec(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) { - return TakeExec>(ctx, batch, out); +Status MapTakeExec(KernelContext* ctx, const ValuesSpan& values, const ArraySpan& indices, + std::shared_ptr* out) { + return TakeAAAExec>(ctx, values, indices, out); } } // namespace compute::internal diff --git a/cpp/src/arrow/compute/kernels/vector_selection_internal.h b/cpp/src/arrow/compute/kernels/vector_selection_internal.h index 887bf083541..049a0e9abc7 100644 --- a/cpp/src/arrow/compute/kernels/vector_selection_internal.h +++ b/cpp/src/arrow/compute/kernels/vector_selection_internal.h @@ -17,11 +17,16 @@ #pragma once +#include #include +#include +#include #include +#include #include #include "arrow/array/data.h" +#include "arrow/chunk_resolver.h" #include "arrow/compute/api_vector.h" #include "arrow/compute/exec.h" #include "arrow/compute/function.h" @@ -33,10 +38,91 @@ namespace arrow::compute::internal { using FilterState = OptionsWrapper; using TakeState = OptionsWrapper; +/// \brief A class used to represent the values argument in take kernels. +/// +/// It can represent either a chunked array or a single array. When the values +/// are chunked, the class provides a ChunkResolver to resolve the target array +/// and index in the chunked array. 
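The doc comment above says that `ValuesSpan` lazily exposes a `ChunkResolver` when the values are chunked. As a rough, standalone illustration of what such a resolver does (a simplified model with invented names, not Arrow's `ChunkResolver` implementation), a logical index over a chunked sequence can be mapped to a (chunk, index-in-chunk) pair by binary-searching cumulative chunk lengths:

```cpp
// Minimal standalone model of index resolution over chunked data.
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <iostream>
#include <vector>

struct ChunkLocation {
  int64_t chunk_index;
  int64_t index_in_chunk;
};

class SimpleChunkResolver {
 public:
  explicit SimpleChunkResolver(const std::vector<int64_t>& chunk_lengths) {
    offsets_.reserve(chunk_lengths.size() + 1);
    offsets_.push_back(0);
    for (int64_t len : chunk_lengths) offsets_.push_back(offsets_.back() + len);
  }

  ChunkLocation Resolve(int64_t logical_index) const {
    assert(logical_index >= 0 && logical_index < offsets_.back());
    // The first cumulative offset strictly greater than logical_index marks
    // the chunk *after* the one containing it.
    auto it = std::upper_bound(offsets_.begin(), offsets_.end(), logical_index);
    const int64_t chunk = static_cast<int64_t>(it - offsets_.begin()) - 1;
    return {chunk, logical_index - offsets_[chunk]};
  }

 private:
  std::vector<int64_t> offsets_;  // cumulative lengths, offsets_[0] == 0
};

int main() {
  SimpleChunkResolver resolver({3, 5, 2});  // chunk lengths 3 + 5 + 2 = 10
  ChunkLocation loc = resolver.Resolve(6);  // falls in the second chunk
  std::cout << loc.chunk_index << " " << loc.index_in_chunk << "\n";  // 1 3
}
```

Arrow's real resolver additionally supports batched resolution (`ResolveMany`) with a chunk hint, which the take kernels in this patch rely on.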
+class ValuesSpan { + private: + const std::shared_ptr chunked_ = nullptr; + const ArraySpan chunk0_; // first chunk or the whole array + mutable std::optional chunk_resolver_; + + public: + explicit ValuesSpan(const std::shared_ptr values) + : chunked_(std::move(values)), chunk0_{*chunked_->chunk(0)->data()} { + assert(chunked_); + assert(chunked_->num_chunks() > 0); + } + + explicit ValuesSpan(const ArraySpan& values) // NOLINT(modernize-pass-by-value) + : chunk0_(values) {} + + explicit ValuesSpan(const ArrayData& values) : chunk0_{ArraySpan{values}} {} + + bool is_chunked() const { return chunked_ != nullptr; } + + const ChunkedArray& chunked_array() const { + assert(is_chunked()); + return *chunked_; + } + + /// \brief Lazily builds a ChunkResolver from the underlying chunked array. + /// + /// \note This method is not thread-safe. + /// \pre is_chunked() + const ChunkResolver& chunk_resolver() const { + assert(is_chunked()); + if (!chunk_resolver_.has_value()) { + chunk_resolver_.emplace(chunked_->chunks()); + } + return *chunk_resolver_; + } + + const ArraySpan& chunk0() const { return chunk0_; } + + const ArraySpan& array() const { + assert(!is_chunked()); + return chunk0_; + } + + const DataType* type() const { return chunk0_.type; } + + int64_t length() const { return is_chunked() ? chunked_->length() : array().length; } + + bool MayHaveNulls() const { + return is_chunked() ? chunked_->null_count() != 0 : array().MayHaveNulls(); + } +}; + +/// \brief Type for a single "array_take" kernel function. +/// +/// Instead of implementing both `ArrayKernelExec` and `ChunkedExec` typed +/// functions for each configurations of `array_take` parameters, we use +/// templates wrapping `TakeKernelExec` functions to expose exec functions +/// that can be registered in the kernel registry. +/// +/// A `TakeKernelExec` always returns a single array, which is the result of +/// taking values from a single array (AA->A) or multiple arrays (CA->A). The +/// wrappers take care of converting the output of a CA call to C or calling +/// the kernel multiple times to process a CC call. 
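To make the AA->A / CA->A / CC decomposition described in the comment above concrete, here is a self-contained sketch using plain `std::vector` instead of Arrow types; the names `TakeCA` and `TakeCC` are invented for illustration and do not exist in the patch. A CC take is handled by issuing one CA->A call per index chunk and collecting the resulting arrays into a chunked result.

```cpp
#include <cstdint>
#include <iostream>
#include <vector>

using Chunked = std::vector<std::vector<int32_t>>;      // chunked values
using IndexChunks = std::vector<std::vector<int64_t>>;  // chunked indices

// CA->A: take from chunked values with one flat index array -> one array.
std::vector<int32_t> TakeCA(const Chunked& values, const std::vector<int64_t>& indices) {
  std::vector<int32_t> out;
  out.reserve(indices.size());
  for (int64_t idx : indices) {
    // Linear chunk walk; the real kernels resolve indices with a ChunkResolver.
    for (const auto& chunk : values) {
      if (idx < static_cast<int64_t>(chunk.size())) {
        out.push_back(chunk[idx]);
        break;
      }
      idx -= static_cast<int64_t>(chunk.size());
    }
  }
  return out;
}

// CC->C: one output chunk per index chunk, each produced by a CA->A call.
Chunked TakeCC(const Chunked& values, const IndexChunks& index_chunks) {
  Chunked out;
  out.reserve(index_chunks.size());
  for (const auto& idx_chunk : index_chunks) {
    out.push_back(TakeCA(values, idx_chunk));
  }
  return out;
}

int main() {
  Chunked values = {{10, 11, 12}, {13, 14}};
  IndexChunks index_chunks = {{4, 0}, {2}};
  for (const auto& chunk : TakeCC(values, index_chunks)) {
    for (int32_t v : chunk) std::cout << v << ' ';
    std::cout << '\n';
  }
  // Prints: 14 10
  //         12
}
```

The linear chunk walk inside `TakeCA` is only a stand-in for the resolver-based lookup used by the real kernels; the point of the sketch is the one-output-chunk-per-index-chunk structure of the CC case.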
+using TakeKernelExec = Status (*)(KernelContext*, const ValuesSpan&, const ArraySpan&, + std::shared_ptr*); + struct SelectionKernelData { + SelectionKernelData(InputType value_type, InputType selection_type, + ArrayKernelExec exec, + VectorKernel::ChunkedExec chunked_exec = NULLPTR) + : value_type(std::move(value_type)), + selection_type(std::move(selection_type)), + exec(exec), + chunked_exec(chunked_exec) {} + InputType value_type; InputType selection_type; ArrayKernelExec exec; + VectorKernel::ChunkedExec chunked_exec; }; void RegisterSelectionFunction(const std::string& name, FunctionDoc doc, @@ -73,17 +159,32 @@ Status FSLFilterExec(KernelContext*, const ExecSpan&, ExecResult*); Status DenseUnionFilterExec(KernelContext*, const ExecSpan&, ExecResult*); Status MapFilterExec(KernelContext*, const ExecSpan&, ExecResult*); -Status VarBinaryTakeExec(KernelContext*, const ExecSpan&, ExecResult*); -Status LargeVarBinaryTakeExec(KernelContext*, const ExecSpan&, ExecResult*); -Status FixedWidthTakeExec(KernelContext*, const ExecSpan&, ExecResult*); -Status ListTakeExec(KernelContext*, const ExecSpan&, ExecResult*); -Status LargeListTakeExec(KernelContext*, const ExecSpan&, ExecResult*); -Status ListViewTakeExec(KernelContext*, const ExecSpan&, ExecResult*); -Status LargeListViewTakeExec(KernelContext*, const ExecSpan&, ExecResult*); -Status FSLTakeExec(KernelContext*, const ExecSpan&, ExecResult*); -Status DenseUnionTakeExec(KernelContext*, const ExecSpan&, ExecResult*); -Status SparseUnionTakeExec(KernelContext*, const ExecSpan&, ExecResult*); -Status StructTakeExec(KernelContext*, const ExecSpan&, ExecResult*); -Status MapTakeExec(KernelContext*, const ExecSpan&, ExecResult*); +// Take kernels compatible with the TakeKernelExec signature +Status VarBinaryTakeExec(KernelContext*, const ValuesSpan&, const ArraySpan&, + std::shared_ptr*); +Status LargeVarBinaryTakeExec(KernelContext*, const ValuesSpan&, const ArraySpan&, + std::shared_ptr*); +Status FixedWidthTakeExec(KernelContext*, const ValuesSpan&, const ArraySpan&, + std::shared_ptr*); +Status FixedWidthTakeChunkedExec(KernelContext*, const ValuesSpan&, const ArraySpan&, + std::shared_ptr*); +Status ListTakeExec(KernelContext*, const ValuesSpan&, const ArraySpan&, + std::shared_ptr*); +Status LargeListTakeExec(KernelContext*, const ValuesSpan&, const ArraySpan&, + std::shared_ptr*); +Status ListViewTakeExec(KernelContext*, const ValuesSpan&, const ArraySpan&, + std::shared_ptr*); +Status LargeListViewTakeExec(KernelContext*, const ValuesSpan&, const ArraySpan&, + std::shared_ptr*); +Status FSLTakeExec(KernelContext*, const ValuesSpan&, const ArraySpan&, + std::shared_ptr*); +Status DenseUnionTakeExec(KernelContext*, const ValuesSpan&, const ArraySpan&, + std::shared_ptr*); +Status SparseUnionTakeExec(KernelContext*, const ValuesSpan&, const ArraySpan&, + std::shared_ptr*); +Status StructTakeExec(KernelContext*, const ValuesSpan&, const ArraySpan&, + std::shared_ptr*); +Status MapTakeExec(KernelContext*, const ValuesSpan&, const ArraySpan&, + std::shared_ptr*); } // namespace arrow::compute::internal diff --git a/cpp/src/arrow/compute/kernels/vector_selection_take_internal.cc b/cpp/src/arrow/compute/kernels/vector_selection_take_internal.cc index fedafeb5bea..d93b37839fc 100644 --- a/cpp/src/arrow/compute/kernels/vector_selection_take_internal.cc +++ b/cpp/src/arrow/compute/kernels/vector_selection_take_internal.cc @@ -327,6 +327,105 @@ namespace { using TakeState = OptionsWrapper; +struct ChunkedFixedWidthValuesSpan { + private: + // 
src_residual_bit_offsets_[i] is used to store the bit offset of the first byte (0-7) + // in src_chunks_[i] iff kValueWidthInBits == 1. + std::vector src_residual_bit_offsets; + // Pre-computed pointers to the start of the values in each chunk. + std::vector src_chunks; + + public: + explicit ChunkedFixedWidthValuesSpan(const ChunkedArray& values) { + const bool chunk_values_are_bit_sized = values.type()->id() == Type::BOOL; + DCHECK_EQ(chunk_values_are_bit_sized, util::FixedWidthInBytes(*values.type()) == -1); + if (chunk_values_are_bit_sized) { + src_residual_bit_offsets.resize(values.num_chunks()); + } + src_chunks.resize(values.num_chunks()); + + for (int i = 0; i < values.num_chunks(); ++i) { + const ArraySpan chunk{*values.chunk(i)->data()}; + DCHECK(util::IsFixedWidthLike(chunk)); + + auto offset_pointer = util::OffsetPointerOfFixedBitWidthValues(chunk); + if (chunk_values_are_bit_sized) { + src_residual_bit_offsets[i] = offset_pointer.first; + } else { + DCHECK_EQ(offset_pointer.first, 0); + } + src_chunks[i] = offset_pointer.second; + } + } + + const int* src_residual_bit_offsets_data() const { + return src_residual_bit_offsets.empty() ? nullptr : src_residual_bit_offsets.data(); + } + + const uint8_t* const* src_chunks_data() const { return src_chunks.data(); } +}; + +/// \brief Buffer for chunk locations resolved against a chunked array. +struct BoundedLocationBuffer { + private: + std::unique_ptr chunk_location_buffer = NULLPTR; + + Status Allocate(int64_t n_locations, int64_t sizeof_location, MemoryPool* pool) { + ARROW_ASSIGN_OR_RAISE(chunk_location_buffer, + AllocateBuffer(n_locations * sizeof_location, pool)); + return Status::OK(); + } + + public: + ~BoundedLocationBuffer() = default; + + template + Status InitWithCapacity(int64_t n_locations, MemoryPool* pool) { + RETURN_NOT_OK(Allocate(n_locations, sizeof(TypedChunkLocation), pool)); + return Status::OK(); + } + + /// \brief The capacity in terms of number of resolved chunk locations. + /// + /// One location is needed for each index. + template + int64_t Capacity() const { + return chunk_location_buffer->size() / sizeof(TypedChunkLocation); + } + + /// \pre idx_length <= Capacity() + template + Status ResolveIndices(const ChunkResolver& chunk_resolver, int64_t idx_length, + const IndexCType* idx, IndexCType chunk_hint) { + DCHECK_LE(idx_length, Capacity()); + auto* chunk_location_vec = mutable_chunk_location_vec(); + // All indices are resolved in one go without checking the validity bitmap. + // This is OK as long the output corresponding to the invalid indices is not used. 
+ bool enough_precision = chunk_resolver.ResolveMany( + /*n_indices=*/idx_length, /*logical_index_vec=*/idx, chunk_location_vec, + chunk_hint); + if (ARROW_PREDICT_FALSE(!enough_precision)) { + return Status::IndexError("IndexCType is too small"); + } + return Status::OK(); + } + + template + TypedChunkLocation* mutable_chunk_location_vec() { + return chunk_location_buffer->mutable_data_as>(); + } + + template + const TypedChunkLocation* chunk_location_vec() const { + return chunk_location_buffer->data_as>(); + } + + template + IndexCType chunk_index(int64_t position) const { + return chunk_location_vec()[position].chunk_index; + } +}; + // ---------------------------------------------------------------------- // Implement optimized take for primitive types from boolean to // 1/2/4/8/16/32-byte C-type based types and fixed-size binary (0 or more @@ -358,15 +457,21 @@ template 0 && kValueWidthInBits == 8 && // factors are used with bytes static_cast(factor * kValueWidthInBits) == bit_width)); #endif + return values.is_chunked() ? ChunkedExec(ctx, values, indices, out_arr, factor) + : Exec(ctx, values.array(), indices, out_arr, factor); + } + + static Status Exec(KernelContext* ctx, const ArraySpan& values, + const ArraySpan& indices, ArrayData* out_arr, int64_t factor) { const bool out_has_validity = values.MayHaveNulls() || indices.MayHaveNulls(); const uint8_t* src; @@ -396,10 +501,89 @@ struct FixedWidthTakeImpl { out_arr->null_count = out_arr->length - valid_count; return Status::OK(); } + + static Status ChunkedExec(KernelContext* ctx, const ValuesSpan& values, + const ArraySpan& indices, ArrayData* out_arr, + int64_t factor) { + constexpr int64_t kIndexBlockCapacityInBytes = 16 * 1024; + // Must be a multiple of 8 so `GatherFromChunks` can always be + // constructed with byte-aligned output pointers in the loop. 
+ constexpr int64_t kIndexBlockCapacity = + kIndexBlockCapacityInBytes / sizeof(IndexCType); + static_assert((kIndexBlockCapacity * kValueWidthInBits) % 8 == 0); + + ChunkedFixedWidthValuesSpan chunked_values{values.chunked_array()}; + BoundedLocationBuffer location_buffer; + // TODO(felipecrv): find a way to share the buffer on TakeCC kernel + RETURN_NOT_OK(location_buffer.InitWithCapacity( + /*n_locations=*/std::min(kIndexBlockCapacity, indices.length), + ctx->memory_pool())); + + return DoChunkedExec(ctx, values, chunked_values, indices, &location_buffer, out_arr, + factor); + } + + // \pre location_buffer is initialized + // \pre location_buffer->Capacity() is a multiple of 8 + static Status DoChunkedExec(KernelContext* ctx, const ValuesSpan& values, + const ChunkedFixedWidthValuesSpan& chunked_values, + const ArraySpan& indices, + BoundedLocationBuffer* location_buffer, ArrayData* out_arr, + int64_t factor) { + const bool out_has_validity = + values.chunked_array().null_count() > 0 || indices.MayHaveNulls(); + + const auto& chunk_resolver = values.chunk_resolver(); + const auto location_buffer_capacity = location_buffer->Capacity(); + const auto* idx = indices.GetValues(1); + uint8_t* out = util::MutableFixedWidthValuesPointer(out_arr); + int64_t valid_count = 0; + IndexCType chunk_hint = 0; + int64_t idx_offset = 0; + while (idx_offset < indices.length) { + const int64_t block_length = + std::min(location_buffer_capacity, indices.length - idx_offset); + + RETURN_NOT_OK(location_buffer->ResolveIndices( + chunk_resolver, /*idx_length=*/block_length, idx, chunk_hint)); + arrow::internal::GatherFromChunks + gather{chunked_values.src_residual_bit_offsets_data(), + chunked_values.src_chunks_data(), + /*idx_length=*/block_length, + location_buffer->chunk_location_vec(), + out, + factor}; + if (out_has_validity) { + DCHECK_EQ(out_arr->offset, 0); + // out_is_valid must be zero-initiliazed, because Gather::Execute + // saves time by not having to ClearBit on every null element. + auto out_is_valid = out_arr->GetMutableValues(0); + memset(out_is_valid, 0, bit_util::BytesForBits(out_arr->length)); + valid_count += gather.template Execute( + /*src_validity=*/values.chunked_array(), /*idx_validity=*/indices, + out_is_valid); + } else { + valid_count += gather.Execute(); + } + // Prepare for the next iteration + chunk_hint = location_buffer->chunk_index(block_length - 1); + idx_offset += block_length; + if constexpr (WithFactor::value) { + static_assert(kValueWidthInBits == 8); + out += block_length * factor; + } else { + out += (block_length * kValueWidthInBits) / 8; + // The last `out` produced in this loop might not be byte-aligned, + // but that is not a poblem because no value is written to it. + } + } + out_arr->null_count = out_arr->length - valid_count; + return Status::OK(); + } }; template
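To summarize the control flow of `DoChunkedExec` above, the following standalone sketch (plain C++, invented names, null handling omitted) mirrors the blocked strategy: a bounded location buffer is filled by resolving one block of indices at a time, the block is gathered into the output, and the last resolved chunk index is carried over as a hint for the next block. The hint-walking resolver here is only a stand-in for `ChunkResolver::ResolveMany`.

```cpp
#include <algorithm>
#include <cstdint>
#include <iostream>
#include <vector>

struct Location { int64_t chunk; int64_t index_in_chunk; };

// Resolve `count` logical indices into `locations` (pre-sized), starting the
// search at `hint` so mostly-ascending indices resolve in near-constant time.
void ResolveBlock(const std::vector<int64_t>& chunk_offsets,  // cumulative, starts at 0
                  const int64_t* indices, int64_t count, int64_t* hint,
                  Location* locations) {
  for (int64_t i = 0; i < count; ++i) {
    int64_t chunk = *hint;
    // Walk the hint backward/forward until the index falls inside the chunk.
    while (indices[i] < chunk_offsets[chunk]) --chunk;
    while (indices[i] >= chunk_offsets[chunk + 1]) ++chunk;
    locations[i] = {chunk, indices[i] - chunk_offsets[chunk]};
    *hint = chunk;
  }
}

int main() {
  std::vector<std::vector<int32_t>> chunks = {{1, 2, 3}, {4, 5}, {6, 7, 8, 9}};
  std::vector<int64_t> chunk_offsets = {0};
  for (const auto& c : chunks) {
    chunk_offsets.push_back(chunk_offsets.back() + static_cast<int64_t>(c.size()));
  }

  std::vector<int64_t> indices = {8, 0, 3, 5, 1, 7};
  std::vector<int32_t> out(indices.size());

  constexpr int64_t kBlockCapacity = 4;  // stand-in for kIndexBlockCapacity
  std::vector<Location> locations(kBlockCapacity);
  int64_t hint = 0;
  for (int64_t offset = 0; offset < static_cast<int64_t>(indices.size());) {
    const int64_t remaining = static_cast<int64_t>(indices.size()) - offset;
    const int64_t block = std::min(kBlockCapacity, remaining);
    ResolveBlock(chunk_offsets, indices.data() + offset, block, &hint, locations.data());
    for (int64_t i = 0; i < block; ++i) {  // the "gather" step for this block
      out[offset + i] = chunks[locations[i].chunk][locations[i].index_in_chunk];
    }
    offset += block;
  }
  for (int32_t v : out) std::cout << v << ' ';  // 9 1 4 6 2 8
  std::cout << '\n';
}
```

Bounding the scratch buffer to a fixed capacity (16 KiB worth of locations in the patch) keeps the temporary allocation small and independent of how many indices are being taken, while the chunk hint amortizes resolution cost across a block.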