diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt
index 026bb5c77e0..a0096f8822b 100644
--- a/cpp/src/arrow/CMakeLists.txt
+++ b/cpp/src/arrow/CMakeLists.txt
@@ -501,6 +501,7 @@ set(ARROW_UTIL_SRCS
     util/decimal.cc
     util/delimiting.cc
     util/dict_util.cc
+    util/fixed_width_internal.cc
     util/float16.cc
     util/formatting.cc
     util/future.cc
diff --git a/cpp/src/arrow/compute/kernels/vector_selection_filter_internal.cc b/cpp/src/arrow/compute/kernels/vector_selection_filter_internal.cc
index 8825d697fdf..d5e5e5ad289 100644
--- a/cpp/src/arrow/compute/kernels/vector_selection_filter_internal.cc
+++ b/cpp/src/arrow/compute/kernels/vector_selection_filter_internal.cc
@@ -40,6 +40,7 @@
 #include "arrow/util/bit_run_reader.h"
 #include "arrow/util/bit_util.h"
 #include "arrow/util/bitmap_ops.h"
+#include "arrow/util/fixed_width_internal.h"
 
 namespace arrow {
 
@@ -158,9 +159,11 @@ class PrimitiveFilterImpl {
   PrimitiveFilterImpl(const ArraySpan& values, const ArraySpan& filter,
                       FilterOptions::NullSelectionBehavior null_selection,
                       ArrayData* out_arr)
-      : byte_width_(values.type->byte_width()),
+      : byte_width_(util::FixedWidthInBytes(*values.type)),
         values_is_valid_(values.buffers[0].data),
-        values_data_(values.buffers[1].data),
+        // No offset applied for boolean because it's a bitmap
+        values_data_(kIsBoolean ? values.buffers[1].data
+                                : util::OffsetPointerOfFixedWidthValues(values)),
         values_null_count_(values.null_count),
         values_offset_(values.offset),
         values_length_(values.length),
@@ -169,17 +172,13 @@ class PrimitiveFilterImpl {
     if constexpr (kByteWidth >= 0 && !kIsBoolean) {
       DCHECK_EQ(kByteWidth, byte_width_);
     }
-    if constexpr (!kIsBoolean) {
-      // No offset applied for boolean because it's a bitmap
-      values_data_ += values.offset * byte_width();
-    }
 
+    DCHECK_EQ(out_arr->offset, 0);
     if (out_arr->buffers[0] != nullptr) {
       // May be unallocated if neither filter nor values contain nulls
       out_is_valid_ = out_arr->buffers[0]->mutable_data();
     }
-    out_data_ = out_arr->buffers[1]->mutable_data();
-    DCHECK_EQ(out_arr->offset, 0);
+    out_data_ = util::MutableFixedWidthValuesPointer(out_arr);
     out_length_ = out_arr->length;
     out_position_ = 0;
   }
@@ -416,7 +415,7 @@ class PrimitiveFilterImpl {
     out_position_ += length;
   }
 
-  constexpr int32_t byte_width() const {
+  constexpr int64_t byte_width() const {
     if constexpr (kByteWidth >= 0) {
       return kByteWidth;
     } else {
@@ -425,7 +424,7 @@
   }
 
  private:
-  int32_t byte_width_;
+  int64_t byte_width_;
   const uint8_t* values_is_valid_;
   const uint8_t* values_data_;
   int64_t values_null_count_;
@@ -439,6 +438,8 @@ class PrimitiveFilterImpl {
   int64_t out_position_;
 };
 
+}  // namespace
+
 Status PrimitiveFilterExec(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) {
   const ArraySpan& values = batch[0].array;
   const ArraySpan& filter = batch[1].array;
@@ -468,9 +469,10 @@ Status PrimitiveFilterExec(KernelContext* ctx, const ExecSpan& batch, ExecResult
   // validity bitmap.
   const bool allocate_validity = values.null_count != 0 || !filter_null_count_is_zero;
-  const int bit_width = values.type->bit_width();
-  RETURN_NOT_OK(PreallocatePrimitiveArrayData(ctx, output_length, bit_width,
-                                              allocate_validity, out_arr));
+  DCHECK(util::IsFixedWidthLike(values, /*force_null_count=*/false));
+  const int64_t bit_width = util::FixedWidthInBits(*values.type);
+  RETURN_NOT_OK(util::internal::PreallocateFixedWidthArrayData(
+      ctx, output_length, /*source=*/values, allocate_validity, out_arr));
 
   switch (bit_width) {
     case 1:
@@ -505,6 +507,8 @@ Status PrimitiveFilterExec(KernelContext* ctx, const ExecSpan& batch, ExecResult
   return Status::OK();
 }
 
+namespace {
+
 // ----------------------------------------------------------------------
 // Optimized filter for base binary types (32-bit and 64-bit)
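Before the next file, a standalone look at the two helpers that replace the inline buffer arithmetic in PrimitiveFilterImpl above. This is a sketch under my reading of fixed_width_internal.h, not code from the patch; the wrapper function and the fixed_size_list<int32, 3> example are illustrative.

// Illustrative sketch (not in the patch): the two helpers PrimitiveFilterImpl
// now relies on, shown for a fixed_size_list<int32, 3> values array.
#include <cstdint>

#include "arrow/array/data.h"  // arrow::ArraySpan
#include "arrow/util/fixed_width_internal.h"

int64_t FixedWidthHelperSketch(const arrow::ArraySpan& values) {
  // Flattened width of one list element: 3 * sizeof(int32_t) == 12 bytes.
  const int64_t byte_width = arrow::util::FixedWidthInBytes(*values.type);
  // Offset-aware pointer to the first logical value; for non-boolean types
  // this is values.buffers[1].data + values.offset * byte_width, i.e. exactly
  // the arithmetic the old constructor performed inline.
  const uint8_t* data = arrow::util::OffsetPointerOfFixedWidthValues(values);
  (void)data;
  return byte_width;
}

Because the width is measured on the flattened representation, the same filter loop that copies 4-byte int32 values can copy 12-byte list elements without any type-specific code.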
diff --git a/cpp/src/arrow/compute/kernels/vector_selection_internal.cc b/cpp/src/arrow/compute/kernels/vector_selection_internal.cc
index a0fe2808e3e..9f50dcafdbe 100644
--- a/cpp/src/arrow/compute/kernels/vector_selection_internal.cc
+++ b/cpp/src/arrow/compute/kernels/vector_selection_internal.cc
@@ -37,6 +37,7 @@
 #include "arrow/util/bit_block_counter.h"
 #include "arrow/util/bit_run_reader.h"
 #include "arrow/util/bit_util.h"
+#include "arrow/util/fixed_width_internal.h"
 #include "arrow/util/int_util.h"
 #include "arrow/util/logging.h"
 #include "arrow/util/ree_util.h"
@@ -65,24 +66,6 @@ void RegisterSelectionFunction(const std::string& name, FunctionDoc doc,
   DCHECK_OK(registry->AddFunction(std::move(func)));
 }
 
-Status PreallocatePrimitiveArrayData(KernelContext* ctx, int64_t length, int bit_width,
-                                     bool allocate_validity, ArrayData* out) {
-  // Preallocate memory
-  out->length = length;
-  out->buffers.resize(2);
-
-  if (allocate_validity) {
-    ARROW_ASSIGN_OR_RAISE(out->buffers[0], ctx->AllocateBitmap(length));
-  }
-  if (bit_width == 1) {
-    ARROW_ASSIGN_OR_RAISE(out->buffers[1], ctx->AllocateBitmap(length));
-  } else {
-    ARROW_ASSIGN_OR_RAISE(out->buffers[1],
-                          ctx->Allocate(bit_util::BytesForBits(length * bit_width)));
-  }
-  return Status::OK();
-}
-
 namespace {
 
 /// \brief Iterate over a REE filter, emitting ranges of a plain values array that
@@ -564,39 +547,6 @@ struct VarBinarySelectionImpl : public Selection<VarBinarySelectionImpl<Type>, T
   }
 };
 
-struct FSBSelectionImpl : public Selection<FSBSelectionImpl, FixedSizeBinaryType> {
-  using Base = Selection<FSBSelectionImpl, FixedSizeBinaryType>;
-  LIFT_BASE_MEMBERS();
-
-  TypedBufferBuilder<uint8_t> data_builder;
-
-  FSBSelectionImpl(KernelContext* ctx, const ExecSpan& batch, int64_t output_length,
-                   ExecResult* out)
-      : Base(ctx, batch, output_length, out), data_builder(ctx->memory_pool()) {}
-
-  template <typename Adapter>
-  Status GenerateOutput() {
-    FixedSizeBinaryArray typed_values(this->values.ToArrayData());
-    int32_t value_size = typed_values.byte_width();
-
-    RETURN_NOT_OK(data_builder.Reserve(value_size * output_length));
-    Adapter adapter(this);
-    return adapter.Generate(
-        [&](int64_t index) {
-          auto val = typed_values.GetView(index);
-          data_builder.UnsafeAppend(reinterpret_cast<const uint8_t*>(val.data()),
-                                    value_size);
-          return Status::OK();
-        },
-        [&]() {
-          data_builder.UnsafeAppend(value_size, static_cast<uint8_t>(0x00));
-          return Status::OK();
-        });
-  }
-
-  Status Finish() override { return data_builder.Finish(&out->buffers[1]); }
-};
-
 template <typename Type>
 struct ListSelectionImpl : public Selection<ListSelectionImpl<Type>, Type> {
   using offset_type = typename Type::offset_type;
@@ -909,6 +859,24 @@ Status LargeListFilterExec(KernelContext* ctx, const ExecSpan& batch, ExecResult
 }
 
 Status FSLFilterExec(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) {
+  const ArraySpan& values = batch[0].array;
+
+  // If a FixedSizeList wraps a fixed-width type we can, in some cases, use
+  // PrimitiveFilterExec for a fixed-size list array.
+  if (util::IsFixedWidthLike(values,
+                             /*force_null_count=*/true,
+                             /*extra_predicate=*/[](auto& fixed_width_type) {
+                               // DICTIONARY is fixed-width but not supported by
+                               // PrimitiveFilterExec.
+                               return fixed_width_type.id() != Type::DICTIONARY;
+                             })) {
+    const auto byte_width = util::FixedWidthInBytes(*values.type);
+    // 0 is a valid byte width for FixedSizeList, but PrimitiveFilterExec
+    // might not handle it correctly.
+    if (byte_width > 0) {
+      return PrimitiveFilterExec(ctx, batch, out);
+    }
+  }
   return FilterExec<FSLSelectionImpl>(ctx, batch, out);
 }
 
@@ -942,23 +910,6 @@ Status LargeVarBinaryTakeExec(KernelContext* ctx, const ExecSpan& batch,
   return TakeExec<VarBinarySelectionImpl<LargeBinaryType>>(ctx, batch, out);
 }
 
-Status FSBTakeExec(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) {
-  const ArraySpan& values = batch[0].array;
-  const auto byte_width = values.type->byte_width();
-  // Use primitive Take implementation (presumably faster) for some byte widths
-  switch (byte_width) {
-    case 1:
-    case 2:
-    case 4:
-    case 8:
-    case 16:
-    case 32:
-      return PrimitiveTakeExec(ctx, batch, out);
-    default:
-      return TakeExec<FSBSelectionImpl>(ctx, batch, out);
-  }
-}
-
 Status ListTakeExec(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) {
   return TakeExec<ListSelectionImpl<ListType>>(ctx, batch, out);
 }
@@ -968,6 +919,19 @@ Status LargeListTakeExec(KernelContext* ctx, const ExecSpan& batch, ExecResult*
 }
 
 Status FSLTakeExec(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) {
+  const ArraySpan& values = batch[0].array;
+
+  // If a FixedSizeList wraps a fixed-width type we can, in some cases, use
+  // FixedWidthTakeExec for a fixed-size list array.
+  if (util::IsFixedWidthLike(values,
+                             /*force_null_count=*/true,
+                             /*extra_predicate=*/[](auto& fixed_width_type) {
+                               // DICTIONARY is fixed-width but not supported by
+                               // FixedWidthTakeExec.
+                               return fixed_width_type.id() != Type::DICTIONARY;
+                             })) {
+    return FixedWidthTakeExec(ctx, batch, out);
+  }
   return TakeExec<FSLSelectionImpl>(ctx, batch, out);
 }
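The IsFixedWidthLike guard above appears, with the same lambda, in both FSLFilterExec and FSLTakeExec. The following sketch isolates it; the wrapper name and the expected classifications in the comments are my assumptions based on this patch, not code from it.

// Sketch (not in the patch): the dispatch predicate shared by the two
// functions above. Expected outcomes, per fixed_width_internal.h:
//   fixed_size_list<int32, 3>  -> true  (flattened width of 12 bytes)
//   fixed_size_list<utf8, 3>   -> false (child is not fixed-width)
//   dictionary-encoded values  -> false (excluded by the lambda)
#include "arrow/array/data.h"
#include "arrow/type.h"
#include "arrow/util/fixed_width_internal.h"

bool TakesFixedWidthFastPath(const arrow::ArraySpan& values) {
  return arrow::util::IsFixedWidthLike(
      values,
      /*force_null_count=*/true,
      [](auto& fixed_width_type) {
        // Dictionaries are fixed-width but are handled by dedicated kernels,
        // not by the primitive filter/take paths.
        return fixed_width_type.id() != arrow::Type::DICTIONARY;
      });
}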
diff --git a/cpp/src/arrow/compute/kernels/vector_selection_internal.h b/cpp/src/arrow/compute/kernels/vector_selection_internal.h
index 95f3e51cd67..c5075d6dfe8 100644
--- a/cpp/src/arrow/compute/kernels/vector_selection_internal.h
+++ b/cpp/src/arrow/compute/kernels/vector_selection_internal.h
@@ -45,12 +45,6 @@ void RegisterSelectionFunction(const std::string& name, FunctionDoc doc,
                                const FunctionOptions* default_options,
                                FunctionRegistry* registry);
 
-/// \brief Allocate an ArrayData for a primitive array with a given length and bit width
-///
-/// \param[in] bit_width 1 or a multiple of 8
-Status PreallocatePrimitiveArrayData(KernelContext* ctx, int64_t length, int bit_width,
-                                     bool allocate_validity, ArrayData* out);
-
 /// \brief Callback type for VisitPlainxREEFilterOutputSegments.
 ///
 /// position is the logical position in the values array relative to its offset.
@@ -70,6 +64,7 @@ void VisitPlainxREEFilterOutputSegments(
     FilterOptions::NullSelectionBehavior null_selection,
     const EmitREEFilterSegment& emit_segment);
 
+Status PrimitiveFilterExec(KernelContext*, const ExecSpan&, ExecResult*);
 Status ListFilterExec(KernelContext*, const ExecSpan&, ExecResult*);
 Status LargeListFilterExec(KernelContext*, const ExecSpan&, ExecResult*);
 Status FSLFilterExec(KernelContext*, const ExecSpan&, ExecResult*);
@@ -78,8 +73,7 @@ Status MapFilterExec(KernelContext*, const ExecSpan&, ExecResult*);
 
 Status VarBinaryTakeExec(KernelContext*, const ExecSpan&, ExecResult*);
 Status LargeVarBinaryTakeExec(KernelContext*, const ExecSpan&, ExecResult*);
-Status PrimitiveTakeExec(KernelContext*, const ExecSpan&, ExecResult*);
-Status FSBTakeExec(KernelContext*, const ExecSpan&, ExecResult*);
+Status FixedWidthTakeExec(KernelContext*, const ExecSpan&, ExecResult*);
 Status ListTakeExec(KernelContext*, const ExecSpan&, ExecResult*);
 Status LargeListTakeExec(KernelContext*, const ExecSpan&, ExecResult*);
 Status FSLTakeExec(KernelContext*, const ExecSpan&, ExecResult*);
diff --git a/cpp/src/arrow/compute/kernels/vector_selection_take_internal.cc b/cpp/src/arrow/compute/kernels/vector_selection_take_internal.cc
index 5cd37108284..b23b2c624c6 100644
--- a/cpp/src/arrow/compute/kernels/vector_selection_take_internal.cc
+++ b/cpp/src/arrow/compute/kernels/vector_selection_take_internal.cc
@@ -19,6 +19,7 @@
 #include <algorithm>
 #include <cstdint>
 #include <memory>
+#include <utility>
 #include <vector>
 
 #include "arrow/array/builder_primitive.h"
@@ -37,6 +38,8 @@
 #include "arrow/util/bit_block_counter.h"
 #include "arrow/util/bit_run_reader.h"
 #include "arrow/util/bit_util.h"
+#include "arrow/util/fixed_width_internal.h"
+#include "arrow/util/gather_internal.h"
 #include "arrow/util/int_util.h"
 #include "arrow/util/ree_util.h"
 
@@ -323,235 +326,88 @@ namespace {
 using TakeState = OptionsWrapper<TakeOptions>;
 
 // ----------------------------------------------------------------------
-// Implement optimized take for primitive types from boolean to 1/2/4/8-byte
-// C-type based types. Use common implementation for every byte width and only
-// generate code for unsigned integer indices, since after boundschecking to
-// check for negative numbers in the indices we can safely reinterpret_cast
-// signed integers as unsigned.
-
-/// \brief The Take implementation for primitive (fixed-width) types does not
-/// use the logical Arrow type but rather the physical C type. This way we
-/// only generate one take function for each byte width.
-///
-/// This function assumes that the indices have been boundschecked.
-template <typename ValueWidthConstant, typename IndexCType>
-struct PrimitiveTakeImpl {
-  static constexpr int kValueWidth = ValueWidthConstant::value;
-
-  static void Exec(const ArraySpan& values, const ArraySpan& indices,
-                   ArrayData* out_arr) {
-    DCHECK_EQ(values.type->byte_width(), kValueWidth);
-    const auto* values_data =
-        values.GetValues<uint8_t>(1, 0) + kValueWidth * values.offset;
-    const uint8_t* values_is_valid = values.buffers[0].data;
-    auto values_offset = values.offset;
-
-    const auto* indices_data = indices.GetValues<IndexCType>(1);
-    const uint8_t* indices_is_valid = indices.buffers[0].data;
-    auto indices_offset = indices.offset;
-
-    auto out = out_arr->GetMutableValues<uint8_t>(1, 0) + kValueWidth * out_arr->offset;
-    auto out_is_valid = out_arr->buffers[0]->mutable_data();
-    auto out_offset = out_arr->offset;
-    DCHECK_EQ(out_offset, 0);
-
-    // If either the values or indices have nulls, we preemptively zero out the
-    // out validity bitmap so that we don't have to use ClearBit in each
-    // iteration for nulls.
-    if (values.null_count != 0 || indices.null_count != 0) {
-      bit_util::SetBitsTo(out_is_valid, out_offset, indices.length, false);
-    }
-
-    auto WriteValue = [&](int64_t position) {
-      memcpy(out + position * kValueWidth,
-             values_data + indices_data[position] * kValueWidth, kValueWidth);
-    };
-
-    auto WriteZero = [&](int64_t position) {
-      memset(out + position * kValueWidth, 0, kValueWidth);
-    };
-
-    auto WriteZeroSegment = [&](int64_t position, int64_t length) {
-      memset(out + position * kValueWidth, 0, kValueWidth * length);
-    };
+// Implement optimized take for primitive types from boolean to
+// 1/2/4/8/16/32-byte C-type based types and fixed-size binary (0 or more
+// bytes).
+//
+// Use one specialization for each of these primitive byte widths so the
+// compiler can specialize the memcpy to dedicated CPU instructions. For
+// fixed-width binary, use the 1-byte specialization but pass WithFactor=true,
+// which makes the kernel consider the factor parameter provided at runtime.
+//
+// Only unsigned index types need to be instantiated, since after
+// boundschecking the indices for negative numbers we can safely
+// reinterpret_cast signed integers as unsigned.
 
-    OptionalBitBlockCounter indices_bit_counter(indices_is_valid, indices_offset,
-                                                indices.length);
-    int64_t position = 0;
-    int64_t valid_count = 0;
-    while (position < indices.length) {
-      BitBlockCount block = indices_bit_counter.NextBlock();
-      if (values.null_count == 0) {
-        // Values are never null, so things are easier
-        valid_count += block.popcount;
-        if (block.popcount == block.length) {
-          // Fastest path: neither values nor index nulls
-          bit_util::SetBitsTo(out_is_valid, out_offset + position, block.length, true);
-          for (int64_t i = 0; i < block.length; ++i) {
-            WriteValue(position);
-            ++position;
-          }
-        } else if (block.popcount > 0) {
-          // Slow path: some indices but not all are null
-          for (int64_t i = 0; i < block.length; ++i) {
-            if (bit_util::GetBit(indices_is_valid, indices_offset + position)) {
-              // index is not null
-              bit_util::SetBit(out_is_valid, out_offset + position);
-              WriteValue(position);
-            } else {
-              WriteZero(position);
-            }
-            ++position;
-          }
-        } else {
-          WriteZeroSegment(position, block.length);
-          position += block.length;
-        }
-      } else {
-        // Values have nulls, so we must do random access into the values bitmap
-        if (block.popcount == block.length) {
-          // Faster path: indices are not null but values may be
-          for (int64_t i = 0; i < block.length; ++i) {
-            if (bit_util::GetBit(values_is_valid,
-                                 values_offset + indices_data[position])) {
-              // value is not null
-              WriteValue(position);
-              bit_util::SetBit(out_is_valid, out_offset + position);
-              ++valid_count;
-            } else {
-              WriteZero(position);
-            }
-            ++position;
-          }
-        } else if (block.popcount > 0) {
-          // Slow path: some but not all indices are null. Since we are doing
-          // random access in general we have to check the value nullness one by
-          // one.
-          for (int64_t i = 0; i < block.length; ++i) {
-            if (bit_util::GetBit(indices_is_valid, indices_offset + position) &&
-                bit_util::GetBit(values_is_valid,
-                                 values_offset + indices_data[position])) {
-              // index is not null && value is not null
-              WriteValue(position);
-              bit_util::SetBit(out_is_valid, out_offset + position);
-              ++valid_count;
-            } else {
-              WriteZero(position);
-            }
-            ++position;
-          }
-        } else {
-          WriteZeroSegment(position, block.length);
-          position += block.length;
-        }
-      }
-    }
+/// \brief The Take implementation for primitive types and fixed-width binary.
+///
+/// Note that this function can also handle fixed-size-list arrays, provided
+/// they fit the criteria described in fixed_width_internal.h, so use the
+/// functions defined in that file to access values and destination pointers
+/// and DO NOT ASSUME `values.type()` is a primitive type.
+///
+/// \pre the indices have been boundschecked
+template <typename ValueBitWidthConstant, typename IndexCType, typename WithFactor>
+struct FixedWidthTakeImpl {
+  static constexpr int kValueWidthInBits = ValueBitWidthConstant::value;
+
+  // The offset returned is expressed as a number of kValueWidthInBits blocks
+  static std::pair<int64_t, const uint8_t*> SourceOffsetAndValuesPointer(
+      const ArraySpan& values) {
+    if constexpr (kValueWidthInBits == 1) {
+      DCHECK_EQ(values.type->id(), Type::BOOL);
+      return {values.offset, values.GetValues<uint8_t>(1, 0)};
+    } else {
+      static_assert(kValueWidthInBits % 8 == 0,
                    "kValueWidthInBits must be 1 or a multiple of 8");
+      return {0, util::OffsetPointerOfFixedWidthValues(values)};
+    }
+  }
-    out_arr->null_count = out_arr->length - valid_count;
-  }
-};
-
-template <typename IndexCType>
-struct BooleanTakeImpl {
-  static void Exec(const ArraySpan& values, const ArraySpan& indices,
-                   ArrayData* out_arr) {
-    const uint8_t* values_data = values.buffers[1].data;
-    const uint8_t* values_is_valid = values.buffers[0].data;
-    auto values_offset = values.offset;
-
-    const auto* indices_data = indices.GetValues<IndexCType>(1);
-    const uint8_t* indices_is_valid = indices.buffers[0].data;
-    auto indices_offset = indices.offset;
-
-    auto out = out_arr->buffers[1]->mutable_data();
-    auto out_is_valid = out_arr->buffers[0]->mutable_data();
-    auto out_offset = out_arr->offset;
-
-    // If either the values or indices have nulls, we preemptively zero out the
-    // out validity bitmap so that we don't have to use ClearBit in each
-    // iteration for nulls.
-    if (values.null_count != 0 || indices.null_count != 0) {
-      bit_util::SetBitsTo(out_is_valid, out_offset, indices.length, false);
-    }
-    // Avoid uninitialized data in values array
-    bit_util::SetBitsTo(out, out_offset, indices.length, false);
-
-    auto PlaceDataBit = [&](int64_t loc, IndexCType index) {
-      bit_util::SetBitTo(out, out_offset + loc,
-                         bit_util::GetBit(values_data, values_offset + index));
-    };
-
-    OptionalBitBlockCounter indices_bit_counter(indices_is_valid, indices_offset,
-                                                indices.length);
-    int64_t position = 0;
+
+  static void Exec(KernelContext* ctx, const ArraySpan& values, const ArraySpan& indices,
+                   ArrayData* out_arr, size_t factor) {
+#ifndef NDEBUG
+    int64_t bit_width = util::FixedWidthInBits(*values.type);
+    DCHECK(WithFactor::value || (kValueWidthInBits == bit_width && factor == 1));
+    DCHECK(!WithFactor::value ||
+           (factor > 0 && kValueWidthInBits == 8 &&  // factors are used with bytes
+            static_cast<int64_t>(factor * kValueWidthInBits) == bit_width));
+#endif
+    const bool out_has_validity = values.MayHaveNulls() || indices.MayHaveNulls();
+
+    const uint8_t* src;
+    int64_t src_offset;
+    std::tie(src_offset, src) = SourceOffsetAndValuesPointer(values);
+    uint8_t* out = util::MutableFixedWidthValuesPointer(out_arr);
     int64_t valid_count = 0;
-    while (position < indices.length) {
-      BitBlockCount block = indices_bit_counter.NextBlock();
-      if (values.null_count == 0) {
-        // Values are never null, so things are easier
-        valid_count += block.popcount;
-        if (block.popcount == block.length) {
-          // Fastest path: neither values nor index nulls
-          bit_util::SetBitsTo(out_is_valid, out_offset + position, block.length, true);
-          for (int64_t i = 0; i < block.length; ++i) {
-            PlaceDataBit(position, indices_data[position]);
-            ++position;
-          }
-        } else if (block.popcount > 0) {
-          // Slow path: some but not all indices are null
-          for (int64_t i = 0; i < block.length; ++i) {
-            if (bit_util::GetBit(indices_is_valid, indices_offset + position)) {
-              // index is not null
-              bit_util::SetBit(out_is_valid, out_offset + position);
-              PlaceDataBit(position, indices_data[position]);
-            }
-            ++position;
-          }
-        } else {
-          position += block.length;
-        }
-      } else {
-        // Values have nulls, so we must do random access into the values bitmap
-        if (block.popcount == block.length) {
-          // Faster path: indices are not null but values may be
-          for (int64_t i = 0; i < block.length; ++i) {
-            if (bit_util::GetBit(values_is_valid,
-                                 values_offset + indices_data[position])) {
-              // value is not null
-              bit_util::SetBit(out_is_valid, out_offset + position);
-              PlaceDataBit(position, indices_data[position]);
-              ++valid_count;
-            }
-            ++position;
-          }
-        } else if (block.popcount > 0) {
-          // Slow path: some but not all indices are null. Since we are doing
-          // random access in general we have to check the value nullness one by
-          // one.
-          for (int64_t i = 0; i < block.length; ++i) {
-            if (bit_util::GetBit(indices_is_valid, indices_offset + position)) {
-              // index is not null
-              if (bit_util::GetBit(values_is_valid,
-                                   values_offset + indices_data[position])) {
-                // value is not null
-                PlaceDataBit(position, indices_data[position]);
-                bit_util::SetBit(out_is_valid, out_offset + position);
-                ++valid_count;
-              }
-            }
-            ++position;
-          }
-        } else {
-          position += block.length;
-        }
-      }
-    }
+    arrow::internal::Gather<kValueWidthInBits, IndexCType, WithFactor::value> gather{
+        /*src_length=*/values.length,
+        src,
+        src_offset,
+        /*idx_length=*/indices.length,
+        /*idx=*/indices.GetValues<IndexCType>(1),
+        out,
+        factor};
+    if (out_has_validity) {
+      DCHECK_EQ(out_arr->offset, 0);
+      // out_is_valid must be zero-initialized, because Gather::Execute
+      // saves time by not having to ClearBit on every null element.
+      auto out_is_valid = out_arr->GetMutableValues<uint8_t>(0);
+      memset(out_is_valid, 0, bit_util::BytesForBits(out_arr->length));
+      valid_count = gather.template Execute(
+          /*src_validity=*/values, /*idx_validity=*/indices, out_is_valid);
+    } else {
+      valid_count = gather.Execute();
+    }
     out_arr->null_count = out_arr->length - valid_count;
   }
 };
 
 template
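The hunk above collapses both hand-rolled loops (byte-wide and boolean) into a single call to the new Gather helper. As a closing illustration, a caller might drive Gather directly as below for a null-free int32 take; the constructor argument order mirrors the call in FixedWidthTakeImpl::Exec, while the template signature and the wrapper function are assumptions, not code from the patch.

// Sketch (not in the patch): a direct use of Gather mirroring
// FixedWidthTakeImpl::Exec for int32 values, uint32 indices, no nulls.
#include <cstdint>

#include "arrow/util/gather_internal.h"

int64_t TakeInt32NoNulls(const int32_t* values, int64_t num_values,
                         const uint32_t* indices, int64_t num_indices,
                         int32_t* out) {
  // 32 = value width in bits; the last template argument mirrors
  // WithFactor::value and is false for plain primitive values.
  arrow::internal::Gather<32, uint32_t, /*WithFactor=*/false> gather{
      /*src_length=*/num_values,
      /*src=*/reinterpret_cast<const uint8_t*>(values),
      /*src_offset=*/0,
      /*idx_length=*/num_indices,
      /*idx=*/indices,
      /*out=*/reinterpret_cast<uint8_t*>(out),
      /*factor=*/1};
  // With no validity bitmaps in play, Execute() copies every element and
  // returns the number of valid (here: all) elements.
  return gather.Execute();
}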